diff --git a/.travis.yml b/.travis.yml index e3d7a6d..e9725b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,9 +20,6 @@ before_install: - sudo apt-get -y install python-joblib=0.8.3-1-cdb1 - sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1 - # Install pysal - - sudo pip install -I pysal==1.11.2 - - sudo apt-get -y install python-scipy=0.14.0-2-cdb6 - sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2 - sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2 diff --git a/NEWS.md b/NEWS.md index 3d07756..19aa56c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ +0.7.0 (2018-02-23) +------------------ +* Updated Moran and Markov documentation [#179](https://github.com/CartoDB/crankshaft/pull/179) [#155](https://github.com/CartoDB/crankshaft/pull/155) +* Updated examples in documentation [#193](https://github.com/CartoDB/crankshaft/pull/193) +* Better error management for empty values [#157](https://github.com/CartoDB/crankshaft/pull/157) +* Added nonspatial kmeans with class framework [#150](https://github.com/CartoDB/crankshaft/pull/150) +* Added multipolygons and geometry collections support to PIA analysis [#165](https://github.com/CartoDB/crankshaft/pull/165) +* Upgraded PySAL to v1.14.3 [#198](https://github.com/CartoDB/crankshaft/pull/198) + 0.6.1 (2017-11-23) -* Add VOLATILITY and PARALLEL categories to PostgreSQL functions +------------------ +* Added VOLATILITY and PARALLEL categories to PostgreSQL functions [#183](https://github.com/CartoDB/crankshaft/pull/183) 0.6.0 (2017-11-08) ------------------ diff --git a/doc/02_moran.md b/doc/02_moran.md index e83c2f1..a00c33e 100644 --- a/doc/02_moran.md +++ b/doc/02_moran.md @@ -1,5 +1,14 @@ ## Areas of Interest Functions +A family of analyses to uncover groupings of areas with consistently high or low values (clusters) and smaller areas with values unlike those around them (outliers).
A cluster is labeled by an 'HH' (high value compared to the entire dataset in an area with other high values), or its opposite 'LL'. An outlier is labeled by an 'LH' (low value surrounded by high values) or an 'HL' (the opposite). Each cluster and outlier classification has an associated p-value, a measure of how significant the pattern of highs and lows is compared to a random distribution. + +These functions have two forms: local and global. The local versions classify every input geometry while the global function gives a rating of the overall clustering characteristics of the dataset. Both forms accept an optional denominator (see the rate versions) if, for example, working with count data and a denominator is needed. + +### Notes + +* Rows with null values will be omitted from this analysis. To ensure they are added to the analysis, fill the null-valued cells with an appropriate value such as the mean of a column, the mean of the most recent two time steps, or use a `LEFT JOIN` to get null outputs from the analysis. +* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified within the `subquery` argument. + ### CDB_AreasOfInterestLocal(subquery text, column_name text) This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I. @@ -29,6 +38,7 @@ A table with the following columns. | vals | NUMERIC | Values from `'column_name'`. 
| + #### Example Usage ```sql @@ -37,8 +47,10 @@ SELECT aoi.quads, aoi.significance, c.num_cyclists_per_total_population -FROM CDB_AreasOfInterestLocal('SELECT * FROM commute_data' - 'num_cyclists_per_total_population') As aoi +FROM + cdb_crankshaft.CDB_AreasOfInterestLocal( + 'SELECT * FROM commute_data', + 'num_cyclists_per_total_population') As aoi JOIN commute_data As c ON c.cartodb_id = aoi.rowid; ``` @@ -71,8 +83,12 @@ A table with the following columns. #### Examples ```sql -SELECT * -FROM CDB_AreasOfInterestGlobal('SELECT * FROM commute_data', 'num_cyclists_per_total_population') +SELECT + * +FROM + cdb_crankshaft.CDB_AreasOfInterestGlobal( + 'SELECT * FROM commute_data', + 'num_cyclists_per_total_population') ``` ### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text) @@ -102,7 +118,7 @@ A table with the following columns. | quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. | | significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. | | rowid | INT | Row id of the values which correspond to the input rows. | -| vals | NUMERIC | Values from `'column_name'`. | +| vals | NUMERIC | Standardized rate (centered on the mean and normalized by the standard deviation) calculated from `numerator` and `denominator`. This is calculated by [Assuncao Rate](http://pysal.readthedocs.io/en/latest/library/esda/smoothing.html?highlight=assuncao#pysal.esda.smoothing.assuncao_rate) in the PySAL library. 
| #### Example Usage @@ -113,9 +129,11 @@ SELECT aoi.quads, aoi.significance, c.cyclists_per_total_population -FROM CDB_AreasOfInterestLocalRate('SELECT * FROM commute_data' - 'num_cyclists', - 'total_population') As aoi +FROM + cdb_crankshaft.CDB_AreasOfInterestLocalRate( + 'SELECT * FROM commute_data', + 'num_cyclists', + 'total_population') As aoi JOIN commute_data As c ON c.cartodb_id = aoi.rowid; ``` @@ -149,10 +167,13 @@ A table with the following columns. #### Examples ```sql -SELECT * -FROM CDB_AreasOfInterestGlobalRate('SELECT * FROM commute_data', - 'num_cyclists', - 'total_population') +SELECT + * +FROM + cdb_crankshaft.CDB_AreasOfInterestGlobalRate( + 'SELECT * FROM commute_data', + 'num_cyclists', + 'total_population') ``` ## Hotspot, Coldspot, and Outlier Functions diff --git a/doc/04_markov.md b/doc/04_markov.md index a45df59..250078c 100644 --- a/doc/04_markov.md +++ b/doc/04_markov.md @@ -8,7 +8,7 @@ This function takes time series data associated with geometries and outputs like | Name | Type | Description | |------|------|-------------| -| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments | +| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments. Tables in queries must exist in user's database (i.e., no CTEs at present) | | column_names | TEXT Array | Names of column that form the history of measurements for the geometries (e.g., `Array['y2011', 'y2012', 'y2013', 'y2014', 'y2015', 'y2016']`). | | num_classes (optional) | INT | Number of quantile classes to separate data into. | | weight type (optional) | TEXT | Type of weight to use when finding neighbors. 
Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). | @@ -30,18 +30,29 @@ A table with the following columns. | rowid | NUMERIC | id of the row that corresponds to the `id_col` (by default `cartodb_id` of the input rows) | +#### Notes + +* Rows with null values will be omitted from this analysis. To ensure they are added to the analysis, fill the null-valued cells with an appropriate value such as the mean of a column, the mean of the most recent two time steps, etc. +* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified in the `subquery` parameter. + + #### Example Usage ```sql SELECT c.cartodb_id, c.the_geom, + c.the_geom_webmercator, m.trend, m.trend_up, m.trend_down, m.volatility -FROM CDB_SpatialMarkovTrend('SELECT * FROM nyc_real_estate' - Array['m03y2009','m03y2010','m03y2011','m03y2012','m03y2013','m03y2014','m03y2015','m03y2016']) As m +FROM + cdb_crankshaft.CDB_SpatialMarkovTrend( + 'SELECT * FROM nyc_real_estate', + Array['m03y2009', 'm03y2010', 'm03y2011', + 'm03y2012', 'm03y2013', 'm03y2014', + 'm03y2015', 'm03y2016']) As m JOIN nyc_real_estate As c ON c.cartodb_id = m.rowid; ``` diff --git a/doc/07_gravity.md b/doc/07_gravity.md index e4e439e..47d6db2 100644 --- a/doc/07_gravity.md +++ b/doc/07_gravity.md @@ -54,9 +54,9 @@ with t as ( SELECT array_agg(cartodb_id::bigint) as id, array_agg(the_geom) as g, - array_agg(coalesce(gla,0)::numeric) as w + array_agg(coalesce(gla, 0)::numeric) as w FROM - abel.centros_comerciales_de_madrid + centros_comerciales_de_madrid WHERE not no_cc ), s as ( @@ -67,12 +67,15 @@ SELECT FROM sscc_madrid ) -select +SELECT g.the_geom, - trunc(g.h,2) as h, + trunc(g.h, 2) as h, round(g.hpop) as hpop, - trunc(g.dist/1000,2) as dist_km -FROM t, s, CDB_Gravity1(t.id, t.g, t.w, s.id, s.g, s.p, 
newmall_ID, 100000, 5000) g + trunc(g.dist/1000, 2) as dist_km +FROM + t, + s, + cdb_crankshaft.CDB_Gravity(t.id, t.g, t.w, s.id, s.g, s.p, newmall_ID, 100000, 5000) as g ``` diff --git a/doc/08_interpolation.md b/doc/08_interpolation.md index c17269e..3fae966 100644 --- a/doc/08_interpolation.md +++ b/doc/08_interpolation.md @@ -44,11 +44,18 @@ Default values: #### Example Usage ```sql -with a as ( - select +WITH a as ( + SELECT array_agg(the_geom) as geomin, array_agg(temp::numeric) as colin - from table_4804232032 + FROM table_4804232032 ) -SELECT CDB_SpatialInterpolation(geomin, colin, CDB_latlng(41.38, 2.15),1) FROM a; +SELECT + cdb_crankshaft.CDB_SpatialInterpolation( + geomin, + colin, + CDB_latlng(41.38, 2.15), + 1) +FROM + a ``` diff --git a/doc/09_voronoi.md b/doc/09_voronoi.md index 1a19103..223f43d 100644 --- a/doc/09_voronoi.md +++ b/doc/09_voronoi.md @@ -27,12 +27,20 @@ PostGIS wil include this in future versions ([doc for dev branch](http://postgis ```sql WITH a AS ( SELECT - ARRAY[ST_GeomFromText('POINT(2.1744 41.403)', 4326),ST_GeomFromText('POINT(2.1228 41.380)', 4326),ST_GeomFromText('POINT(2.1511 41.374)', 4326),ST_GeomFromText('POINT(2.1528 41.413)', 4326),ST_GeomFromText('POINT(2.165 41.391)', 4326),ST_GeomFromText('POINT(2.1498 41.371)', 4326),ST_GeomFromText('POINT(2.1533 41.368)', 4326),ST_GeomFromText('POINT(2.131386 41.41399)', 4326)] AS geomin + ARRAY[ + ST_GeomFromText('POINT(2.1744 41.403)', 4326), + ST_GeomFromText('POINT(2.1228 41.380)', 4326), + ST_GeomFromText('POINT(2.1511 41.374)', 4326), + ST_GeomFromText('POINT(2.1528 41.413)', 4326), + ST_GeomFromText('POINT(2.165 41.391)', 4326), + ST_GeomFromText('POINT(2.1498 41.371)', 4326), + ST_GeomFromText('POINT(2.1533 41.368)', 4326), + ST_GeomFromText('POINT(2.131386 41.41399)', 4326) + ] AS geomin ) SELECT - st_transform( - (st_dump(CDB_voronoi(geomin, 0.2, 1e-9) - )).geom - , 3857) as the_geom_webmercator + ST_TRANSFORM( + (ST_Dump(cdb_crankshaft.CDB_Voronoi(geomin, 0.2, 
1e-9))).geom, + 3857) as the_geom_webmercator FROM a; ``` diff --git a/doc/11_kmeans.md b/doc/11_kmeans.md index 6153010..97d85aa 100644 --- a/doc/11_kmeans.md +++ b/doc/11_kmeans.md @@ -1,17 +1,17 @@ ## K-Means Functions -### CDB_KMeans(subquery text, no_clusters INTEGER) +k-means clustering is a popular technique for finding clusters in data by minimizing the intra-cluster 'distance' and maximizing the inter-cluster 'distance'. The distance is defined in the parameter space of the variables entered. -This function attempts to find n clusters within the input data. It will return a table to CartoDB ids and -the number of the cluster each point in the input was assigend to. +### CDB_KMeans(subquery text, no_clusters integer) +This function attempts to find `no_clusters` clusters within the input data based on the geographic distribution. It will return a table with ids and the cluster classification of each point input assuming `the_geom` is not null-valued. If `the_geom` is null-valued, the point will not be considered in the analysis. #### Arguments | Name | Type | Description | |------|------|-------------| | subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments | -| no\_clusters | INTEGER | The number of clusters to try and find | +| no\_clusters | INTEGER | The number of clusters to find | #### Returns @@ -19,25 +19,28 @@ A table with the following columns. | Column Name | Type | Description | |-------------|------|-------------| -| cartodb\_id | INTEGER | The CartoDB id of the row in the input table.| -| cluster\_no | INTEGER | The cluster that this point belongs to. 
| +| cartodb\_id | INTEGER | The row id of the row from the input table | +| cluster\_no | INTEGER | The cluster that this point belongs to | #### Example Usage ```sql -SELECT - customers.*, - km.cluster_no - FROM cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) km, customers_3 - WHERE customers.cartodb_id = km.cartodb_id +SELECT + customers.*, + km.cluster_no +FROM + cdb_crankshaft.CDB_KMeans('SELECT * from customers' , 6) As km, + customers +WHERE + customers.cartodb_id = km.cartodb_id ``` ### CDB_WeightedMean(subquery text, weight_column text, category_column text) Function that computes the weighted centroid of a number of clusters by some weight column. -### Arguments +### Arguments | Name | Type | Description | |------|------|-------------| @@ -45,18 +48,75 @@ Function that computes the weighted centroid of a number of clusters by some wei | weight\_column | TEXT | The name of the column to use as a weight | | category\_column | TEXT | The name of the column to use as a category | -### Returns +### Returns A table with the following columns. 
| Column Name | Type | Description | |-------------|------|-------------| | the\_geom | GEOMETRY | A point for the weighted cluster center | -| class | INTEGER | The cluster class | +| class | INTEGER | The cluster class | -### Example Usage +### Example Usage -```sql -SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class -FROM cdb_weighted_mean('SELECT *, customer_value FROM customers','customer_value','cluster_no') +```sql +SELECT + ST_Transform(km.the_geom, 3857) As the_geom_webmercator, + km.class +FROM + cdb_crankshaft.CDB_WeightedMean( + 'SELECT *, customer_value FROM customers', + 'customer_value', + 'cluster_no') As km ``` + +## CDB_KMeansNonspatial(subquery text, colnames text[], no_clusters int) + +K-means clustering classifies the rows of your dataset into `no_clusters` clusters by finding the centers (means) of the variables in `colnames` and classifying each row by its proximity to the nearest center. This method partitions space into distinct Voronoi cells. + +As a standard machine learning method, k-means clustering is an unsupervised learning technique that finds the natural clustering of values. For instance, it is useful for finding subgroups in census data leading to demographic segmentation. + +### Arguments + +| Name | Type | Description | +|------|------|-------------| +| query | TEXT | SQL query to expose the data to be used in the analysis (e.g., `SELECT * FROM iris_data`). It should contain at least the columns specified in `colnames` and the `id_col`. | +| colnames | TEXT[] | Array of columns to be used in the analysis (e.g., `Array['petal_width', 'sepal_length', 'petal_length']`). 
| +| no\_clusters | INTEGER | Number of clusters for the classification of the data | +| id\_col (optional) | TEXT | The id column (default: 'cartodb_id') for identifying rows | +| standardize (optional) | BOOLEAN | Setting this to true (default) standardizes the data to have a mean at zero and a standard deviation of 1 | + +### Returns + +A table with the following columns. + +| Column | Type | Description | +|--------|------|-------------| +| cluster_label | TEXT | Label of the cluster that a row belongs to, a number from 0 to `no_clusters - 1`. | +| cluster_center | JSON | Center of the cluster that a row belongs to. The keys of the JSON object are the `colnames`, with values that are the center of the respective cluster | +| silhouettes | NUMERIC | [Silhouette score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html#sklearn.metrics.silhouette_score) of the cluster label | +| inertia | NUMERIC | Sum of squared distances of samples to their closest cluster center | +| rowid | BIGINT | id of the original row for associating back with the original data | + +### Example Usage + +```sql +SELECT + customers.*, + km.cluster_label, + km.cluster_center, + km.silhouettes +FROM + cdb_crankshaft.CDB_KMeansNonspatial( + 'SELECT * FROM customers', + Array['customer_value', 'avg_amt_spent', 'home_median_income'], + 7) As km, + customers +WHERE + customers.cartodb_id = km.rowid +``` + +### Resources + +- Read more in [scikit-learn's documentation](http://scikit-learn.org/stable/modules/clustering.html#k-means) +- [K-means basics](https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials) diff --git a/doc/12_segmentation.md b/doc/12_segmentation.md index b6b0c95..055554b 100644 --- a/doc/12_segmentation.md +++ b/doc/12_segmentation.md @@ -3,7 +3,7 @@ ### CDB_CreateAndPredictSegment(query TEXT, variable_name TEXT, target_query TEXT) -This function trains a [Gradient 
Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data. +This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data. #### Arguments @@ -34,12 +34,12 @@ A table with the following columns. SELECT * from cdb_crankshaft.CDB_CreateAndPredictSegment( 'SELECT agg, median_rent::numeric, male_pop::numeric, female_pop::numeric FROM late_night_agg', 'agg', -'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny'); +'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny'); ``` ### CDB_CreateAndPredictSegment(target numeric[], train_features numeric[], prediction_features numeric[], prediction_ids numeric[]) -This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data. +This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data. 
#### Arguments @@ -76,7 +76,7 @@ WITH training As ( FROM late_night_agg), target AS ( SELECT cdb_crankshaft.CDB_PyAgg(Array[median_rent, male_pop, female_pop]::Numeric[]) As features, - array_agg(cartodb_id) As cartodb_ids FROM late_night_agg) + array_agg(cartodb_id) As cartodb_ids FROM late_night_agg) SELECT cdb_crankshaft.CDB_CreateAndPredictSegment(training.target, training.features, target.features, target.cartodb_ids) FROM training, target; diff --git a/doc/13_PIA.md b/doc/13_PIA.md index c8986e2..155f193 100644 --- a/doc/13_PIA.md +++ b/doc/13_PIA.md @@ -23,11 +23,17 @@ Function to find the [PIA](https://en.wikipedia.org/wiki/Pole_of_inaccessibility #### Example Usage ```sql -with a as( - select st_geomfromtext('POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 
4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', 3857) as g +WITH a as ( + SELECT + ST_GeomFromText( + 'POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 
4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 
4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', + 3857) as g ), b as ( - select ST_Transform(g, 4326) as g from a + SELECT ST_Transform(g, 4326) as g + FROM a ) -SELECT st_astext(CDB_PIA(g)) from b; +SELECT + ST_AsText(cdb_crankshaft.CDB_PIA(g)) +FROM b ``` diff --git a/doc/14_densify.md b/doc/14_densify.md index 2cec7e6..962ad60 100644 --- a/doc/14_densify.md +++ b/doc/14_densify.md @@ -24,12 +24,22 @@ Returns a table object #### Example Usage ```sql -with data as ( - select - ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin, - ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin +WITH data as ( + SELECT + ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin, + ARRAY[ + ST_GeomFromText('POINT(2.1744 41.4036)'), + ST_GeomFromText('POINT(2.1228 41.3809)'), + ST_GeomFromText('POINT(2.1511 41.3742)'), + ST_GeomFromText('POINT(2.1528 41.4136)'), + ST_GeomFromText('POINT(2.165 41.3917)'), + ST_GeomFromText('POINT(2.1498 41.3713)'), + ST_GeomFromText('POINT(2.1533 41.3683)'), + ST_GeomFromText('POINT(2.131386 41.413998)') + ] as geomin ) -select CDB_Densify(geomin, colin, 2) from data; +SELECT cdb_crankshaft.CDB_Densify(geomin, colin, 2) +FROM data ``` diff --git a/doc/15_tinmap.md b/doc/15_tinmap.md index 240acbe..a04c5f3 100644 --- a/doc/15_tinmap.md +++ b/doc/15_tinmap.md @@ -26,11 +26,19 @@ Returns a table object #### Example Usage ```sql -with data as ( - select - ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin, - ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 
41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin +WITH data as ( + SELECT + ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin, + ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'), + ST_GeomFromText('POINT(2.1228 41.3809)'), + ST_GeomFromText('POINT(2.1511 41.3742)'), + ST_GeomFromText('POINT(2.1528 41.4136)'), + ST_GeomFromText('POINT(2.165 41.3917)'), + ST_GeomFromText('POINT(2.1498 41.3713)'), + ST_GeomFromText('POINT(2.1533 41.3683)'), + ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin ) -select CDB_TINmap(geomin, colin, 2) from data; +SELECT cdb_crankshaft.CDB_TINmap(geomin, colin, 2) +FROM data ``` diff --git a/doc/18_outliers.md b/doc/18_outliers.md index f557529..29bbc70 100644 --- a/doc/18_outliers.md +++ b/doc/18_outliers.md @@ -43,7 +43,7 @@ With a table `website_visits` and a column of the number of website visits in un ```sql SELECT id, - CDB_StaticOutlier(visits_10k, 11.0) As outlier, + cdb_crankshaft.CDB_StaticOutlier(visits_10k, 11.0) As outlier, visits_10k FROM website_visits ``` @@ -93,7 +93,7 @@ WITH cte As ( unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k ) SELECT - (CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).* + (cdb_crankshaft.CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).* FROM cte; ``` @@ -144,7 +144,7 @@ WITH cte As ( unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k ) SELECT - (CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).* + (cdb_crankshaft.CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).* FROM cte; ``` diff --git a/release/crankshaft--0.6.1--0.7.0.sql b/release/crankshaft--0.6.1--0.7.0.sql new file mode 100644 index 0000000..cc66ac6 --- /dev/null +++ b/release/crankshaft--0.6.1--0.7.0.sql @@ -0,0 +1,2165 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE 
EXTENSION crankshaft" to load this file. \quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.7.0'::text; +$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT PARALLEL SAFE; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION + CDB_PyAggS(current_state Numeric[], current_row Numeric[]) + returns NUMERIC[] as $$ + BEGIN + if array_upper(current_state,1) is null then + RAISE NOTICE 'setting state %',array_upper(current_row,1); + current_state[1] = array_upper(current_row,1); + end if; + return array_cat(current_state,current_row) ; + END + $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + +-- Create aggregate if it did not exist +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT * + FROM pg_catalog.pg_proc p + LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace + WHERE n.nspname = 'cdb_crankshaft' + AND p.proname = 'cdb_pyagg' + AND p.proisagg) + THEN + CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( + SFUNC = CDB_PyAggS, + STYPE = Numeric[], + PARALLEL = SAFE, + INITCOND = "{}" + ); + END IF; +END +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + target NUMERIC[], + features NUMERIC[], + target_features NUMERIC[], + target_ids NUMERIC[], + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION 
DEFAULT 0.5,
      learning_rate DOUBLE PRECISION DEFAULT 0.01,
      min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
AS $$
    import numpy as np
    import plpy

    from crankshaft.segmentation import create_and_predict_segment_agg
    model_params = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'learning_rate': learning_rate,
                    'min_samples_leaf': min_samples_leaf}

    def unpack2D(data):
        # The first element carries the row width; the rest is the flattened
        # matrix. NUMERIC values may arrive as Decimal/float, and reshape needs
        # integer dimensions, so cast and use integer division.
        dimension = int(data.pop(0))
        a = np.array(data, dtype=float)
        return a.reshape(len(a) // dimension, dimension)

    return create_and_predict_segment_agg(np.array(target, dtype=float),
                                          unpack2D(features),
                                          unpack2D(target_features),
                                          target_ids,
                                          model_params)

$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;

-- Query-based variant: forwards the queries and the model parameters to
-- crankshaft.segmentation.create_and_predict_segment.
CREATE OR REPLACE FUNCTION
  CDB_CreateAndPredictSegment (
      query TEXT,
      variable_name TEXT,
      target_table TEXT,
      n_estimators INTEGER DEFAULT 1200,
      max_depth INTEGER DEFAULT 3,
      subsample DOUBLE PRECISION DEFAULT 0.5,
      learning_rate DOUBLE PRECISION DEFAULT 0.01,
      min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
  from crankshaft.segmentation import create_and_predict_segment
  model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
  return create_and_predict_segment(query,variable_name,target_table, model_params)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- Gravity model, query-based entry point.
-- Runs target_query and source_query, aggregates cartodb_id, the_geom and the
-- requested weight / population column of each into arrays, and delegates to
-- the array-based CDB_Gravity overload.
CREATE OR REPLACE FUNCTION CDB_Gravity(
    IN target_query text,
    IN weight_column text,
    IN source_query text,
    IN pop_column text,
    IN target bigint,
    IN radius integer,
    IN minval numeric DEFAULT -10e307
    )
RETURNS TABLE(
    the_geom geometry,
    source_id bigint,
    target_id bigint,
    dist numeric,
    h numeric,
    hpop numeric) AS $$
DECLARE
    t_id bigint[];
    t_geom geometry[];
    t_weight numeric[];
    s_id bigint[];
    s_geom geometry[];
    s_pop numeric[];
BEGIN
    -- Text is concatenated with ||; PostgreSQL has no + operator for text.
    EXECUTE 'WITH foo as(' || target_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
    EXECUTE 'WITH foo as(' || source_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
    -- The aggregated arrays are the only inputs; no other relation is joined.
    RETURN QUERY
    SELECT g.* FROM CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
END;
$$ language plpgsql VOLATILE PARALLEL UNSAFE;

-- Gravity model, array-based implementation.
-- Non-point geometries are replaced by their centroids. For every source the
-- targets with weight > minval and within `radius` (geography distance) are
-- collected; h is the percentage share of tw/distance that `target` holds
-- among them, and hpop is the source population scaled by the same share.
CREATE OR REPLACE FUNCTION CDB_Gravity(
    IN t_id bigint[],
    IN t_geom geometry[],
    IN t_weight numeric[],
    IN s_id bigint[],
    IN s_geom geometry[],
    IN s_pop numeric[],
    IN target bigint,
    IN radius integer,
    IN minval numeric DEFAULT -10e307
    )
RETURNS TABLE(
    the_geom geometry,
    source_id bigint,
    target_id bigint,
    dist numeric,
    h numeric,
    hpop numeric) AS $$
DECLARE
    t_type text;
    s_type text;
    t_center geometry[];
    s_center geometry[];
BEGIN
    -- the geometry type is decided from the first element of each array
    t_type := GeometryType(t_geom[1]);
    s_type := GeometryType(s_geom[1]);
    IF t_type = 'POINT' THEN
        t_center := t_geom;
    ELSE
        WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
    END IF;
    IF s_type = 'POINT' THEN
        s_center := s_geom;
    ELSE
        WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
    END IF;
    RETURN QUERY
        with target0 as(
            SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
        ),
        source0 as(
            SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
        ),
        prev0 as(
            SELECT
                source0.sg,
                source0.sd as sourc_id,
                coalesce(source0.sp,0) as sp,
                target.td as targ_id,
                coalesce(target.tw,0) as tw,
                -- distances are clamped to at least 1 so they can be used as divisors
                GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
            FROM source0
            CROSS JOIN LATERAL
            (
                SELECT
                    *
                FROM target0
                WHERE tw > minval
                AND ST_DWithin(geography(source0.sc), geography(tc), radius)
            ) AS target
        ),
        deno as(
            SELECT
                sourc_id,
                sum(tw/distance) as h_deno
            FROM
                prev0
            GROUP BY sourc_id
        )
        SELECT
            p.sg as the_geom,
            p.sourc_id as source_id,
            p.targ_id as target_id,
            case when p.distance > 1 then p.distance else 0.0 end as dist,
            100*(p.tw/p.distance)/d.h_deno as h,
            p.sp*(p.tw/p.distance)/d.h_deno as hpop
        FROM
            prev0 p,
            deno d
        WHERE
            p.targ_id = target AND
            p.sourc_id = d.sourc_id;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

-- Interpolation methods:
-- 0: nearest neighbor(s)
-- 1: barycentric
-- 2: IDW
-- 3: kriging ---> TO DO


-- Query-based entry point: `query` must expose the columns the_geom and
-- attrib; they are aggregated into arrays and handed to the array-based
-- overload.
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
    IN query text,
    IN point geometry,
    IN method integer DEFAULT 1,
    IN p1 numeric DEFAULT 0,
    IN p2 numeric DEFAULT 0
    )
RETURNS numeric AS
$$
DECLARE
    gs geometry[];
    vs numeric[];
    output numeric;
BEGIN
    EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
    -- The CTE "a" only exists inside the dynamic statement above, so the
    -- array overload is called directly instead of selecting FROM a.
    output := CDB_SpatialInterpolation(gs, vs, point, method, p1, p2);

    RETURN output;
END;
$$
language plpgsql VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
    IN geomin geometry[],
    IN colin numeric[],
    IN point geometry,
    IN method integer DEFAULT 1,
    IN p1 numeric DEFAULT 0,
    IN p2 numeric DEFAULT 0
    )
RETURNS numeric AS
$$
DECLARE
    gs geometry[];
    vs numeric[];
    gs2 geometry[];
    vs2 numeric[];
    g geometry;
    vertex geometry[];
    sg numeric;
    sa numeric;
    sb numeric;
    sc numeric;
    va numeric;
    vb numeric;
    vc numeric;
    output numeric;
BEGIN
    -- output := -999.999;

    -- nearest neighbors
    -- p1: limit the number of neighbors, 0-> closest one
    IF method = 0 THEN

        IF p1 = 0 THEN
            p1 := 1;
        END IF;

        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
                b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
        SELECT avg(b.v) INTO output FROM b;
        RETURN output;

    --
barymetric + ELSIF method = 1 THEN + WITH a as (SELECT unnest(geomin) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom as v FROM b), + d as (SELECT v FROM c WHERE ST_Within(point, v)) + SELECT v INTO g FROM d; + IF g is null THEN + -- out of the realm of the input data + RETURN -888.888; + END IF; + -- vertex of the selected cell + WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v) + SELECT array_agg(v) INTO vertex FROM a; + + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + + SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc; + + output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg); + RETURN output; + + -- IDW + -- p1: limit the number of neighbors, 0->no limit + -- p2: order of distance decay, 0-> order 1 + ELSIF method = 2 THEN + + IF p2 = 0 THEN + p2 := 1; + END IF; + + WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v), + b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g) + SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b; + IF p1::integer>0 THEN + gs2:=gs; + vs2:=vs; + FOR i IN 1..p1 + LOOP + gs2 := gs2 || gs[i]; + vs2 := vs2 || vs[i]; + END LOOP; + ELSE + gs2:=gs; + vs2:=vs; + END IF; + + WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v), + b as ( + SELECT + (1/ST_distance(point, a.g)^p2::integer) as k, + (a.v/ST_distance(point, 
a.g)^p2::integer) as f + FROM a + ) + SELECT sum(b.f)/sum(b.k) INTO output FROM b; + RETURN output; + + -- krigin + ELSIF method = 3 THEN + + -- TO DO + + END IF; + + RETURN -777.777; + +END; +$$ +language plpgsql IMMUTABLE PARALLEL SAFE; +-- ============================================================================================= +-- +-- CDB_Voronoi +-- +-- ============================================================================================= +CREATE OR REPLACE FUNCTION CDB_voronoi( + IN geomin geometry[], + IN buffer numeric DEFAULT 0.5, + IN tolerance numeric DEFAULT 1e-9 + ) +RETURNS geometry AS $$ +DECLARE + geomout geometry; +BEGIN + -- we need to make the geometry calculations in (pseudo)meters!!! + with a as ( + SELECT unnest(geomin) as g1 + ), + b as( + SELECT st_transform(g1, 3857) g2 from a + ) + SELECT array_agg(g2) INTO geomin from b; + + WITH + convexhull_1 as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as g, + buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r + ), + clipper as( + SELECT + st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g + FROM convexhull_1 a + ), + env0 as ( + SELECT + (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e + FROM convexhull_1 a + ), + env as ( + SELECT + array_agg(env0.e) as e + FROM env0 + ), + sample AS ( + SELECT + ST_Collect(geomin || env.e) as geom + FROM env + ), + convexhull as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as cg + ), + tin as ( + SELECT + ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd + FROM + sample + ), + tin_polygons as ( + SELECT + (gd).Path as id, + (gd).Geom as pg, + ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct + FROM tin + ), + tin_lines as ( + SELECT + id, + ST_ExteriorRing(pg) as lg + FROM tin_polygons + ), + tin_nodes as ( + SELECT + id, + ST_PointN(lg,1) p1, + ST_PointN(lg,2) p2, + ST_PointN(lg,3) p3 + FROM tin_lines + ), + tin_edges AS ( + SELECT + p.id, + UNNEST(ARRAY[ + ST_MakeLine(n.p1,n.p2) , + 
ST_MakeLine(n.p2,n.p3) , + ST_MakeLine(n.p3,n.p1)]) as Edge, + ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct, + CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN + TRUE + ELSE FALSE END AS ctx, + p.pg, + ST_within(p.ct, convexhull.cg) as ctin + FROM + tin_polygons p, + tin_nodes n, + convexhull + WHERE p.id = n.id + ), + voro_nodes as ( + SELECT + CASE WHEN x.ctx = TRUE THEN + ST_Centroid(x.edge) + ELSE + x.ct + END as xct, + CASE WHEN y.id is null THEN + CASE WHEN x.ctin = TRUE THEN + ST_SetSRID(ST_MakePoint( + ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)), + ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer)) + ), ST_SRID(x.ct)) + END + ELSE + y.ct + END as yct + FROM + tin_edges x + LEFT OUTER JOIN + tin_edges y + ON x.id <> y.id AND ST_Equals(x.edge, y.edge) + ), + voro_edges as( + SELECT + ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v + FROM + voro_nodes + ), + voro_cells as( + SELECT + ST_Polygonize( + ST_Node( + ST_LineMerge( + ST_Union(v, ST_ExteriorRing( + ST_Convexhull(v) + ) + ) + ) + ) + ) as g + FROM + voro_edges + ), + voro_set as( + SELECT + (st_dump(v.g)).geom as g + FROM voro_cells v + ), + clipped_voro as( + SELECT + ST_intersection(c.g, v.g) as g + FROM + voro_set v, + clipper c + WHERE + ST_GeometryType(v.g) = 'ST_Polygon' + ) + SELECT + st_collect( + ST_Transform( + ST_ConvexHull(g), + 4326 + ) + ) + INTO geomout + FROM + clipped_voro; + RETURN geomout; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + +/** ---------------------------------------------------------------------------------------- + * @function : FindCircle + * @precis : Function that determines if three points form a circle. If so a table containing + * centre and radius is returned. If not, a null table is returned. 
* @version    : 1.0.1
  * @param      : p_pt1        : First point in curve
  * @param      : p_pt2        : Second point in curve
  * @param      : p_pt3        : Third point in curve
  * @return     : geometry     : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
  *                              or NULL if three points do not form a circle.
  * @history    : Simon Greener - Feb 2012 - Original coding.
  *               Rafa de la Torre - Aug 2016 - Small fix for type checking
  *               Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
  * @copyright  : Simon Greener @ 2012
  *               Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
**/
CREATE OR REPLACE FUNCTION _Find_Circle(
    IN p_pt1 geometry,
    IN p_pt2 geometry,
    IN p_pt3 geometry)
  RETURNS geometry AS
$BODY$
DECLARE
   v_radius NUMERIC;
   v_CX     NUMERIC;
   v_CY     NUMERIC;
   v_dA     NUMERIC;
   v_dB     NUMERIC;
   v_dC     NUMERIC;
   v_dD     NUMERIC;
   v_dE     NUMERIC;
   v_dF     NUMERIC;
   v_dG     NUMERIC;
BEGIN
   -- only points are accepted; anything else aborts the call
   IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR
        ST_GeometryType(p_pt2) <> 'ST_Point' OR
        ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN
      RAISE EXCEPTION 'All supplied geometries must be points.';
   END IF;
   v_dA := ST_X(p_pt2) - ST_X(p_pt1);
   v_dB := ST_Y(p_pt2) - ST_Y(p_pt1);
   v_dC := ST_X(p_pt3) - ST_X(p_pt1);
   v_dD := ST_Y(p_pt3) - ST_Y(p_pt1);
   v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2));
   v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3));
   v_dG := 2.0 * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2)));
   -- If v_dG is zero then the three points are collinear and no finite-radius
   -- circle through them exists.
   IF ( v_dG = 0 ) THEN
      RETURN NULL;
   END IF;
   v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
   v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
   v_radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) );
   -- centre in X/Y, radius in Z, SRID taken from the first point
   RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1));
END;
$BODY$
  LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;

-- Moran's I Global Measure (public-facing)
-- Thin wrapper around crankshaft.clustering.Moran.global_stat.
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestGlobal(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  # TODO: use named parameters or a dictionary
  moran = Moran()
  return moran.global_stat(subquery, column_name, w_type,
                           num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- Moran's I Local (internal function)
-- Thin wrapper around crankshaft.clustering.Moran.local_stat; the public
-- functions below select from it.
CREATE OR REPLACE FUNCTION
  _CDB_AreasOfInterestLocal(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  # TODO: use named parameters or a dictionary
  return moran.local_stat(subquery, column_name, w_type,
                          num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- Moran's I Local (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestLocal(
    subquery TEXT,
    column_name TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialHotspots(
    subquery TEXT,
    column_name TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
 RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HH', 'HL');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialColdspots(
    subquery TEXT,
    attr TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
 RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('LL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialOutliers(
    subquery TEXT,
    attr TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
 RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Global Rate (public-facing function)
-- Thin wrapper around crankshaft.clustering.Moran.global_rate_stat.
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestGlobalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  # TODO: use named parameters or a dictionary
  return moran.global_rate_stat(subquery, numerator, denominator, w_type,
                                num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;


-- Moran's I Local Rate (internal function)
-- Thin wrapper around crankshaft.clustering.Moran.local_rate_stat; the public
-- rate functions below select from it.
CREATE OR REPLACE FUNCTION
  _CDB_AreasOfInterestLocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  # TODO: use named parameters or a dictionary
  return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestLocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialHotspotsRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HH', 'HL');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialColdspotsRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('LL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialOutliersRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Spatial k-means clustering
-- Thin wrapper around crankshaft.clustering.Kmeans.spatial.

CREATE OR REPLACE FUNCTION CDB_KMeans(
  query TEXT,
  no_clusters INTEGER,
  no_init INTEGER DEFAULT 20
)
RETURNS TABLE(
  cartodb_id INTEGER,
  cluster_no INTEGER
) AS $$

from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- no_clusters: number of requested clusters
-- standardize: whether to scale variables to a mean of zero and a standard
--              deviation of 1
-- id_col: name of the id column

CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
  query TEXT,
  colnames TEXT[],
  no_clusters INTEGER,
  standardize BOOLEAN DEFAULT true,
  id_col TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(
  cluster_label text,
  cluster_center json,
  silhouettes numeric,
  inertia numeric,
  rowid bigint
) AS $$

from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
                         standardize=standardize,
                         id_col=id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;


-- State transition function of the CDB_WeightedMean aggregate.
-- state = [sum(x * weight), sum(y * weight), sum(weight)]
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
  state NUMERIC[],
  the_geom GEOMETRY(Point, 4326),
  weight NUMERIC
)
RETURNS Numeric[] AS $$
BEGIN
    -- rows without a weight or a geometry leave the running sums untouched
    IF weight IS NULL OR the_geom IS NULL THEN
        RETURN Array[state[1], state[2], state[3]];
    END IF;

    RETURN Array[state[1] + ST_X(the_geom)*weight,
                 state[2] + ST_Y(the_geom)*weight,
                 state[3] + weight];
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


CREATE OR
REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
RETURNS GEOMETRY AS
$$
BEGIN
    -- Final function of the CDB_WeightedMean aggregate:
    -- state = [sum(x * weight), sum(y * weight), sum(weight)].
    -- A zero total weight cannot be divided by, so the raw sums are returned.
    IF state[3] = 0 THEN
        RETURN ST_SetSRID(ST_MakePoint(state[1], state[2]), 4326);
    END IF;
    RETURN ST_SetSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]), 4326);
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create the aggregate only if it does not exist yet
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_proc p
            LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_weightedmean'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
            SFUNC = CDB_WeightedMeanS,
            FINALFUNC = CDB_WeightedMeanF,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{0.0,0.0,0.0}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov

-- input table format:
-- id | geom | date_1 | date_2 | date_3
--  1 | Pt1  | 12.3   |  13.1  | 14.2
--  2 | Pt2  | 11.0   |  13.2  | 12.5
-- ...
-- Sample function call:
-- SELECT CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
--                               Array['date_1', 'date_2', 'date_3'])

-- Thin wrapper around crankshaft.space_time_dynamics.Markov.spatial_trend.
CREATE OR REPLACE FUNCTION
  CDB_SpatialMarkovTrend (
      subquery TEXT,
      time_cols TEXT[],
      num_classes INT DEFAULT 7,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$

  from crankshaft.space_time_dynamics import Markov
  markov = Markov()

  ## TODO: use named parameters or a dictionary
  return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- input table format: identical to above but in a predictable format
-- Sample function call:
-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
--                          'date_1')


-- CREATE OR REPLACE FUNCTION
--   cdb_spatial_markov (
--       subquery TEXT,
--       time_col_min text,
--       time_col_max text,
--       date_format text, -- '_YYYY_MM_DD'
--       num_time_per_bin INT DEFAULT 1,
--       permutations INT DEFAULT 99,
--       geom_column TEXT DEFAULT 'the_geom',
--       id_col TEXT DEFAULT 'cartodb_id',
--       w_type TEXT DEFAULT 'knn',
--       num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
--   plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
--   from crankshaft.clustering import moran_local
--   # TODO: use named parameters or a dictionary
--   return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpythonu;
--
-- -- input table format:
-- -- id | geom | date  | measurement
-- --  1 | Pt1  | 12/3  | 13.2
-- --  2 | Pt2  | 11/5  | 11.3
-- --  3 | Pt1  | 11/13 | 12.9
-- --  4 | Pt3  | 12/19 | 10.1
-- -- ...
--
-- CREATE OR REPLACE FUNCTION
--   cdb_spatial_markov (
--       subquery TEXT,
--       time_col text,
--       num_time_per_bin INT DEFAULT 1,
--       permutations INT DEFAULT 99,
--       geom_column TEXT DEFAULT 'the_geom',
--       id_col TEXT DEFAULT 'cartodb_id',
--       w_type TEXT DEFAULT 'knn',
--       num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
--   plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
--   from crankshaft.clustering import moran_local
--   # TODO: use named parameters or a dictionary
--   return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpythonu;
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires: https://github.com/CartoDB/cartodb-postgresql

-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires:
-- https://github.com/CartoDB/cartodb-postgresql

-- Iterative grid search for the point whose signed distance to the polygon
-- outline (see _Signed_Dist) is largest. The polygon is processed in
-- EPSG:3857 and the result is returned in EPSG:4326. A cell is subdivided
-- only while it could still beat the best distance by more than `tolerance`.
CREATE OR REPLACE FUNCTION CDB_PIA(
    IN polygon geometry,
    IN tolerance numeric DEFAULT 1.0
    )
RETURNS geometry  AS $$
DECLARE
    env geometry[];
    cells geometry[];
    cell geometry;
    best_c geometry;
    best_d numeric;
    test_d numeric;
    test_mx numeric;
    test_h numeric;
    test_cells geometry[];
    width numeric;
    height numeric;
    h numeric;
    i integer;
    n integer;
    sqr numeric;
    p geometry;
BEGIN
    -- half of sqrt(2): centre-to-corner distance of a unit square
    sqr := 0.5*(|/2.0);
    polygon := ST_Transform(polygon, 3857);

    -- grid #0 cell size: half of the shorter side of the bounding box
    width := ST_XMax(polygon) - ST_XMin(polygon);
    height := ST_YMax(polygon) - ST_YMin(polygon);
    h := 0.5*LEAST(height, width);

    -- grid #0
    WITH grid AS (
        SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) AS c
    )
    SELECT array_agg(c) INTO cells FROM grid;

    -- 1st guess: the centroid (best_c holds a geometry whose centroid is the
    -- candidate, hence the polygon itself)
    best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(polygon));
    best_c := polygon;

    -- walk the queue of cells; subdivided cells are appended to its end
    i := 1;
    n := array_length(cells,1);
    LOOP

        EXIT WHEN i > n;

        cell := cells[i];
        i := i+1;

        -- cell side size, it's square
        test_h := ST_XMax(cell) - ST_XMin(cell);

        -- signed distance of the cell centre
        test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));

        IF test_d > best_d THEN
            best_c := cell;
            best_d := test_d;
        END IF;

        -- largest distance any point of the cell could reach
        test_mx := test_d + (test_h * sqr);

        -- the cell cannot contain a sufficiently better point: skip it
        IF test_mx - best_d <= tolerance THEN
            CONTINUE;
        END IF;

        -- resample the cell into cells of half the side length
        WITH grid AS (
            SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) AS c
        )
        SELECT array_agg(c) INTO test_cells FROM grid;

        -- queue the new cells and refresh the loop bound
        cells := cells || test_cells;
        n := array_length(cells,1);

    END LOOP;

    RETURN ST_Transform(ST_Centroid(best_c), 4326);

END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;



-- signed distance point to polygon with holes
-- negative
-- if the point is outside the polygon
-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
CREATE OR REPLACE FUNCTION _Signed_Dist(
    IN polygon geometry,
    IN point geometry
    )
RETURNS numeric  AS $$
DECLARE
    pols geometry[];
    pol geometry;
    i integer;
    j integer;
    within integer;
    w integer;
    holes integer;
    dist numeric;
    d numeric;
BEGIN
    dist := 1e999;
    -- split multi-geometries / collections into their parts
    WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
    FOR j in 1..array_length(pols, 1)
    LOOP
        pol := pols[j];
        -- distance to the outer ring, never larger than the best so far
        d := LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric);
        -- sign for this part: 1 inside, -1 otherwise
        w := CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END;
        holes := ST_NumInteriorRings(pol);
        IF holes > 0 THEN
            FOR i IN 1..holes
            LOOP
                d := LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric);
            END LOOP;
        END IF;
        -- keep the sign of the part that yielded the closest ring
        IF d < dist THEN
            dist:= d;
            within := w;
        END IF;
    END LOOP;
    dist := dist * within::numeric;
    RETURN dist;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Iterative densification of a set of points using Delaunay triangulation
-- the new points have as assigned value the average value of the 3 vertex (centroid)
--
-- @param geomin - array of geometries (points)
--
-- @param colin - array of numeric values in that points
--
-- @param iterations - integer, number of iterations
--
--
-- Returns: TABLE(geomout geometry, colout numeric)
--
--
CREATE OR REPLACE FUNCTION CDB_Densify(
    IN geomin geometry[],
    IN colin numeric[],
    IN iterations integer
    )
RETURNS TABLE(geomout geometry, colout numeric)  AS $$
DECLARE
    geotemp geometry[];
    coltemp numeric[];
    i integer;
    gs geometry[];
    g geometry;
    vertex geometry[];
    va numeric;
    vb numeric;
    vc numeric;
    center geometry;
    centerval numeric;
BEGIN
    geotemp := geomin;
    coltemp := colin;
    FOR i IN 1..iterations
    LOOP
        -- generate TIN
        WITH    a as (SELECT unnest(geotemp) AS e),
                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
                c as (SELECT (ST_Dump(t)).geom AS v FROM b)
        SELECT array_agg(v) INTO gs FROM c;
        -- loop cells
        FOREACH g IN ARRAY gs
        LOOP
            center := ST_Centroid(g);
            -- retrieve the value of each vertex
            -- NOTE: the lookups run while geotemp and coltemp still have the
            -- same length. Unnesting arrays of different lengths side by side
            -- does not pair them element by element on PostgreSQL < 10.
            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
            SELECT array_agg(v) INTO vertex FROM a;
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
            -- calc the value at the center
            centerval := (va + vb + vc) / 3;
            -- append the centroid and its value together so both arrays stay aligned
            geotemp := array_append(geotemp, center);
            coltemp := array_append(coltemp, centerval);
        END LOOP;
    END LOOP;
    RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

-- Densifies the input with CDB_Densify, triangulates the result and returns
-- each triangle together with the average value of its three vertices.
CREATE OR REPLACE FUNCTION CDB_TINmap(
    IN geomin geometry[],
    IN colin numeric[],
    IN iterations integer
    )
RETURNS TABLE(geomout geometry, colout numeric)  AS $$
DECLARE
    p geometry[];
    vals numeric[];
    gs geometry[];
    g geometry;
    vertex geometry[];
    centerval numeric;
    va numeric;
    vb numeric;
    vc numeric;
    coltemp numeric[];
BEGIN
    SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
    WITH    a as (SELECT unnest(p) AS e),
            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
            c as (SELECT (ST_Dump(t)).geom AS v FROM b)
    SELECT array_agg(v) INTO gs FROM c;
    FOREACH g IN ARRAY gs
    LOOP
        -- retrieve the vertex of each triangle
        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
        SELECT array_agg(v) INTO vertex FROM a;
        -- retrieve the value of each vertex
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
        -- calc the value at the center
        centerval := (va + vb + vc) / 3;
        -- append the value
        coltemp := array_append(coltemp, centerval);
    END LOOP;
    RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- Getis-Ord's G
-- Hotspot/Coldspot Analysis tool
-- Thin wrapper around crankshaft.clustering.Getis.getis_ord.
CREATE OR REPLACE FUNCTION
    CDB_GetisOrdsG(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 999,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
AS $$
  from crankshaft.clustering import Getis
  getis = Getis()
  return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

-- TODO: make a version that accepts the values as arrays

-- Find outliers using a static threshold
--
CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
RETURNS boolean
AS $$
BEGIN

  RETURN column_value > threshold;

END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;

-- Find outliers by a percentage above the threshold
-- TODO: add symmetric option?
-- `is_symmetric boolean DEFAULT false`

-- CDB_PercentOutlier: flag every value whose ratio to the mean of all values
-- is greater than `outlier_fraction`.
--
-- @param column_values     values to test
-- @param outlier_fraction  threshold compared against value / mean
-- @param ids               row identifiers returned next to each flag
--
-- Returns: one (is_outlier, rowid) row per element.
-- Raises an exception when the mean is exactly zero (the ratio is undefined).
-- NOTE(review): flags and ids are paired positionally through two parallel
-- unnest() calls, so both arrays are presumably expected to have the same
-- length -- confirm against callers.
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
  avg_val numeric;
  out_vals boolean[];
BEGIN

  SELECT avg(i) INTO avg_val
    FROM unnest(column_values) As x(i);

  -- a zero mean would make every ratio below a division by zero
  IF avg_val = 0 THEN
    RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
  END IF;

  SELECT array_agg(
           outlier_fraction < i / avg_val) INTO out_vals
    FROM unnest(column_values) As x(i);

  RETURN QUERY
  SELECT unnest(out_vals) As is_outlier,
         unnest(ids) As rowid;

END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Find outliers above a given number of standard deviations from the mean
--
-- @param column_values   values to test
-- @param num_deviations  threshold, in standard deviations from the mean
-- @param ids             row identifiers returned next to each flag
-- @param is_symmetric    when true the absolute deviation is tested, so values
--                        far below the mean are flagged too; when false only
--                        values above the mean can be flagged
--
-- Returns: one (is_outlier, rowid) row per element.
-- Raises an exception when the standard deviation is exactly zero.

CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
  stddev_val numeric;
  avg_val numeric;
  out_vals boolean[];
BEGIN

  SELECT stddev(i), avg(i) INTO stddev_val, avg_val
    FROM unnest(column_values) As x(i);

  -- a zero spread would make every z-score below a division by zero
  IF stddev_val = 0 THEN
    RAISE EXCEPTION 'Standard deviation of input data is zero';
  END IF;

  IF is_symmetric THEN
    SELECT array_agg(
             abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
      FROM unnest(column_values) As x(i);
  ELSE
    SELECT array_agg(
             (i - avg_val) / stddev_val > num_deviations) INTO out_vals
      FROM unnest(column_values) As x(i);
  END IF;

  RETURN QUERY
  SELECT unnest(out_vals) As is_outlier,
         unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- CDB_Contour: build a rectangular grid over the (buffered) extent of the
-- input geometries, interpolate a value at every cell centroid, classify the
-- values into bins and return one unioned geometry per bin.
--
-- @param geomin       sample geometries
-- @param colin        value attached to each sample geometry
-- @param buffer       factor applied to the radius used to expand the hull
-- @param intmethod    1 interpolates with _interp_in_tin over a TIN built
--                     once here; any other value is forwarded to
--                     CDB_SpatialInterpolation as its method
-- @param classmethod  0 equal interval, 1 heads/tails, 2 Jenks,
--                     anything else quantiles
-- @param steps        number of bins requested from the classifier
-- @param max_time     see the NOTE(review) at the top of the body
--
-- Returns: (the_geom, bin, min_value, max_value, avg_value) per non-null bin.
CREATE OR REPLACE FUNCTION CDB_Contour(
    IN geomin geometry[],
    IN colin numeric[],
    IN buffer numeric,
    IN intmethod integer,
    IN classmethod integer,
    IN steps integer,
    IN max_time integer DEFAULT 60000
    )
RETURNS TABLE(
    the_geom geometry,
    bin integer,
    min_value numeric,
    max_value numeric,
    avg_value numeric
)  AS $$
DECLARE
    cell_count integer;
    tin geometry[];
    resolution integer;
BEGIN

    -- nasty trick to override issue #121
    -- NOTE(review): after these statements `resolution` holds the caller's
    -- max_time (0 replaced by -90) and max_time holds its negation. Further
    -- down a positive `resolution` is used directly as the grid cell size,
    -- while a zero/negative one makes the cell size derive from cell_count.
    -- So a positive max_time effectively acts as a cell size -- confirm this
    -- is the intended contract.
    IF max_time = 0 THEN
        max_time = -90;
    END IF;
    resolution := max_time;
    max_time := -1 * resolution;

    -- calc the optimal number of cells for the current dataset
    -- (one linear model per interpolation method, fed with the negated
    -- max_time and the number of input geometries)
    SELECT
        CASE intmethod
            WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
            WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
            WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
            ELSE 10000
        END INTO cell_count;

    -- we don't have iterative barycentric interpolation in CDB_interpolation,
    -- and it's a costy function, so let's make a custom one here till
    -- we update the code
    -- tin := ARRAY[]::geometry[];
    IF intmethod=1 THEN
        WITH
            a as (SELECT unnest(geomin) AS e),
            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
            c as (SELECT (ST_Dump(t)).geom as v FROM b)
        SELECT array_agg(v) INTO tin FROM c;
    END IF;
    -- Delaunay stuff performed just ONCE!!

    -- magic
    RETURN QUERY
    WITH
    -- convex hull of the samples plus an expansion radius
    -- (presumably the radius of the circle with the hull's area, times buffer)
    convexhull as (
        SELECT
            ST_ConvexHull(ST_Collect(geomin)) as g,
            buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
    ),
    -- bounding box of the hull expanded by that radius
    envelope as (
        SELECT
            st_expand(a.g, a.r) as e
        FROM convexhull a
    ),
    -- same box in EPSG:3857, where the grid is laid out
    envelope3857 as(
        SELECT
            ST_Transform(e, 3857) as geom
        FROM envelope
    ),
    -- cell size: derived from cell_count when `resolution` is not positive,
    -- otherwise `resolution` itself
    resolution as(
        SELECT
            CASE WHEN resolution <= 0 THEN
                round(|/ (
                    ST_area(geom) / abs(cell_count)
                ))
            ELSE
                resolution
            END AS cell
        FROM envelope3857
    ),
    -- square cells, transformed to EPSG:4326
    grid as(
        SELECT
            ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
        FROM envelope3857 e, resolution r
    ),
    -- interpolated value at each cell centroid
    interp as(
        SELECT
            geom,
            CASE
                WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
                ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
            END as val
        FROM grid
    ),
    -- bin boundaries computed over the non-null interpolated values
    classes as(
        SELECT CASE
            WHEN classmethod = 0 THEN
                cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
            WHEN classmethod = 1 THEN
                cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
            WHEN classmethod = 2 THEN
                cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
            ELSE
                cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
            END as b
        FROM interp
        where val is not null
    ),
    -- attach the single boundaries row to every cell and bucket its value
    classified as(
        SELECT
            i.*,
            width_bucket(i.val, c.b) as bucket
        FROM interp i left join classes c
        ON 1=1
    ),
    -- fold bucket number `steps` into the bucket just below it
    classified2 as(
        SELECT
            geom,
            val,
            CASE
                WHEN bucket = steps THEN bucket - 1
                ELSE bucket
            END as b
        FROM classified
    ),
    -- one unioned geometry and value summary per bin
    final as(
        SELECT
            st_union(geom) as the_geom,
            b as bin,
            min(val) as min_value,
            max(val) as max_value,
            avg(val) as avg_value
        FROM classified2
        GROUP BY bin
    )
    SELECT
        *
    FROM final
    where final.bin is not null
    ;
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;


-- =====================================================================
-- Interp in grid, so we can use barycentric with a
-- precalculated tin (NNI)
-- =====================================================================
-- _interp_in_tin: barycentric interpolation of `point` inside a triangulation
-- that the caller has already computed.
--
-- @param geomin  sample geometries (matched against the triangle vertices)
-- @param colin   value attached to each sample geometry
-- @param tin     triangles to search
-- @param point   location to interpolate at
--
-- Returns: the area-weighted combination of the three vertex values, or NULL
-- when no triangle of `tin` contains the point.
CREATE OR REPLACE FUNCTION _interp_in_tin(
    IN geomin geometry[],
    IN colin numeric[],
    IN tin geometry[],
    IN point geometry
    )
RETURNS numeric AS
$$
DECLARE
    cell geometry;        -- triangle containing the point
    corners geometry[];   -- points dumped from that triangle
    cell_area numeric;
    area_a numeric;       -- sub-triangle opposite corner 1
    area_b numeric;       -- sub-triangle opposite corner 2
    area_c numeric;       -- sub-triangle opposite corner 3
    val_a numeric;
    val_b numeric;
    val_c numeric;
BEGIN
    -- locate the triangle the point falls within
    SELECT t.tri INTO cell
      FROM unnest(tin) AS t(tri)
     WHERE ST_Within(point, t.tri);

    -- outside every triangle: nothing to interpolate
    IF cell IS NULL THEN
        RETURN NULL;
    END IF;

    -- corner points of the selected triangle
    SELECT array_agg(d.geom) INTO corners
      FROM ST_DumpPoints(cell) AS d;

    -- sample value sitting on each corner
    SELECT s.val INTO val_a
      FROM (SELECT unnest(geomin) AS pt, unnest(colin) AS val) s
     WHERE ST_Equals(s.pt, corners[1]);

    SELECT s.val INTO val_b
      FROM (SELECT unnest(geomin) AS pt, unnest(colin) AS val) s
     WHERE ST_Equals(s.pt, corners[2]);

    SELECT s.val INTO val_c
      FROM (SELECT unnest(geomin) AS pt, unnest(colin) AS val) s
     WHERE ST_Equals(s.pt, corners[3]);

    -- whole triangle, then the three sub-triangles spanned by the point
    cell_area := ST_area(cell);
    area_a := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[2], corners[3], point])));
    area_b := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[3], point])));
    area_c := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[2], point])));

    -- missing areas/values count as 0, a missing total area as 1
    RETURN (coalesce(area_a,0) * coalesce(val_a,0)
          + coalesce(area_b,0) * coalesce(val_b,0)
          + coalesce(area_c,0) * coalesce(val_c,0)) / coalesce(cell_area,1);
END;
$$
language plpgsql IMMUTABLE PARALLEL SAFE;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Areal weighting is a very simple form of areal interpolation.
--
-- Parameters:
--   * geom a Polygon geometry which defines the area where a value will be
--     estimated as the area-weighted sum of a given table/column
--   * target_table_name table name of the table that provides the values
--   * target_column column name of the column that provides the values
--   * schema_name optional parameter to define the schema the target table
--     belongs to, which is necessary if it's not in the search_path.
--     Note that target_table_name should never include the schema in it.
-- Return value:
--   Areal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
  RETURNS numeric AS
$$
DECLARE
  result numeric;
  qualified_name text;
BEGIN
  -- Quote every identifier with %I. The table name used to be interpolated
  -- with %s when a schema was given, which left it unquoted (mixed-case names
  -- failed and arbitrary SQL could be injected through it).
  IF schema_name IS NULL THEN
    qualified_name := Format('%I', target_table_name);
  ELSE
    qualified_name := Format('%I.%I', schema_name, target_table_name);
  END IF;
  -- the geometry itself travels as a bound parameter ($1), never as text
  EXECUTE Format('
    SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
    FROM %s AS a
    WHERE $1 && a.the_geom
  ', target_column, qualified_name)
  USING geom
  INTO result;
  RETURN result;
END;
$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
-- CDB_GWR: thin PL/Python wrapper that forwards all of its arguments,
-- unchanged and in order, to crankshaft.regression.GWR.gwr.
CREATE OR REPLACE FUNCTION
CDB_GWR(subquery text, dep_var text, ind_vars text[],
        bw numeric default null, fixed boolean default False,
        kernel text default 'bisquare', geom_col text default 'the_geom',
        id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
              filtered_t_vals JSON, predicted numeric,
              residuals numeric, r_squared numeric, bandwidth numeric,
              rowid bigint)
AS $$

from crankshaft.regression import GWR

gwr = GWR()

return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;


-- CDB_GWR_Predict: thin PL/Python wrapper that forwards all of its arguments,
-- unchanged and in order, to crankshaft.regression.GWR.gwr_predict.
CREATE OR REPLACE FUNCTION
CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
                bw numeric default null, fixed boolean default False,
                kernel text default 'bisquare',
                geom_col text default 'the_geom',
                id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
              r_squared numeric, predicted numeric, rowid bigint)
AS $$

from crankshaft.regression import GWR
gwr = GWR()

return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
--
-- Creates N points randomly distributed around the polygon
--
-- @param geom - the geometry to be turned into points
--
-- @param no_points - the number of points to generate; NULL, zero or a
--   negative number generates no points
--
-- @param max_iter_per_point - each point is drawn on a random horizontal
--   scanline across the geometry's bounding box. A scanline that yields no
--   point counts as a miss; max_iter_per_point specifies how many consecutive
--   misses the function accepts for one point before giving up and returning
--   the points collected so far.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point  Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
  extent GEOMETRY;
  test_point Geometry;
  width NUMERIC;
  height NUMERIC;
  x0 NUMERIC;
  y0 NUMERIC;
  yp NUMERIC;
  srid INTEGER;
  no_left INTEGER;
  remaining_iterations INTEGER;
  points GEOMETRY[];
  bbox_line GEOMETRY;
  intersection_line GEOMETRY;
BEGIN
  extent  := ST_Envelope(geom);
  width   := ST_XMax(extent) - ST_XMin(extent);
  height  := ST_YMax(extent) - ST_YMin(extent);
  x0      := ST_XMin(extent);
  y0      := ST_YMin(extent);
  -- build the scanline in the geometry's own SRID instead of a hard-coded
  -- 4326, so ST_Intersection does not fail on other reference systems
  srid    := ST_SRID(geom);
  no_left := no_points;
  remaining_iterations := coalesce(max_iter_per_point, 1000);

  -- `> 0` (rather than `= 0` as the exit test) also stops on NULL and on
  -- negative counts, which previously looped forever
  WHILE no_left > 0 LOOP
    yp := y0 + height*random();
    -- horizontal line at y = yp spanning the bounding box: points are (x, y);
    -- the arguments used to be passed as (yp, x0), i.e. swapped
    bbox_line := ST_MakeLine(
      ST_SetSRID(ST_MakePoint(x0, yp), srid),
      ST_SetSRID(ST_MakePoint(x0 + width, yp), srid)
    );
    intersection_line := ST_Intersection(bbox_line, geom);
    test_point := ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)), random());

    IF test_point IS NULL THEN
      -- miss: spend one retry, give up once the budget for this point is gone
      remaining_iterations := remaining_iterations - 1;
      EXIT WHEN remaining_iterations <= 0;
      CONTINUE;
    END IF;

    points := points || test_point;
    no_left := no_left - 1;
    -- fresh retry budget for the next point
    remaining_iterations := coalesce(max_iter_per_point, 1000);
  END LOOP;
  RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;

-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;

-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
--
-- Fill given extent with a rectangular coverage
--
-- @param ext Extent to fill. Only rectangles with center point falling
--            inside the extent (or at the lower or leftmost edge) will
--            be emitted. The returned rectangles will have the same SRID
--            as this extent.
--
-- @param width Width of each rectangle
--
-- @param height Height of each rectangle
--
-- @param origin Optional origin to allow for exact tiling.
--               If omitted the origin will be 0,0.
--               The parameter is checked for having the same SRID
--               as the extent.
--
--
-- CDB_RectangleGrid: emit width x height rectangles whose centers lie on a
-- grid aligned to `origin` (0,0 when omitted) and fall within the extent's
-- X/Y range. Rectangles carry the SRID of `ext`.
--
-- Raises an exception when `origin` has a different SRID than `ext`, or when
-- width/height is not strictly positive (a zero step divides by zero and a
-- negative step would never reach the far edge of the extent).
CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
RETURNS SETOF GEOMETRY
AS $$
DECLARE
  h GEOMETRY; -- rectangle cell
  hstep FLOAT8; -- horizontal step
  vstep FLOAT8; -- vertical step
  hw FLOAT8; -- half width
  hh FLOAT8; -- half height
  vstart FLOAT8;
  hstart FLOAT8;
  hend FLOAT8;
  vend FLOAT8;
  xoff FLOAT8;
  yoff FLOAT8;
  xgrd FLOAT8;
  ygrd FLOAT8;
  x FLOAT8;
  y FLOAT8;
  srid INTEGER;
BEGIN

  -- both loops below advance by width/height; refuse steps that cannot
  -- terminate instead of looping forever
  IF width <= 0 OR height <= 0 THEN
    RAISE EXCEPTION 'Rectangle width (%) and height (%) must be positive', width, height;
  END IF;

  srid := ST_SRID(ext);

  xoff := 0;
  yoff := 0;

  IF origin IS NOT NULL THEN
    IF ST_SRID(origin) != srid THEN
      RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
    END IF;
    xoff := ST_X(origin);
    yoff := ST_Y(origin);
  END IF;

  --RAISE DEBUG 'X offset: %', xoff;
  --RAISE DEBUG 'Y offset: %', yoff;

  hw := width/2.0;
  hh := height/2.0;

  -- cell centers are snapped to multiples of the half width / half height
  xgrd := hw;
  ygrd := hh;
  --RAISE DEBUG 'X grid size: %', xgrd;
  --RAISE DEBUG 'Y grid size: %', ygrd;

  hstep := width;
  vstep := height;

  -- Tweak horizontal start on hstep grid from origin
  hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
  --RAISE DEBUG 'hstart: %', hstart;

  -- Tweak vertical start on vstep grid from origin
  vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
  --RAISE DEBUG 'vstart: %', vstart;

  hend := ST_XMax(ext);
  vend := ST_YMax(ext);

  --RAISE DEBUG 'hend: %', hend;
  --RAISE DEBUG 'vend: %', vend;

  x := hstart;
  WHILE x < hend LOOP -- over X
    y := vstart;
    -- first cell of the column; the following ones are translated copies
    h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
    WHILE y < vend LOOP -- over Y
      RETURN NEXT h;
      h := ST_Translate(h, 0, vstep);
      y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
    END LOOP;
    x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
  END LOOP;

  RETURN;
END
$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;

--
-- Calculate the equal interval bins for a given column
--
-- @param in_array A numeric array of numbers to
-- determine the best
--                 to determine the bin boundary
--
-- @param breaks The number of bins you want to find.
--
--
-- Returns: upper edges of bins. The range between the smallest and largest
--          non-null value is cut into `breaks` slices of equal width; the
--          last edge is always the maximum itself.
--
--

CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
DECLARE
    lo numeric;         -- smallest value
    hi numeric;         -- largest value
    step numeric;       -- width of one bin
    idx INT := 1;
    edges numeric[];
BEGIN
    -- min()/max() skip NULL elements on their own
    SELECT min(t.v), max(t.v) INTO lo, hi FROM unnest(in_array) AS t(v);
    step := (hi - lo) / breaks::numeric;

    -- inner edges: lo + 1*step .. lo + (breaks-1)*step
    WHILE idx < breaks LOOP
        edges := array_append(edges, lo + idx::numeric * step);
        idx := idx + 1;
    END LOOP;

    -- close the last bin on the exact maximum
    RETURN array_append(edges, hi);
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

--
-- Determine the Heads/Tails classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--                 bins based on the Heads/Tails method.
--
-- @param breaks The number of bins you want to find.
--
--
-- Returns: the mean of all values followed by the mean of the values above
--          the previous mean, repeated until `breaks` means were produced or
--          no value lies above the last mean. NULL for an empty input; the
--          sorted input itself when it has fewer elements than `breaks`.

CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    arr_mean numeric;
    i INT := 2;
    reply numeric[];
BEGIN
    -- get the total size of our row
    -- (array_upper - array_lower is one short of the element count, which made
    -- an array with exactly `breaks` elements take the early return below;
    -- array_length is what CDB_JenksBins uses for the same check)
    element_count := array_length(in_array, 1);
    -- ensure the ordering of in_array
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
    -- stop if no rows
    IF element_count IS NULL THEN
        RETURN NULL;
    END IF;
    -- stop if our breaks are more than our input array size
    IF element_count < breaks THEN
        RETURN in_array;
    END IF;

    -- get our mean value
    SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;

    reply = Array[arr_mean];
    -- slice our bread
    LOOP
        EXIT WHEN i > breaks;
        -- mean of the "head": everything above the previous break
        SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1];
        -- nothing lies above the last break: no later iteration can add one
        EXIT WHEN arr_mean IS NULL;
        reply = array_append(reply, arr_mean);
        i := i+1;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

--
-- Determine the Jenks classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--                 bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find.
--
-- @param iterations The number of different starting positions to test.
--
-- @param invert Optional whether to return the top of each bin (default)
--               or the bottom. BOOLEAN, default=FALSE.
--
--
-- CDB_JenksBins: start from the quantile classification, refine it with
-- CDB_JenksBinsIteration, then retry from random starting class boundaries
-- and keep the result with the highest score (first element of the array
-- returned by CDB_JenksBinsIteration).
--
-- Returns: the break values of the best classification. NULL for an empty
-- input, the sorted input when it has fewer elements than `breaks`, and the
-- plain quantile breaks when the input has more than 5,000,000 elements.
--
-- The function calls setseed() before drawing random starting positions so
-- that repeated calls give the same result.


CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    arr_mean NUMERIC;
    bot INT;
    top INT;
    tops INT[];
    classes INT[][];
    i INT := 1; j INT := 1;
    curr_result NUMERIC[];
    best_result NUMERIC[];
    seedtarget TEXT;
    quant NUMERIC[];
    shuffles INT;
BEGIN
    -- get the total size of our row
    element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
    -- ensure the ordering of in_array
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
    -- stop if no rows
    IF element_count IS NULL THEN
        RETURN NULL;
    END IF;
    -- stop if our breaks are more than our input array size
    IF element_count < breaks THEN
        RETURN in_array;
    END IF;

    -- search budget handed to each CDB_JenksBinsIteration call as max_search:
    -- 2,500,000 / (elements * iterations), clamped to the range 1..750
    shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
    -- get our mean value
    SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x;

    -- assume best is actually Quantile
    SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;

    -- if data is very very large, just return quant and be done
    IF element_count > 5000000 THEN
        RETURN quant;
    END IF;

    -- change quant into bottom, top markers
    -- (each class becomes a [bot, top] pair of 1-based positions in in_array)
    -- NOTE(review): the exit test sits after the append and checks i > breaks,
    -- so the body runs breaks+1 times and appends one extra pair built from
    -- quant[breaks+1]; CDB_JenksBinsIteration only reads the first `breaks`
    -- pairs -- confirm the extra pair is harmless/intended.
    LOOP
        IF i = 1 THEN
            bot = 1;
        ELSE
            -- use last top to find this bot
            bot = top+1;
        END IF;
        IF i = breaks THEN
            top = element_count;
        ELSE
            SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
        END IF;
        IF i = 1 THEN
            classes = ARRAY[ARRAY[bot,top]];
        ELSE
            classes = ARRAY_CAT(classes,ARRAY[bot,top]);
        END IF;
        IF i > breaks THEN EXIT; END IF;
        i = i+1;
    END LOOP;

    best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

    --set the seed so we can ensure the same results
    SELECT setseed(0.4567) INTO seedtarget;
    --loop through random starting positions
    LOOP
        IF j > iterations-1 THEN  EXIT;  END IF;
        i = 1;
        tops = ARRAY[element_count];
        -- draw random, distinct upper positions (never position 1) until
        -- there are `breaks` of them; the last class always ends at
        -- element_count
        LOOP
            IF i = breaks THEN  EXIT;  END IF;
            SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
            i = array_length(tops, 1);
        END LOOP;
        i = 1;
        -- turn the upper positions into [bot, top] pairs
        LOOP
            IF i > breaks THEN  EXIT;  END IF;
            IF i = 1 THEN
                bot = 1;
            ELSE
                bot = top+1;
            END IF;
            top = tops[i];
            IF i = 1 THEN
                classes = ARRAY[ARRAY[bot,top]];
            ELSE
                classes = ARRAY_CAT(classes,ARRAY[bot,top]);
            END IF;
            i := i+1;
        END LOOP;
        curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

        -- element 1 of each result is its score
        IF curr_result[1] > best_result[1] THEN
            best_result = curr_result;
            j = j-1; -- if we found a better result, add one more search
        END IF;
        j = j+1;
    END LOOP;

    -- drop the leading score, keep only the break values
    RETURN (best_result)[2:array_upper(best_result, 1)];
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;



--
-- Perform a single iteration of the Jenks classification
--
-- @param in_array       sorted values
-- @param breaks         number of classes
-- @param classes        starting [bottom, top] position pair of each class
-- @param invert         selects which side of each pair is reported
--                       (see the NOTE(review) near the end of the body)
-- @param element_count  not referenced in the body
-- @param arr_mean       mean of in_array, used for the total deviation
-- @param max_search     maximum number of boundary moves to try
--
-- Returns: the score (total squared deviation from arr_mean minus the summed
-- within-class squared deviations) followed by one break value per class.
--

CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
    tmp_val numeric;
    new_classes int[][];
    tmp_class int[];
    i INT := 1;
    j INT := 1;
    side INT := 2;
    sdam numeric;
    gvf numeric := 0.0;
    new_gvf numeric;
    arr_gvf numeric[];
    class_avg numeric;
    class_max_i INT;
    class_min_i INT;
    class_max numeric;
    class_min numeric;
    reply numeric[];
BEGIN

    -- Calculate the sum of squared deviations from the array mean (SDAM).
    SELECT sum((arr_mean - e)^2) INTO sdam FROM (  SELECT unnest(in_array) as e ) x;
    --Identify the breaks for the lowest GVF
    LOOP
        i = 1;
        -- squared deviation of every class from its own mean
        LOOP
            -- get our mean
            SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
            -- find the deviation
            SELECT sum((class_avg-e)^2) INTO tmp_val FROM (   SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e  ) x;
            IF i = 1 THEN
                arr_gvf = ARRAY[tmp_val];
                -- init our min/max map for later
                class_max = arr_gvf[i];
                class_min = arr_gvf[i];
                class_min_i = 1;
                class_max_i = 1;
            ELSE
                arr_gvf = array_append(arr_gvf, tmp_val);
            END IF;
            i := i+1;
            IF i > breaks THEN EXIT; END IF;
        END LOOP;
        -- calculate our new GVF
        SELECT sdam-sum(e) INTO new_gvf FROM (  SELECT unnest(arr_gvf) as e  ) x;
        -- if no improvement was made, exit
        IF new_gvf < gvf THEN EXIT; END IF;
        gvf = new_gvf;
        IF j > max_search THEN EXIT; END IF;
        j = j+1;
        i = 1;
        -- find the classes with the smallest and the largest deviation
        LOOP
            --establish directionality (uppward through classes or downward)
            IF arr_gvf[i] < class_min THEN
                class_min = arr_gvf[i];
                class_min_i = i;
            END IF;
            IF arr_gvf[i] > class_max THEN
                class_max = arr_gvf[i];
                class_max_i = i;
            END IF;
            i := i+1;
            IF i > breaks THEN EXIT; END IF;
        END LOOP;
        -- from here on class_min_i is the neighbour of the worst class that
        -- lies in the direction of the best class
        IF class_max_i > class_min_i THEN
            class_min_i = class_max_i - 1;
        ELSE
            class_min_i = class_max_i + 1;
        END IF;
        -- shrink the worst class by one position and grow that neighbour
        --Move from higher class to a lower gid order
        IF class_max_i > class_min_i THEN
            classes[class_max_i][1] = classes[class_max_i][1] + 1;
            classes[class_min_i][2] = classes[class_min_i][2] + 1;
        ELSE -- Move from lower class UP into a higher class by gid
            classes[class_max_i][2] = classes[class_max_i][2] - 1;
            classes[class_min_i][1] = classes[class_min_i][1] - 1;
        END IF;
    END LOOP;

    i = 1;
    -- NOTE(review): side defaults to 2, the second ("top") element of each
    -- pair, and invert switches to element 1; the inline comment below states
    -- the opposite -- confirm which wording is intended.
    LOOP
        IF invert = TRUE THEN
            side = 1; --default returns bottom side of breaks, invert returns top side
        END IF;
        reply = array_append(reply, in_array[classes[i][side]]);
        i = i+1;
        IF i > breaks THEN  EXIT; END IF;
    END LOOP;

    -- score first, break values after it
    RETURN array_prepend(gvf, reply);

END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;


--
-- Determine the Quantile classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--                 bins based on the Quantile method.
--
-- @param breaks The number of bins you want to find.
--
-- Returns: `breaks` upper bin edges; the last one is the maximum value.
-- Declared STRICT, so a NULL argument yields NULL without running the body.
--
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    break_size numeric;
    tmp_val numeric;
    i INT := 1;
    reply numeric[];
BEGIN
    -- sort our values
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
    -- get the total size of our data
    element_count := array_length(in_array, 1);
    break_size :=  element_count::numeric / breaks;
    -- slice our bread
    LOOP
        IF i < breaks THEN
            -- fractional cut position: take the single element at it;
            -- whole cut position: average that element and the next one
            IF break_size * i % 1 > 0 THEN
                SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x;
            ELSE
                SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x;
            END IF;
        ELSIF i = breaks THEN
            -- select the last value
            SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x;
        ELSE
            EXIT;
        END IF;

        reply = array_append(reply, tmp_val);
        i := i+1;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;
diff --git a/release/crankshaft--0.7.0.sql b/release/crankshaft--0.7.0.sql
new file mode 100644
index 0000000..cc66ac6
--- /dev/null
+++ b/release/crankshaft--0.7.0.sql
@@ -0,0 +1,2165 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file.
\quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.7.0'::text; +$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT PARALLEL SAFE; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION + CDB_PyAggS(current_state Numeric[], current_row Numeric[]) + returns NUMERIC[] as $$ + BEGIN + if array_upper(current_state,1) is null then + RAISE NOTICE 'setting state %',array_upper(current_row,1); + current_state[1] = array_upper(current_row,1); + end if; + return array_cat(current_state,current_row) ; + END + $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + +-- Create aggregate if it did not exist +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT * + FROM pg_catalog.pg_proc p + LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace + WHERE n.nspname = 'cdb_crankshaft' + AND p.proname = 'cdb_pyagg' + AND p.proisagg) + THEN + CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( + SFUNC = CDB_PyAggS, + STYPE = Numeric[], + PARALLEL = SAFE, + INITCOND = "{}" + ); + END IF; +END +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + target NUMERIC[], + features NUMERIC[], + target_features NUMERIC[], + target_ids NUMERIC[], + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE 
PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC) +AS $$ + import numpy as np + import plpy + + from crankshaft.segmentation import create_and_predict_segment_agg + model_params = {'n_estimators': n_estimators, + 'max_depth': max_depth, + 'subsample': subsample, + 'learning_rate': learning_rate, + 'min_samples_leaf': min_samples_leaf} + + def unpack2D(data): + dimension = data.pop(0) + a = np.array(data, dtype=float) + return a.reshape(len(a)/dimension, dimension) + + return create_and_predict_segment_agg(np.array(target, dtype=float), + unpack2D(features), + unpack2D(target_features), + target_ids, + model_params) + +$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment ( + query TEXT, + variable_name TEXT, + target_table TEXT, + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) +AS $$ + from crankshaft.segmentation import create_and_predict_segment + model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} + return create_and_predict_segment(query,variable_name,target_table, model_params) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION CDB_Gravity( + IN target_query text, + IN weight_column text, + IN source_query text, + IN pop_column text, + IN target bigint, + IN radius integer, + IN minval numeric DEFAULT -10e307 + ) +RETURNS TABLE( + the_geom geometry, + source_id bigint, + target_id bigint, + dist numeric, + h numeric, + hpop numeric) AS $$ +DECLARE + t_id bigint[]; + t_geom geometry[]; + t_weight numeric[]; + s_id bigint[]; + s_geom geometry[]; + s_pop numeric[]; +BEGIN + 
EXECUTE 'WITH foo as('+target_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
+    EXECUTE 'WITH foo as('+source_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
+    RETURN QUERY
+    SELECT g.* FROM t, s, CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
+END;
+$$ language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Gravity model over pre-aggregated arrays.
+--
+-- t_id / t_geom / t_weight : parallel arrays describing the targets
+-- s_id / s_geom / s_pop    : parallel arrays describing the sources
+-- target                   : id of the target whose share is reported
+-- radius                   : ST_DWithin search distance on geography
+-- minval                   : only targets with weight > minval take part
+--
+-- For every source, each target within `radius` gets the attraction
+-- weight/distance (distance floored at 1.0). One row is returned per source
+-- that has `target` in range:
+--   h    = 100 * attraction(target) / sum of attractions for that source
+--   hpop = source population * the same share
+--   dist = distance to the target, reported as 0.0 when it was floored
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+    IN t_id bigint[],
+    IN t_geom geometry[],
+    IN t_weight numeric[],
+    IN s_id bigint[],
+    IN s_geom geometry[],
+    IN s_pop numeric[],
+    IN target bigint,
+    IN radius integer,
+    IN minval numeric DEFAULT -10e307
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    source_id bigint,
+    target_id bigint,
+    dist numeric,
+    h numeric,
+    hpop numeric) AS $$
+DECLARE
+    t_type text;
+    s_type text;
+    t_center geometry[];
+    s_center geometry[];
+BEGIN
+    -- the geometry type of the FIRST element decides whether centroids are needed
+    t_type := GeometryType(t_geom[1]);
+    s_type := GeometryType(s_geom[1]);
+    IF t_type = 'POINT' THEN
+        t_center := t_geom;
+    ELSE
+        WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
+    END IF;
+    IF s_type = 'POINT' THEN
+        s_center := s_geom;
+    ELSE
+        WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
+    END IF;
+    RETURN QUERY
+    with target0 as(
+        SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
+    ),
+    source0 as(
+        SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
+    ),
+    -- every (source, target-in-range) pair with its floored distance
+    prev0 as(
+        SELECT
+            source0.sg,
+            source0.sd as sourc_id,
+            coalesce(source0.sp,0) as sp,
+            target.td as targ_id,
+            coalesce(target.tw,0) as tw,
+            GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
+        FROM source0
+        CROSS JOIN LATERAL
+        (
+            SELECT
+                *
+            FROM target0
+            WHERE tw > minval
+            AND ST_DWithin(geography(source0.sc), geography(tc), radius)
+        ) AS target
+    ),
+    -- per-source normalisation term
+    deno as(
+        SELECT
+            sourc_id,
+            sum(tw/distance) as h_deno
+        FROM
+            prev0
+        GROUP BY sourc_id
+    )
+    SELECT
+        p.sg as the_geom,
+        p.sourc_id as source_id,
+        p.targ_id as target_id,
+        case when p.distance > 1 then p.distance else 0.0 end as dist,
+        100*(p.tw/p.distance)/d.h_deno as h,
+        p.sp*(p.tw/p.distance)/d.h_deno as hpop
+    FROM
+        prev0 p,
+        deno d
+    WHERE
+        p.targ_id = target AND
+        p.sourc_id = d.sourc_id;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- 0: nearest neighbor(s)
+-- 1: barycentric
+-- 2: IDW
+-- 3: kriging ---> TO DO
+
+
+-- Query-based entry point: runs `query`, which must expose the columns
+-- `the_geom` and `attrib`, aggregates them into arrays and delegates to the
+-- array-based overload below. `query` is spliced into dynamic SQL as-is.
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN query text,
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    output numeric;
+BEGIN
+    EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
+    -- The CTE "a" only exists inside the dynamic statement above, so the
+    -- delegation must not select FROM it.
+    SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output;
+
+    RETURN output;
+END;
+$$
+language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Interpolates the value at `point` from the samples (geomin[i], colin[i]).
+--
+-- method 0: average of the p1 nearest samples (p1 = 0 -> the closest one)
+-- method 1: barycentric interpolation inside the Delaunay triangle that
+--           contains `point`; returns -888.888 when no triangle contains it
+-- method 2: inverse distance weighting over the p1 nearest samples
+--           (p1 = 0 -> all samples) with decay order p2 (p2 = 0 -> 1)
+-- method 3: kriging, not implemented
+-- Any method that does not return earlier yields -777.777.
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    gs2 geometry[];
+    vs2 numeric[];
+    g geometry;
+    vertex geometry[];
+    sg numeric;
+    sa numeric;
+    sb numeric;
+    sc numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    output numeric;
+BEGIN
+    -- output := -999.999;
+
+    -- nearest neighbors
+    -- p1: limit the number of neighbors, 0-> closest one
+    IF method = 0 THEN
+
+        IF p1 = 0 THEN
+            p1 := 1;
+        END IF;
+
+        WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+        b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
+        SELECT avg(b.v) INTO output FROM b;
+        RETURN output;
+
+    -- barycentric
+    ELSIF method = 1 THEN
+        WITH a as (SELECT unnest(geomin) AS e),
+        b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+        c as (SELECT (ST_Dump(t)).geom as v FROM b),
+        d as (SELECT v FROM c WHERE ST_Within(point, v))
+        SELECT v INTO g FROM d;
+        IF g is null THEN
+            -- out of the realm of the input data
+            RETURN -888.888;
+        END IF;
+        -- vertex of the selected cell
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+        -- sg: area of the whole triangle; sa/sb/sc: area of the sub-triangle
+        -- opposite to vertex 1/2/3, i.e. the weight of that vertex
+        SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+
+        -- same fallback as cdb_crankshaft._interp_in_tin (the former
+        -- single-argument coalesce was a no-op)
+        output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1);
+        RETURN output;
+
+    -- IDW
+    -- p1: limit the number of neighbors, 0->no limit
+    -- p2: order of distance decay, 0-> order 1
+    ELSIF method = 2 THEN
+
+        IF p2 = 0 THEN
+            p2 := 1;
+        END IF;
+
+        -- samples sorted by distance to the query point
+        WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+        b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
+        SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
+        IF p1::integer>0 THEN
+            -- keep only the p1 nearest samples; a slice that runs past the
+            -- end of the array is silently clamped by PostgreSQL
+            gs2 := gs[1:(p1::integer)];
+            vs2 := vs[1:(p1::integer)];
+        ELSE
+            gs2:=gs;
+            vs2:=vs;
+        END IF;
+
+        -- A sample located exactly at the query point would make the weights
+        -- below divide by zero; IDW is exact at sample locations, so return
+        -- the value observed there.
+        WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v)
+        SELECT avg(a.v) INTO output FROM a WHERE ST_distance(point, a.g) = 0;
+        IF output IS NOT NULL THEN
+            RETURN output;
+        END IF;
+
+        WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
+        b as (
+            SELECT
+                (1/ST_distance(point, a.g)^p2::integer) as k,
+                (a.v/ST_distance(point, a.g)^p2::integer) as f
+            FROM a
+        )
+        SELECT sum(b.f)/sum(b.k) INTO output FROM b;
+        RETURN output;
+
+    -- kriging
+    ELSIF method = 3 THEN
+
+    -- TO DO
+
+    END IF;
+
+    RETURN -777.777;
+
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- =============================================================================================
+--
+-- CDB_Voronoi
+--
+-- =============================================================================================
+-- Builds Voronoi cells for the input points.
+--
+-- geomin    : array of input geometries, reprojected to EPSG:3857 first
+-- buffer    : factor applied to the radius of the circle whose area equals
+--             the convex hull; used to pad the envelope and the clipper
+-- tolerance : passed to ST_DelaunayTriangles and used as the distance
+--             threshold for the `ctx` flag
+--
+-- Returns a collection (EPSG:4326) holding the convex hull of each cell
+-- after clipping with the buffered minimum bounding circle.
+CREATE OR REPLACE FUNCTION CDB_voronoi(
+    IN geomin geometry[],
+    IN buffer numeric DEFAULT 0.5,
+    IN tolerance numeric DEFAULT 1e-9
+    )
+RETURNS geometry AS $$
+DECLARE
+    geomout geometry;
+BEGIN
+    -- we need to make the geometry calculations in (pseudo)meters!!!
+    with a as (
+        SELECT unnest(geomin) as g1
+    ),
+    b as(
+        SELECT st_transform(g1, 3857) g2 from a
+    )
+    SELECT array_agg(g2) INTO geomin from b;
+
+    WITH
+    -- hull of the samples and the padding radius derived from its area
+    convexhull_1 as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as g,
+            buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
+    ),
+    clipper as(
+        SELECT
+            st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g
+        FROM convexhull_1 a
+    ),
+    -- vertices of the expanded envelope, added to the sample as extra points
+    env0 as (
+        SELECT
+            (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
+        FROM convexhull_1 a
+    ),
+    env as (
+        SELECT
+            array_agg(env0.e) as e
+        FROM env0
+    ),
+    sample AS (
+        SELECT
+            ST_Collect(geomin || env.e) as geom
+        FROM env
+    ),
+    convexhull as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as cg
+    ),
+    tin as (
+        SELECT
+            ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
+        FROM
+            sample
+    ),
+    tin_polygons as (
+        SELECT
+            (gd).Path as id,
+            (gd).Geom as pg,
+            ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
+        FROM tin
+    ),
+    tin_lines as (
+        SELECT
+            id,
+            ST_ExteriorRing(pg) as lg
+        FROM tin_polygons
+    ),
+    tin_nodes as (
+        SELECT
+            id,
+            ST_PointN(lg,1) p1,
+            ST_PointN(lg,2) p2,
+            ST_PointN(lg,3) p3
+        FROM tin_lines
+    ),
+    -- one row per triangle edge, carrying the triangle's circle centre
+    tin_edges AS (
+        SELECT
+            p.id,
+            UNNEST(ARRAY[
+                ST_MakeLine(n.p1,n.p2) ,
+                ST_MakeLine(n.p2,n.p3) ,
+                ST_MakeLine(n.p3,n.p1)]) as Edge,
+            ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
+            CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
+                TRUE
+            ELSE FALSE END AS ctx,
+            p.pg,
+            ST_within(p.ct, convexhull.cg) as ctin
+        FROM
+            tin_polygons p,
+            tin_nodes n,
+            convexhull
+        WHERE p.id = n.id
+    ),
+    -- pair the centres of the two triangles sharing an edge; an edge with no
+    -- partner is extended outwards through the edge midpoint instead
+    voro_nodes as (
+        SELECT
+            CASE WHEN x.ctx = TRUE THEN
+                ST_Centroid(x.edge)
+            ELSE
+                x.ct
+            END as xct,
+            CASE WHEN y.id is null THEN
+                CASE WHEN x.ctin = TRUE THEN
+                    ST_SetSRID(ST_MakePoint(
+                        ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
+                        ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
+                    ), ST_SRID(x.ct))
+                END
+            ELSE
+                y.ct
+            END as yct
+        FROM
+            tin_edges x
+        LEFT OUTER JOIN
+            tin_edges y
+        ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
+    ),
+    voro_edges as(
+        SELECT
+            ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
+        FROM
+            voro_nodes
+    ),
+    voro_cells as(
+        SELECT
+            ST_Polygonize(
+                ST_Node(
+                    ST_LineMerge(
+                        ST_Union(v, ST_ExteriorRing(
+                            ST_Convexhull(v)
+                        )
+                        )
+                    )
+                )
+            ) as g
+        FROM
+            voro_edges
+    ),
+    voro_set as(
+        SELECT
+            (st_dump(v.g)).geom as g
+        FROM voro_cells v
+    ),
+    clipped_voro as(
+        SELECT
+            ST_intersection(c.g, v.g) as g
+        FROM
+            voro_set v,
+            clipper c
+        WHERE
+            ST_GeometryType(v.g) = 'ST_Polygon'
+    )
+    SELECT
+        st_collect(
+            ST_Transform(
+                ST_ConvexHull(g),
+                4326
+            )
+        )
+    INTO geomout
+    FROM
+        clipped_voro;
+    RETURN geomout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+/** ----------------------------------------------------------------------------------------
+ * @function : FindCircle
+ * @precis : Function that determines if three points form a circle. If so a table containing
+ * centre and radius is returned. If not, a null table is returned.
+ * @version : 1.0.1
+ * @param : p_pt1 : First point in curve
+ * @param : p_pt2 : Second point in curve
+ * @param : p_pt3 : Third point in curve
+ * @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
+ * or NULL if three points do not form a circle.
+ * @history : Simon Greener - Feb 2012 - Original coding.
+ * Rafa de la Torre - Aug 2016 - Small fix for type checking
+ * Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
+ * @copyright : Simon Greener @ 2012
+ * Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
+**/
+CREATE OR REPLACE FUNCTION _Find_Circle(
+    IN p_pt1 geometry,
+    IN p_pt2 geometry,
+    IN p_pt3 geometry)
+    RETURNS geometry AS
+$BODY$
+DECLARE
+    v_Centre geometry;
+    v_radius NUMERIC;
+    v_CX NUMERIC;
+    v_CY NUMERIC;
+    v_dA NUMERIC;
+    v_dB NUMERIC;
+    v_dC NUMERIC;
+    v_dD NUMERIC;
+    v_dE NUMERIC;
+    v_dF NUMERIC;
+    v_dG NUMERIC;
+BEGIN
+    -- the function is STRICT, so NULL arguments never reach this point
+    IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR
+         ST_GeometryType(p_pt2) <> 'ST_Point' OR
+         ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN
+        RAISE EXCEPTION 'All supplied geometries must be points.';
+        -- unreachable: RAISE EXCEPTION already aborts the function
+        RETURN NULL;
+    END IF;
+    -- v_dA..v_dD: coordinate differences of points 2 and 3 relative to point 1
+    v_dA := ST_X(p_pt2) - ST_X(p_pt1);
+    v_dB := ST_Y(p_pt2) - ST_Y(p_pt1);
+    v_dC := ST_X(p_pt3) - ST_X(p_pt1);
+    v_dD := ST_Y(p_pt3) - ST_Y(p_pt1);
+    v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2));
+    v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3));
+    v_dG := 2.0 * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2)));
+    -- If v_dG is zero then the three points are collinear and no finite-radius
+    -- circle through them exists.
+    IF ( v_dG = 0 ) THEN
+        RETURN NULL;
+    ELSE
+        v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
+        v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
+        v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) );
+    END IF;
+    -- centre in X/Y, radius in Z, SRID copied from the first point
+    RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1));
+END;
+$BODY$
+    LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
+
+-- Moran's I Global Measure (public-facing)
+-- Thin PL/Python wrapper: forwards every argument, in order, to
+-- crankshaft.clustering.Moran.global_stat and returns its result.
+CREATE OR REPLACE FUNCTION
+    CDB_AreasOfInterestGlobal(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, significance NUMERIC)
+AS $$
+    from crankshaft.clustering import Moran
+    # TODO: use named parameters or a dictionary
+    moran = Moran()
+    return moran.global_stat(subquery, column_name, w_type,
+                             num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function)
+-- Has no defaults; the public wrappers below supply them. Forwards every
+-- argument to crankshaft.clustering.Moran.local_stat.
+CREATE OR REPLACE FUNCTION
+    _CDB_AreasOfInterestLocal(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT,
+        num_ngbrs INT,
+        permutations INT,
+        geom_col TEXT,
+        id_col TEXT)
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    from crankshaft.clustering import Moran
+    moran = Moran()
+    # TODO: use named parameters or a dictionary
+    return moran.local_stat(subquery, column_name, w_type,
+                            num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (public-facing function)
+-- Returns every row produced by the internal function, unfiltered.
+CREATE OR REPLACE FUNCTION
+    CDB_AreasOfInterestLocal(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for HH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialHotspots(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('HH', 'HL');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LL and LH (public-facing function)
+-- Note: the value column parameter is named `attr` here, not `column_name`.
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialColdspots(
+        subquery TEXT,
+        attr TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('LL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialOutliers(
+        subquery TEXT,
+        attr TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Global Rate (public-facing function)
+-- Forwards to crankshaft.clustering.Moran.global_rate_stat. Unlike
+-- CDB_AreasOfInterestGlobal, the result columns are declared FLOAT.
+CREATE OR REPLACE FUNCTION
+    CDB_AreasOfInterestGlobalRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran FLOAT, significance FLOAT)
+AS $$
+    from crankshaft.clustering import Moran
+    moran = Moran()
+    # TODO: use named parameters or a dictionary
+    return moran.global_rate_stat(subquery, numerator, denominator, w_type,
+                                  num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local Rate (internal function)
+-- Has no defaults; forwards to crankshaft.clustering.Moran.local_rate_stat.
+CREATE OR REPLACE FUNCTION
+    _CDB_AreasOfInterestLocalRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT,
+        num_ngbrs INT,
+        permutations INT,
+        geom_col TEXT,
+        id_col TEXT)
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    from crankshaft.clustering import Moran
+    moran = Moran()
+    # TODO: use named parameters or a dictionary
+    return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate (public-facing function)
+-- Returns every row produced by the internal rate function, unfiltered.
+CREATE OR REPLACE FUNCTION
+    CDB_AreasOfInterestLocalRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for HH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialHotspotsRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('HH', 'HL');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LL and LH (public-facing function)
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialColdspotsRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('LL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+    CDB_GetSpatialOutliersRate(
+        subquery TEXT,
+        numerator TEXT,
+        denominator TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+    SELECT moran, quads, significance, rowid, vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+    WHERE quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+-- Spatial k-means clustering
+-- Forwards (query, no_clusters, no_init) to
+-- crankshaft.clustering.Kmeans.spatial.
+
+CREATE OR REPLACE FUNCTION CDB_KMeans(
+    query TEXT,
+    no_clusters INTEGER,
+    no_init INTEGER DEFAULT 20
+)
+RETURNS TABLE(
+    cartodb_id INTEGER,
+    cluster_no INTEGER
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.spatial(query, no_clusters, no_init)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Non-spatial k-means clustering
+-- query: sql query to retrieve all the needed data
+-- colnames: text array of column names for doing the clustering analysis
+-- no_clusters: number of requested clusters
+-- standardize: whether to scale variables to a mean of zero and a standard
+-- deviation of 1
+-- id_col: name of the id column
+
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
+    query TEXT,
+    colnames TEXT[],
+    no_clusters INTEGER,
+    standardize BOOLEAN DEFAULT true,
+    id_col TEXT DEFAULT 'cartodb_id'
+)
+RETURNS TABLE(
+    cluster_label text,
+    cluster_center json,
+    silhouettes numeric,
+    inertia numeric,
+    rowid bigint
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.nonspatial(query, colnames, no_clusters,
+                         standardize=standardize,
+                         id_col=id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- State transition function of the CDB_WeightedMean aggregate.
+-- state holds {sum(x*weight), sum(y*weight), sum(weight)}; rows with a NULL
+-- weight or geometry leave the state unchanged.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
+    state NUMERIC[],
+    the_geom GEOMETRY(Point, 4326),
+    weight NUMERIC
+)
+RETURNS Numeric[] AS $$
+DECLARE
+    newX NUMERIC;
+    newY NUMERIC;
+    newW NUMERIC;
+BEGIN
+    IF weight IS NULL OR the_geom IS NULL THEN
+        newX = state[1];
+        newY = state[2];
+        newW = state[3];
+    ELSE
+        newX = state[1] + ST_X(the_geom)*weight;
+        newY = state[2] + ST_Y(the_geom)*weight;
+        newW = state[3] + weight;
+    END IF;
+    RETURN Array[newX,newY,newW];
+
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+-- Final function of the CDB_WeightedMean aggregate: divides the weighted
+-- coordinate sums by the total weight. When the total weight is 0 the raw
+-- sums are used as coordinates, which avoids a division by zero.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
+RETURNS GEOMETRY AS
+$$
+BEGIN
+    IF state[3] = 0 THEN
+        RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
+    ELSE
+        RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
+    END IF;
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create aggregate if it did not exist
+-- NOTE(review): pg_proc.proisagg was replaced by prokind in PostgreSQL 11;
+-- confirm the supported server versions before relying on this check.
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT *
+        FROM pg_catalog.pg_proc p
+            LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
+        WHERE n.nspname = 'cdb_crankshaft'
+            AND p.proname = 'cdb_weightedmean'
+            AND p.proisagg)
+    THEN
+        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
+            SFUNC = CDB_WeightedMeanS,
+            FINALFUNC = CDB_WeightedMeanF,
+            STYPE = Numeric[],
+            PARALLEL = SAFE,
+            INITCOND = "{0.0,0.0,0.0}"
+        );
+    END IF;
+END
+$$ LANGUAGE plpgsql;
+-- Spatial Markov
+
+-- input table format:
+-- id | geom | date_1 | date_2 | date_3
+-- 1 | Pt1 | 12.3 | 13.1 | 14.2
+-- 2 | Pt2 | 11.0 | 13.2 | 12.5
+-- ...
+-- Sample Function call:
+-- SELECT CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
+-- Array['date_1', 'date_2', 'date_3'])
+
+-- Forwards every argument, in order, to
+-- crankshaft.space_time_dynamics.Markov.spatial_trend.
+CREATE OR REPLACE FUNCTION
+    CDB_SpatialMarkovTrend (
+        subquery TEXT,
+        time_cols TEXT[],
+        num_classes INT DEFAULT 7,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 99,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
+AS $$
+
+    from crankshaft.space_time_dynamics import Markov
+    markov = Markov()
+
+    ## TODO: use named parameters or a dictionary
+    return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- input table format: identical to above but in a predictable format
+-- Sample function call:
+-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
+-- 'date_1')
+
+
+-- CREATE OR REPLACE
FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col_min text, +-- time_col_max text, +-- date_format text, -- '_YYYY_MM_DD' +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- +-- -- input table format: +-- -- id | geom | date | measurement +-- -- 1 | Pt1 | 12/3 | 13.2 +-- -- 2 | Pt2 | 11/5 | 11.3 +-- -- 3 | Pt1 | 11/13 | 12.9 +-- -- 4 | Pt3 | 12/19 | 10.1 +-- -- ... +-- +-- CREATE OR REPLACE FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col text, +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: https://github.com/CartoDB/cartodb-postgresql + +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: 
https://github.com/CartoDB/cartodb-postgresql
+
+-- Pole of inaccessibility: approximates the interior point of `polygon`
+-- that lies farthest from its boundary.
+--
+-- polygon   : input geometry; reprojected to EPSG:3857 for the search
+-- tolerance : a cell is subdivided only while it could still improve the
+--             best distance found so far by more than this amount
+--
+-- Returns the centroid of the best candidate in EPSG:4326. The first
+-- candidate is the polygon itself, so the polygon centroid is returned when
+-- no grid cell beats it.
+CREATE OR REPLACE FUNCTION CDB_PIA(
+    IN polygon geometry,
+    IN tolerance numeric DEFAULT 1.0
+    )
+RETURNS geometry AS $$
+DECLARE
+    cells geometry[];
+    cell geometry;
+    best_c geometry;
+    best_d numeric;
+    test_d numeric;
+    test_mx numeric;
+    test_h numeric;
+    test_cells geometry[];
+    width numeric;
+    height numeric;
+    h numeric;
+    i integer;
+    n integer;
+    sqr numeric;
+BEGIN
+    -- sqrt(2)/2: times the cell side, this is the centre-to-corner distance
+    sqr := 0.5*(|/2.0);
+    polygon := ST_Transform(polygon, 3857);
+
+    -- grid #0 cell size
+    height := ST_YMax(polygon) - ST_YMin(polygon);
+    width := ST_XMax(polygon) - ST_XMin(polygon);
+    h := 0.5*LEAST(height, width);
+
+    -- grid #0
+    with c1 as(
+        SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
+    )
+    SELECT array_agg(c) INTO cells FROM c1;
+
+    -- 1st guess: centroid
+    best_c := polygon;
+    best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));
+
+    -- looping the loop
+    -- array_length() is NULL for a NULL/empty array and "EXIT WHEN i > NULL"
+    -- never fires, so an empty grid must count as zero cells or the loop
+    -- below would never terminate.
+    n := coalesce(array_length(cells,1), 0);
+    i := 1;
+    LOOP
+
+        EXIT WHEN i > n;
+
+        cell := cells[i];
+
+        i := i+1;
+
+        -- cell side size, it's square
+        test_h := ST_XMax(cell) - ST_XMin(cell) ;
+
+        -- check distance
+        test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
+
+        IF test_d > best_d THEN
+            best_d := test_d;
+            best_c := cell;
+        END IF;
+
+        -- longest distance within the cell
+        test_mx := test_d + (test_h * sqr);
+
+        -- if the cell has no chance to contain the desired point, continue
+        CONTINUE WHEN test_mx - best_d <= tolerance;
+
+        -- resample the cell
+        with c1 as(
+            SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
+        )
+        SELECT array_agg(c) INTO test_cells FROM c1;
+
+        -- concat the new cells to the former array
+        cells := cells || test_cells;
+
+        -- prepare next iteration
+        n := coalesce(array_length(cells,1), 0);
+
+    END LOOP;
+
+    RETURN ST_transform(ST_Centroid(best_c), 4326);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+
+-- signed distance point to polygon with holes
+-- negative
if the point is outside the polygon
+-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
+-- The input is dumped into its component geometries; the component whose
+-- exterior or interior rings come closest to the point decides both the
+-- magnitude and the sign of the result.
+CREATE OR REPLACE FUNCTION _Signed_Dist(
+    IN polygon geometry,
+    IN point geometry
+    )
+RETURNS numeric AS $$
+DECLARE
+    pols geometry[];
+    pol geometry;
+    i integer;
+    j integer;
+    within integer;
+    w integer;
+    holes integer;
+    dist numeric;
+    d numeric;
+BEGIN
+    -- start from a huge distance so the first component always wins
+    dist := 1e999;
+    WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
+    FOR j in 1..array_length(pols, 1)
+    LOOP
+        pol := pols[j];
+        d := dist;
+        -- distance to the outer ring, capped by the best distance so far
+        SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
+        -- sign for this component: +1 inside, -1 otherwise
+        SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
+        SELECT ST_NumInteriorRings(pol) INTO holes;
+        IF holes > 0 THEN
+            -- a hole boundary may be closer than the outer ring
+            FOR i IN 1..holes
+            LOOP
+                SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
+            END LOOP;
+        END IF;
+        IF d < dist THEN
+            dist:= d;
+            within := w;
+        END IF;
+    END LOOP;
+    dist := dist * within::numeric;
+    RETURN dist;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Iterative densification of a set of points using Delaunay triangulation
+-- the new points have as assigned value the average value of the 3 vertex (centroid)
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values in that points
+--
+-- @param iterations - integer, number of iterations
+--
+--
+-- Returns: TABLE(geomout geometry, colout numeric)
+--
+--
+CREATE OR REPLACE FUNCTION CDB_Densify(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    geotemp geometry[];
+    coltemp numeric[];
+    i integer;
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    va numeric;
+    vb numeric;
+    vc numeric;
+    center geometry;
+    centerval numeric;
+    tmp integer;
+BEGIN
+    -- working copies that grow by one point/value per triangle per iteration
+    geotemp := geomin;
+    coltemp := colin;
+    FOR i IN 1..iterations
+    LOOP
+        -- generate TIN
+        WITH a as (SELECT unnest(geotemp) AS e),
+        b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+        c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+        SELECT array_agg(v) INTO gs FROM c;
+        -- loop cells
+        FOREACH g IN ARRAY gs
+        LOOP
+            -- append centroid
+            SELECT ST_Centroid(g) INTO center;
+            geotemp := array_append(geotemp, center);
+            -- retrieve the value of each vertex
+            -- NOTE(review): geotemp is one element longer than coltemp at
+            -- this point, so the two unnest() calls below are not the same
+            -- length; confirm the pairing on the target server version.
+            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+            SELECT array_agg(v) INTO vertex FROM a;
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+            -- calc the value at the center
+            centerval := (va + vb + vc) / 3;
+            -- append the value
+            coltemp := array_append(coltemp, centerval);
+        END LOOP;
+    END LOOP;
+    RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Densifies the points with CDB_Densify, triangulates the result and
+-- returns one row per triangle: the triangle geometry and the mean of the
+-- values found at its three vertices.
+CREATE OR REPLACE FUNCTION CDB_TINmap(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    p geometry[];
+    vals numeric[];
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    centerval numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    coltemp numeric[];
+BEGIN
+    SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
+    WITH a as (SELECT unnest(p) AS e),
+    b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+    c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+    SELECT array_agg(v) INTO gs FROM c;
+    FOREACH g IN ARRAY gs
+    LOOP
+        -- retrieve the vertex of each triangle
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+        -- calc the value at the center
+        centerval := (va + vb + vc) / 3;
+        -- append the value
+        coltemp := array_append(coltemp, centerval);
+    END LOOP;
+    RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Getis-Ord's G
+-- Hotspot/Coldspot Analysis tool
+-- Forwards every argument, in order, to
+-- crankshaft.clustering.Getis.getis_ord.
+CREATE OR REPLACE FUNCTION
+    CDB_GetisOrdsG(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 999,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
+AS $$
+    from crankshaft.clustering import Getis
+    getis = Getis()
+    return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- TODO: make a version that accepts the values as arrays
+
+-- Find outliers using a static threshold
+-- Returns true when column_value is strictly greater than threshold.
+--
+CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
+RETURNS boolean
+AS $$
+BEGIN
+
+    RETURN column_value > threshold;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
+
+-- Find outliers by a percentage above the threshold
+-- TODO: add symmetric option?
`is_symmetric boolean DEFAULT false`
+
+-- A value is flagged when value / mean(column_values) is greater than
+-- outlier_fraction. Flags are returned positionally next to `ids`.
+-- Raises an exception when the mean is zero.
+CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    avg_val numeric;
+    out_vals boolean[];
+BEGIN
+
+    SELECT avg(i) INTO avg_val
+      FROM unnest(column_values) As x(i);
+
+    IF avg_val = 0 THEN
+        RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
+    END IF;
+
+    SELECT array_agg(
+             outlier_fraction < i / avg_val) INTO out_vals
+      FROM unnest(column_values) As x(i);
+
+    RETURN QUERY
+    SELECT unnest(out_vals) As is_outlier,
+           unnest(ids) As rowid;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Find outliers above a given number of standard deviations from the mean
+-- With is_symmetric the absolute deviation is tested, so values far below
+-- the mean are flagged too; otherwise only values above the mean can be.
+-- Raises an exception when the standard deviation is zero.
+
+CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    stddev_val numeric;
+    avg_val numeric;
+    out_vals boolean[];
+BEGIN
+
+    SELECT stddev(i), avg(i) INTO stddev_val, avg_val
+      FROM unnest(column_values) As x(i);
+
+    IF stddev_val = 0 THEN
+        RAISE EXCEPTION 'Standard deviation of input data is zero';
+    END IF;
+
+    IF is_symmetric THEN
+        SELECT array_agg(
+                 abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
+          FROM unnest(column_values) As x(i);
+    ELSE
+        SELECT array_agg(
+                 (i - avg_val) / stddev_val > num_deviations) INTO out_vals
+          FROM unnest(column_values) As x(i);
+    END IF;
+
+    RETURN QUERY
+    SELECT unnest(out_vals) As is_outlier,
+           unnest(ids) As rowid;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+-- Interpolates the samples (geomin[i], colin[i]) over a rectangular grid
+-- covering their buffered convex hull, classifies the cell values into
+-- `steps` bins and returns one unioned geometry per bin with its min, max
+-- and average value.
+--
+-- buffer      : factor applied to the radius of the circle whose area
+--               equals the convex hull; the envelope grows by that amount
+-- intmethod   : 1 uses a TIN built once here plus _interp_in_tin; any other
+--               value is passed as `method` to CDB_SpatialInterpolation
+-- classmethod : 0 equal interval, 1 heads/tails, 2 Jenks, otherwise quantile
+-- max_time    : a positive value is used directly as the grid cell size;
+--               zero (treated as -90) or a negative value derives the cell
+--               size from an estimated cell count instead
+CREATE OR REPLACE FUNCTION CDB_Contour(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN buffer numeric,
+    IN intmethod integer,
+    IN classmethod integer,
+    IN steps integer,
+    IN max_time integer DEFAULT 60000
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    bin integer,
+    min_value numeric,
+    max_value numeric,
+    avg_value numeric
+) AS $$
+DECLARE
+    cell_count integer;
+    tin geometry[];
+    resolution integer;
+BEGIN
+
+    -- nasty trick to override issue #121
+    IF max_time = 0 THEN
+        max_time = -90;
+    END IF;
+    -- resolution keeps the caller's value; max_time becomes its negation
+    resolution := max_time;
+    max_time := -1 * resolution;
+
+    -- calc the optimal number of cells for the current dataset
+    SELECT
+        CASE intmethod
+            WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
+            WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
+            WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
+            ELSE 10000
+        END INTO cell_count;
+
+    -- we don't have iterative barycentric interpolation in CDB_interpolation,
+    -- and it's a costly function, so let's make a custom one here till
+    -- we update the code
+    -- tin := ARRAY[]::geometry[];
+    IF intmethod=1 THEN
+        WITH
+            a as (SELECT unnest(geomin) AS e),
+            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+            c as (SELECT (ST_Dump(t)).geom as v FROM b)
+        SELECT array_agg(v) INTO tin FROM c;
+    END IF;
+    -- Delaunay stuff performed just ONCE!!
+
+    -- magic
+    RETURN QUERY
+    WITH
+    convexhull as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as g,
+            -- the prefix operator |/ binds looser than "/", so this is
+            -- buffer * sqrt(area / pi)
+            buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
+    ),
+    envelope as (
+        SELECT
+            st_expand(a.g, a.r) as e
+        FROM convexhull a
+    ),
+    envelope3857 as(
+        SELECT
+            ST_Transform(e, 3857) as geom
+        FROM envelope
+    ),
+    -- cell side: square root of area per estimated cell, or the caller's value
+    resolution as(
+        SELECT
+            CASE WHEN resolution <= 0 THEN
+                round(|/ (
+                    ST_area(geom) / abs(cell_count)
+                ))
+            ELSE
+                resolution
+            END AS cell
+        FROM envelope3857
+    ),
+    grid as(
+        SELECT
+            ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
+        FROM envelope3857 e, resolution r
+    ),
+    -- value at the centroid of every grid cell
+    interp as(
+        SELECT
+            geom,
+            CASE
+                WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
+                ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
+            END as val
+        FROM grid
+    ),
+    classes as(
+        SELECT CASE
+            WHEN classmethod = 0 THEN
+                cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
+            WHEN classmethod = 1 THEN
+                cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
+            WHEN classmethod = 2 THEN
+                cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
+            ELSE
+                cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
+            END as b
+        FROM interp
+        where val is not null
+    ),
+    classified as(
+        SELECT
+            i.*,
+            width_bucket(i.val, c.b) as bucket
+        FROM interp i left join classes c
+        ON 1=1
+    ),
+    -- fold the bucket numbered `steps` into the one below it
+    classified2 as(
+        SELECT
+            geom,
+            val,
+            CASE
+                WHEN bucket = steps THEN bucket - 1
+                ELSE bucket
+            END as b
+        FROM classified
+    ),
+    final as(
+        SELECT
+            st_union(geom) as the_geom,
+            b as bin,
+            min(val) as min_value,
+            max(val) as max_value,
+            avg(val) as avg_value
+        FROM classified2
+        GROUP BY bin
+    )
+    SELECT
+        *
+    FROM final
+    where final.bin is not null
+    ;
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+-- =====================================================================
+-- Interp in grid, so we can use barycentric with a
precalculated tin (NNI)
+-- =====================================================================
+--
+-- Estimate a value at `point` from the triangle of `tin` that contains it.
+--
+-- @param geomin  Geometries of the sample points; matched against the
+--                triangle vertices with ST_Equals.
+-- @param colin   Values of the sample points, unnested in parallel with
+--                `geomin` (so position i of one belongs to position i of
+--                the other).
+-- @param tin     Precalculated triangles (CDB_Contour builds them once with
+--                ST_DelaunayTriangles and passes them in).
+-- @param point   Location to estimate.
+--
+-- Returns: the area-weighted combination of the three vertex values, or
+--          NULL when no triangle of `tin` contains `point`.
+--
+CREATE OR REPLACE FUNCTION _interp_in_tin(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN tin geometry[],
+    IN point geometry
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    g geometry;         -- triangle containing `point`
+    vertex geometry[];  -- points dumped from `g`
+    sg numeric;         -- area of the whole triangle
+    sa numeric;         -- area of the sub-triangle opposite vertex[1]
+    sb numeric;         -- area of the sub-triangle opposite vertex[2]
+    sc numeric;         -- area of the sub-triangle opposite vertex[3]
+    va numeric;         -- value at vertex[1]
+    vb numeric;         -- value at vertex[2]
+    vc numeric;         -- value at vertex[3]
+    output numeric;
+BEGIN
+    -- get the cell the point is within
+    WITH
+    a as (SELECT unnest(tin) as v),
+    b as (SELECT v FROM a WHERE ST_Within(point, v))
+    SELECT v INTO g FROM b;
+
+    -- if we're out of the data realm,
+    -- return null
+    IF g is null THEN
+        RETURN null;
+    END IF;
+
+    -- vertex of the selected cell
+    -- (only elements 1..3 of the dumped points are used below)
+    WITH a AS (
+        SELECT (ST_DumpPoints(g)).geom AS v
+    )
+    SELECT array_agg(v) INTO vertex FROM a;
+
+    -- retrieve the value of each vertex
+    -- (a vertex with no equal geometry in geomin leaves its value NULL,
+    --  which the final formula treats as 0)
+    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+    SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+
+    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+    SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+
+    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+    SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+    -- calc the areas
+    -- each sub-triangle is built from `point` and the two vertices other
+    -- than the one it weights
+    SELECT
+        ST_area(g),
+        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))),
+        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))),
+        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+
+    -- weighted sum of the vertex values, normalised by the triangle area;
+    -- NULL areas/values count as 0 and a NULL total area as 1
+    output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1);
+    RETURN output;
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Function by Stuart Lynn for a simple interpolation of a value
+--  from a polygon table over an arbitrary polygon
+--  (weighted by the area proportion overlapped)
+-- Areal weighting is a very simple form of areal interpolation.
+--
+-- Parameters:
+--   * geom a Polygon geometry which defines the area where a value will be
+--     estimated as the area-weighted sum of a given table/column
+--   * target_table_name table name of the table that provides the values
+--   * target_column column name of the column that provides the values
+--   * schema_name optional parameter to define the schema the target table
+--     belongs to, which is necessary if it is not in the search_path.
+--     Note that target_table_name should never include the schema in it.
+-- Return value:
+--   Areal-weighted interpolation of the column values over the geometry
+CREATE OR REPLACE
+FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
+  RETURNS numeric AS
+$$
+DECLARE
+  result numeric;
+  qualified_name text;
+BEGIN
+  -- Both parts of the name are quoted as identifiers (%I) so a crafted
+  -- table name cannot inject SQL into the dynamic query below; the
+  -- schema-less branch already quoted the table name this way.
+  IF schema_name IS NULL THEN
+    qualified_name := Format('%I', target_table_name);
+  ELSE
+    qualified_name := Format('%I.%I', schema_name, target_table_name);
+  END IF;
+  -- qualified_name is already quoted, hence %s; the geometry is bound as $1
+  EXECUTE Format('
+    SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
+    FROM %s AS a
+    WHERE $1 && a.the_geom
+  ', target_column, qualified_name)
+  USING geom
+  INTO result;
+  RETURN result;
+END;
+$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
+CREATE OR REPLACE FUNCTION
+CDB_GWR(subquery text, dep_var text, ind_vars text[],
+        bw numeric default null, fixed boolean default False,
+        kernel text default 'bisquare', geom_col text default 'the_geom',
+        id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              filtered_t_vals JSON, predicted numeric,
+              residuals numeric, r_squared numeric, bandwidth numeric,
+              rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+
+gwr = GWR()
+
+return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+CREATE OR REPLACE FUNCTION
+CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
+                bw numeric default null, fixed boolean default False,
+                kernel text default 'bisquare',
+                geom_col text default 'the_geom',
+                id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              r_squared numeric, predicted numeric, rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+gwr = GWR()
+
+return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+--
+-- Creates N points randomly distributed inside the polygon
+--
+-- @param geom - the geometry to be turned into points
+--
+-- @param no_points - the number of points to generate; zero or a negative
+-- number generates nothing and returns NULL
+--
+-- @param max_iter_per_point - the function draws a random horizontal scan
+-- line across the polygon's bounding box and places a point on the part of
+-- it that lies inside the polygon. max_iter_per_point specifies how many
+-- consecutive misses (scan lines yielding no point) the function accepts
+-- before giving up and returning the points generated so far.
+--
+-- Returns: Multipoint with the requested points
+CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
+RETURNS GEOMETRY AS $$
+DECLARE
+  extent GEOMETRY;
+  test_point Geometry;
+  width                NUMERIC;
+  height               NUMERIC;
+  x0                   NUMERIC;
+  y0                   NUMERIC;
+  yp                   NUMERIC;
+  srid                 INTEGER;
+  no_left              INTEGER;
+  misses               INTEGER := 0;
+  points               GEOMETRY[];
+  bbox_line            GEOMETRY;
+  intersection_line    GEOMETRY;
+BEGIN
+  extent  := ST_Envelope(geom);
+  width   := ST_XMax(extent) - ST_XMIN(extent);
+  height  := ST_YMax(extent) - ST_YMIN(extent);
+  x0      := ST_XMin(extent);
+  y0      := ST_YMin(extent);
+  -- build the scan line in the geometry's own SRID instead of assuming 4326,
+  -- otherwise ST_Intersection rejects the mixed-SRID operands
+  srid    := ST_SRID(geom);
+  no_left := no_points;
+
+  -- "> 0" rather than "= 0" so a negative request cannot loop forever
+  WHILE no_left > 0 LOOP
+    yp = y0 + height*random();
+    -- horizontal line at y = yp across the bounding box: ST_MakePoint takes
+    -- (x, y), so x0 / x0+width go first and yp second
+    bbox_line = ST_MakeLine(
+      ST_SetSRID(ST_MakePoint(x0, yp), srid),
+      ST_SetSRID(ST_MakePoint(x0+width, yp), srid)
+    );
+    intersection_line = ST_Intersection(bbox_line,geom);
+    test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
+    IF test_point IS NULL THEN
+      -- the scan line produced no usable point: retry, but only up to
+      -- max_iter_per_point consecutive times
+      misses := misses + 1;
+      EXIT WHEN misses >= coalesce(max_iter_per_point, 1000);
+    ELSE
+      points := points || test_point;
+      no_left := no_left - 1;
+      misses := 0;
+    END IF;
+  END LOOP;
+  RETURN ST_Collect(points);
+END;
+$$
+LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
+-- Make sure by default there are no permissions for publicuser
+-- NOTE: this happens at extension creation time, as part of an implicit transaction.
+-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
+
+-- Grant permissions on the schema to publicuser (but just the schema)
+GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
+
+-- Revoke execute permissions on all functions in the schema by default
+-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
+--
+-- Fill given extent with a rectangular coverage
+--
+-- @param ext Extent to fill. Only rectangles with center point falling
+--            inside the extent (or at the lower or leftmost edge) will
+--            be emitted. The returned rectangles will have the same SRID
+--            as this extent.
+--
+-- @param width Width of each rectangle
+--
+-- @param height Height of each rectangle
+--
+-- @param origin Optional origin to allow for exact tiling.
+--               If omitted the origin will be 0,0.
+--               The parameter is checked for having the same SRID
+--               as the extent.
+--
+-- Returns: one rectangle per grid cell (SETOF GEOMETRY), emitted column by
+--          column; each cell is centred on a grid node (x, y) and built
+--          with ST_MakeEnvelope in the SRID of `ext`.
+--
+CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
+RETURNS SETOF GEOMETRY
+AS $$
+DECLARE
+  h GEOMETRY; -- rectangle cell
+  hstep FLOAT8; -- horizontal step
+  vstep FLOAT8; -- vertical step
+  hw FLOAT8; -- half width
+  hh FLOAT8; -- half height
+  vstart FLOAT8; -- y of the first row of cell centres
+  hstart FLOAT8; -- x of the first column of cell centres
+  hend FLOAT8; -- right edge of the extent (exclusive bound for x)
+  vend FLOAT8; -- top edge of the extent (exclusive bound for y)
+  xoff FLOAT8; -- x of the tiling origin
+  yoff FLOAT8; -- y of the tiling origin
+  xgrd FLOAT8; -- x snapping unit (half a cell width)
+  ygrd FLOAT8; -- y snapping unit (half a cell height)
+  x FLOAT8; -- current cell centre, x
+  y FLOAT8; -- current cell centre, y
+  srid INTEGER;
+BEGIN
+
+  srid := ST_SRID(ext);
+
+  -- default tiling origin is 0,0
+  xoff := 0;
+  yoff := 0;
+
+  IF origin IS NOT NULL THEN
+    IF ST_SRID(origin) != srid THEN
+      RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
+    END IF;
+    xoff := ST_X(origin);
+    yoff := ST_Y(origin);
+  END IF;
+
+  --RAISE DEBUG 'X offset: %', xoff;
+  --RAISE DEBUG 'Y offset: %', yoff;
+
+  hw := width/2.0;
+  hh := height/2.0;
+
+  xgrd := hw;
+  ygrd := hh;
+  --RAISE DEBUG 'X grid size: %', xgrd;
+  --RAISE DEBUG 'Y grid size: %', ygrd;
+
+  hstep := width;
+  vstep := height;
+
+  -- Tweak horizontal start on hstep grid from origin
+  -- (ceil: first grid node at or to the right of the extent's left edge)
+  hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
+  --RAISE DEBUG 'hstart: %', hstart;
+
+  -- Tweak vertical start on vstep grid from origin
+  -- (ceil: first grid node at or above the extent's bottom edge)
+  vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
+  --RAISE DEBUG 'vstart: %', vstart;
+
+  hend := ST_XMax(ext);
+  vend := ST_YMax(ext);
+
+  --RAISE DEBUG 'hend: %', hend;
+  --RAISE DEBUG 'vend: %', vend;
+
+  x := hstart;
+  WHILE x < hend LOOP -- over X
+    y := vstart;
+    -- first cell of the column, centred on (x, vstart)
+    h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
+    WHILE y < vend LOOP -- over Y
+      RETURN NEXT h;
+      -- next cell of the column is the previous one shifted up one step
+      h := ST_Translate(h, 0, vstep);
+      -- snaps the new y to a multiple of ygrd (half a cell) from the origin;
+      -- presumably to stop floating-point error accumulating - TODO confirm
+      y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
+    END LOOP;
+    x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
+  END LOOP;
+
+  RETURN;
+END
+$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
+
+--
+-- Calculate the equal interval bins for a given column
+--
+-- @param in_array A numeric array of numbers to
determine the best
+--                 bin boundaries for (NULL elements are ignored)
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+-- Returns: upper edges of bins; the last edge is always the maximum value
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
+DECLARE
+    lo numeric;         -- smallest non-NULL value
+    hi numeric;         -- largest non-NULL value
+    bin_width numeric;  -- width shared by every bin
+    edges numeric[];    -- upper edges collected so far
+    k INT := 1;
+BEGIN
+    SELECT min(v), max(v) INTO lo, hi
+    FROM unnest(in_array) AS t(v)
+    WHERE v IS NOT NULL;
+
+    bin_width := (hi - lo) / breaks::numeric;
+
+    -- the first breaks-1 edges are evenly spaced above the minimum
+    WHILE k < breaks LOOP
+        edges := array_append(edges, lo + k::numeric * bin_width);
+        k := k + 1;
+    END LOOP;
+
+    -- the top edge is the maximum itself rather than lo + breaks * bin_width
+    RETURN array_append(edges, hi);
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Heads/Tails classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--                 bins based on the Heads/Tails method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- Returns: the mean of all values followed by, for each further break, the
+--          mean of the values above the previous entry (so fewer than
+--          `breaks` entries when the tail runs out); the sorted input when
+--          it is too short; NULL when the input is empty or NULL
+--
+
+CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+    span INT4;          -- upper bound minus lower bound of the input array
+    head_mean numeric;  -- mean of the current "head"
+    step INT := 2;      -- index of the break being computed
+    cuts numeric[];     -- break values found so far
+BEGIN
+    -- NOTE(review): this is the element count minus one, not the count
+    -- itself (CDB_JenksBins uses array_length for the same purpose)
+    span := array_upper(in_array, 1) - array_lower(in_array, 1);
+
+    -- no rows: nothing to classify
+    IF span IS NULL THEN
+        RETURN NULL;
+    END IF;
+
+    -- work on the values in ascending order
+    SELECT array_agg(v ORDER BY v) INTO in_array FROM unnest(in_array) AS t(v);
+
+    -- too few values for the requested breaks: hand back the sorted input
+    IF span < breaks THEN
+        RETURN in_array;
+    END IF;
+
+    -- the first break is the mean of everything
+    SELECT avg(v) INTO head_mean FROM unnest(in_array) AS t(v);
+    cuts := ARRAY[head_mean];
+
+    -- every further break is the mean of the values above the entry at the
+    -- previous position
+    LOOP
+        EXIT WHEN step > breaks;
+        SELECT avg(v) INTO head_mean
+        FROM unnest(in_array) AS t(v)
+        WHERE v > cuts[step - 1];
+        IF head_mean IS NOT NULL THEN
+            cuts := array_append(cuts, head_mean);
+        END IF;
+        step := step + 1;
+    END LOOP;
+
+    RETURN cuts;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Jenks classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--                 bins based on the Jenks method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- @param iterations The number of different starting positions to test.
+--
+-- @param invert Optional whether to return the top of each bin (default)
+--               or the bottom. BOOLEAN, default=FALSE.
+-- +-- + + +CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean NUMERIC; + bot INT; + top INT; + tops INT[]; + classes INT[][]; + i INT := 1; j INT := 1; + curr_result NUMERIC[]; + best_result NUMERIC[]; + seedtarget TEXT; + quant NUMERIC[]; + shuffles INT; +BEGIN + -- get the total size of our row + element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + -- assume best is actually Quantile + SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; + + -- if data is very very large, just return quant and be done + IF element_count > 5000000 THEN + RETURN quant; + END IF; + + -- change quant into bottom, top markers + LOOP + IF i = 1 THEN + bot = 1; + ELSE + -- use last top to find this bot + bot = top+1; + END IF; + IF i = breaks THEN + top = element_count; + ELSE + SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; + END IF; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + IF i > breaks THEN EXIT; END IF; + i = i+1; + END LOOP; + + best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + --set the seed so we can ensure the same results + SELECT setseed(0.4567) INTO seedtarget; + 
--loop through random starting positions + LOOP + IF j > iterations-1 THEN EXIT; END IF; + i = 1; + tops = ARRAY[element_count]; + LOOP + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; + i = array_length(tops, 1); + END LOOP; + i = 1; + LOOP + IF i > breaks THEN EXIT; END IF; + IF i = 1 THEN + bot = 1; + ELSE + bot = top+1; + END IF; + top = tops[i]; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + i := i+1; + END LOOP; + curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + IF curr_result[1] > best_result[1] THEN + best_result = curr_result; + j = j-1; -- if we found a better result, add one more search + END IF; + j = j+1; + END LOOP; + + RETURN (best_result)[2:array_upper(best_result, 1)]; +END; +$$ language plpgsql VOLATILE PARALLEL RESTRICTED; + + + +-- +-- Perform a single iteration of the Jenks classification +-- + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + tmp_val numeric; + new_classes int[][]; + tmp_class int[]; + i INT := 1; + j INT := 1; + side INT := 2; + sdam numeric; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + class_avg numeric; + class_max_i INT; + class_min_i INT; + class_max numeric; + class_min numeric; + reply numeric[]; +BEGIN + + -- Calculate the sum of squared deviations from the array mean (SDAM). 
+ SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END 
IF; + END LOOP; + + RETURN array_prepend(gvf, reply); + +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + + +-- +-- Determine the Quantile classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Quantile method. +-- +-- @param breaks The number of bins you want to find. +-- +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + break_size numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + -- sort our values + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; + -- get the total size of our data + element_count := array_length(in_array, 1); + break_size := element_count::numeric / breaks; + -- slice our bread + LOOP + IF i < breaks THEN + IF break_size * i % 1 > 0 THEN + SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; + ELSE + SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; + END IF; + ELSIF i = breaks THEN + -- select the last value + SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; + ELSE + EXIT; + END IF; + + reply = array_append(reply, tmp_val); + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE; diff --git a/release/crankshaft.control b/release/crankshaft.control index 216a89f..7d5a93a 100644 --- a/release/crankshaft.control +++ b/release/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.6.1' +default_version = '0.7.0' requires = 'plpythonu, postgis' superuser = true schema = cdb_crankshaft diff --git a/release/python/0.7.0/crankshaft/crankshaft/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/__init__.py new file mode 100644 index 0000000..82b2b87 --- /dev/null +++ 
b/release/python/0.7.0/crankshaft/crankshaft/__init__.py @@ -0,0 +1,7 @@ +"""Import all modules""" +import crankshaft.random_seeds +import crankshaft.clustering +import crankshaft.space_time_dynamics +import crankshaft.segmentation +import crankshaft.regression +import analysis_data_provider diff --git a/release/python/0.7.0/crankshaft/crankshaft/analysis_data_provider.py b/release/python/0.7.0/crankshaft/crankshaft/analysis_data_provider.py new file mode 100644 index 0000000..3d5225a --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/analysis_data_provider.py @@ -0,0 +1,98 @@ +"""class for fetching data""" +import plpy +import pysal_utils as pu + +NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows ' + 'for null values and fill in appropriately.') + + +def verify_data(func): + """decorator to verify data result before returning to algorithm""" + def wrapper(*args, **kwargs): + """Error checking""" + try: + data = func(*args, **kwargs) + if not data: + plpy.error(NULL_VALUE_ERROR) + else: + return data + except Exception as err: + plpy.error('Analysis failed: {}'.format(err)) + + return [] + + return wrapper + + +class AnalysisDataProvider(object): + @verify_data + def get_getis(self, w_type, params): + """fetch data for getis ord's g""" + query = pu.construct_neighbor_query(w_type, params) + return plpy.execute(query) + + @verify_data + def get_markov(self, w_type, params): + """fetch data for spatial markov""" + query = pu.construct_neighbor_query(w_type, params) + return plpy.execute(query) + + @verify_data + def get_moran(self, w_type, params): + """fetch data for moran's i analyses""" + query = pu.construct_neighbor_query(w_type, params) + return plpy.execute(query) + + @verify_data + def get_nonspatial_kmeans(self, params): + """ + Fetch data for non-spatial k-means. 
+ + Inputs - a dict (params) with the following keys: + colnames: a (text) list of column names (e.g., + `['andy', 'cookie']`) + id_col: the name of the id column (e.g., `'cartodb_id'`) + subquery: the subquery for exposing the data (e.g., + SELECT * FROM favorite_things) + Output: + A SQL query for packaging the data for consumption within + `KMeans().nonspatial`. Format will be a list of length one, + with the first element a dict with keys ('rowid', 'attr1', + 'attr2', ...) + """ + agg_cols = ', '.join([ + 'array_agg({0}) As arr_col{1}'.format(val, idx+1) + for idx, val in enumerate(params['colnames']) + ]) + query = ''' + SELECT {cols}, array_agg({id_col}) As rowid + FROM ({subquery}) As a + '''.format(subquery=params['subquery'], + id_col=params['id_col'], + cols=agg_cols).strip() + return plpy.execute(query) + + @verify_data + def get_spatial_kmeans(self, params): + """fetch data for spatial kmeans""" + query = ''' + SELECT + array_agg("{id_col}" ORDER BY "{id_col}") as ids, + array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs, + array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys + FROM ({subquery}) As a + WHERE "{geom_col}" IS NOT NULL + '''.format(**params) + return plpy.execute(query) + + @verify_data + def get_gwr(self, params): + """fetch data for gwr analysis""" + query = pu.gwr_query(params) + return plpy.execute(query) + + @verify_data + def get_gwr_predict(self, params): + """fetch data for gwr predict""" + query = pu.gwr_predict_query(params) + return plpy.execute(query) diff --git a/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py new file mode 100644 index 0000000..d9682fa --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py @@ -0,0 +1,4 @@ +"""Import all functions from for clustering""" +from moran import * +from kmeans import * +from getis import * diff --git 
a/release/python/0.7.0/crankshaft/crankshaft/clustering/getis.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/getis.py new file mode 100644 index 0000000..2bee3a2 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/getis.py @@ -0,0 +1,50 @@ +""" +Getis-Ord's G geostatistics (hotspot/coldspot analysis) +""" + +import pysal as ps +from collections import OrderedDict + +# crankshaft modules +import crankshaft.pysal_utils as pu +from crankshaft.analysis_data_provider import AnalysisDataProvider + +# High level interface --------------------------------------- + + +class Getis(object): + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + + def getis_ord(self, subquery, attr, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Getis-Ord's G* + Implementation building neighbors with a PostGIS database and PySAL's + Getis-Ord's G* hotspot/coldspot module. 
+ Andy Eschbacher + """ + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors if kNN is chosen + + params = OrderedDict([("id_col", id_col), + ("attr1", attr), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_getis(w_type, params) + attr_vals = pu.get_attributes(result) + + # build PySAL weight object + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate Getis-Ord's G* z- and p-values + getis = ps.esda.getisord.G_Local(attr_vals, weight, + star=True, permutations=permutations) + + return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order) diff --git a/release/python/0.7.0/crankshaft/crankshaft/clustering/kmeans.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/kmeans.py new file mode 100644 index 0000000..6d22d44 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/kmeans.py @@ -0,0 +1,113 @@ +from sklearn.cluster import KMeans +import numpy as np + +from crankshaft.analysis_data_provider import AnalysisDataProvider + + +class Kmeans(object): + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + + def spatial(self, query, no_clusters, no_init=20): + """ + find centers based on clusters of latitude/longitude pairs + query: SQL query that has a WGS84 geometry (the_geom) + """ + params = {"subquery": query, + "geom_col": "the_geom", + "id_col": "cartodb_id"} + + result = self.data_provider.get_spatial_kmeans(params) + + # Unpack query response + xs = result[0]['xs'] + ys = result[0]['ys'] + ids = result[0]['ids'] + + km = KMeans(n_clusters=no_clusters, n_init=no_init) + labels = km.fit_predict(zip(xs, ys)) + return zip(ids, labels) + + def nonspatial(self, subquery, colnames, no_clusters=5, + standardize=True, id_col='cartodb_id'): + """ + Arguments: + query (string): A SQL query to 
retrieve the data required to do the + k-means clustering analysis, like so: + SELECT * FROM iris_flower_data + colnames (list): a list of the column names which contain the data + of interest, like so: ['sepal_width', + 'petal_width', + 'sepal_length', + 'petal_length'] + no_clusters (int): number of clusters (greater than zero) + id_col (string): name of the input id_column + + Returns: + A list of tuples with the following columns: + cluster labels: a label for the cluster that the row belongs to + centers: center of the cluster that this row belongs to + silhouettes: silhouette measure for this value + rowid: row that these values belong to (corresponds to the value in + `id_col`) + """ + import json + from sklearn import metrics + + params = { + "colnames": colnames, + "subquery": subquery, + "id_col": id_col + } + + data = self.data_provider.get_nonspatial_kmeans(params) + + # fill array with values for k-means clustering + if standardize: + cluster_columns = _scale_data( + _extract_columns(data)) + else: + cluster_columns = _extract_columns(data) + + kmeans = KMeans(n_clusters=no_clusters, + random_state=0).fit(cluster_columns) + + centers = [json.dumps(dict(zip(colnames, c))) + for c in kmeans.cluster_centers_[kmeans.labels_]] + + silhouettes = metrics.silhouette_samples(cluster_columns, + kmeans.labels_, + metric='sqeuclidean') + + return zip(kmeans.labels_, + centers, + silhouettes, + [kmeans.inertia_] * kmeans.labels_.shape[0], + data[0]['rowid']) + + +# -- Preprocessing steps + +def _extract_columns(data): + """ + Extract the features from the query and pack them into a NumPy array + data (list of dicts): result of the kmeans request + """ + # number of columns minus rowid column + n_cols = len(data[0]) - 1 + return np.array([data[0]['arr_col{0}'.format(i+1)] + for i in xrange(n_cols)], + dtype=float).T + + +def _scale_data(features): + """ + Scale all input columns to center on 0 with a standard devation of 1 + features (numpy matrix): features of 
dimension (n_features, n_samples) + """ + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + return scaler.fit_transform(features) diff --git a/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py new file mode 100644 index 0000000..0d5753f --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py @@ -0,0 +1,208 @@ +""" +Moran's I geostatistics (global clustering & outliers presence) +""" + +# TODO: Fill in local neighbors which have null/NoneType values with the +# average of the their neighborhood + +import pysal as ps +from collections import OrderedDict +from crankshaft.analysis_data_provider import AnalysisDataProvider + +# crankshaft module +import crankshaft.pysal_utils as pu + +# High level interface --------------------------------------- + + +class Moran(object): + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + + def global_stat(self, subquery, attr_name, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I (global) + Implementation building neighbors with a PostGIS database and Moran's I + core clusters with PySAL. 
+ Andy Eschbacher + """ + params = OrderedDict([("id_col", id_col), + ("attr1", attr_name), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + attr_vals = pu.get_attributes(result) + + # calculate weights + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate moran global + moran_global = ps.esda.moran.Moran(attr_vals, weight, + permutations=permutations) + + return zip([moran_global.I], [moran_global.EI]) + + def local_stat(self, subquery, attr, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I implementation for PL/Python + Andy Eschbacher + """ + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors + + params = OrderedDict([("id_col", id_col), + ("attr1", attr), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + attr_vals = pu.get_attributes(result) + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local(attr_vals, weight, + permutations=permutations) + + # find quadrants for each geometry + quads = quad_position(lisa.q) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + + def global_rate_stat(self, subquery, numerator, denominator, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I Rate (global) + Andy Eschbacher + """ + params = OrderedDict([("id_col", id_col), + ("attr1", numerator), + ("attr2", denominator), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + numer = pu.get_attributes(result, 1) + denom = pu.get_attributes(result, 2) + + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate moran global rate + lisa_rate = 
ps.esda.moran.Moran_Rate(numer, denom, weight, + permutations=permutations) + + return zip([lisa_rate.I], [lisa_rate.EI]) + + def local_rate_stat(self, subquery, numerator, denominator, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I Local Rate + Andy Eschbacher + """ + # geometries with values that are null are ignored + # resulting in a collection of not as near neighbors + + params = OrderedDict([("id_col", id_col), + ("numerator", numerator), + ("denominator", denominator), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + numer = pu.get_attributes(result, 1) + denom = pu.get_attributes(result, 2) + + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight, + permutations=permutations) + + # find quadrants for each geometry + quads = quad_position(lisa.q) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + + def local_bivariate_stat(self, subquery, attr1, attr2, + permutations, geom_col, id_col, + w_type, num_ngbrs): + """ + Moran's I (local) Bivariate (untested) + """ + + params = OrderedDict([("id_col", id_col), + ("attr1", attr1), + ("attr2", attr2), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + attr1_vals = pu.get_attributes(result, 1) + attr2_vals = pu.get_attributes(result, 2) + + # create weights + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight, + permutations=permutations) + + # find clustering of significance + lisa_sig = quad_position(lisa.q) + + return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order) + +# Low level functions ---------------------------------------- + + +def map_quads(coord): + 
""" + Map a quadrant number to Moran's I designation + HH=1, LH=2, LL=3, HL=4 + Input: + @param coord (int): quadrant of a specific measurement + Output: + classification (one of 'HH', 'LH', 'LL', or 'HL') + """ + if coord == 1: + return 'HH' + elif coord == 2: + return 'LH' + elif coord == 3: + return 'LL' + elif coord == 4: + return 'HL' + else: + return None + + +def quad_position(quads): + """ + Produce Moran's I classification based of n + Input: + @param quads ndarray: an array of quads classified by + 1-4 (PySAL default) + Output: + @param list: an array of quads classied by 'HH', 'LL', etc. + """ + return [map_quads(q) for q in quads] diff --git a/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/__init__.py new file mode 100644 index 0000000..fdf073b --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/__init__.py @@ -0,0 +1,2 @@ +"""Import all functions for pysal_utils""" +from crankshaft.pysal_utils.pysal_utils import * diff --git a/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py b/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py new file mode 100644 index 0000000..6b02f6d --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py @@ -0,0 +1,251 @@ +""" + Utilities module for generic PySAL functionality, mainly centered on + translating queries into numpy arrays or PySAL weights objects +""" + +import numpy as np +import pysal as ps + + +def construct_neighbor_query(w_type, query_vals): + """Return query (a string) used for finding neighbors + @param w_type text: type of neighbors to calculate ('knn' or 'queen') + @param query_vals dict: values used to construct the query + """ + + if w_type.lower() == 'knn': + return knn(query_vals) + else: + return queen(query_vals) + + +# Build weight object +def get_weight(query_res, w_type='knn', num_ngbrs=5): + """ + Construct PySAL weight from 
return value of query + @param query_res dict-like: query results with attributes and neighbors + """ + + neighbors = {x['id']: x['neighbors'] for x in query_res} + print 'len of neighbors: %d' % len(neighbors) + + built_weight = ps.W(neighbors) + built_weight.transform = 'r' + + return built_weight + + +def query_attr_select(params, table_ref=True): + """ + Create portion of SELECT statement for attributes inolved in query. + Defaults to order in the params + @param params: dict of information used in query (column names, + table name, etc.) + Example: + OrderedDict([('numerator', 'price'), + ('denominator', 'sq_meters'), + ('subquery', 'SELECT * FROM interesting_data')]) + Output: + "i.\"price\"::numeric As attr1, " \ + "i.\"sq_meters\"::numeric As attr2, " + """ + + attr_string = "" + template = "\"%(col)s\"::numeric As attr%(alias_num)s, " + + if table_ref: + template = "i." + template + + if ('time_cols' in params) or ('ind_vars' in params): + # if markov or gwr analysis + attrs = (params['time_cols'] if 'time_cols' in params + else params['ind_vars']) + if 'ind_vars' in params: + template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, " + + for idx, val in enumerate(attrs): + attr_string += template % {"col": val, "alias_num": idx + 1} + else: + # if moran's analysis + attrs = [k for k in params + if k not in ('id_col', 'geom_col', 'subquery', + 'num_ngbrs', 'subquery')] + + for idx, val in enumerate(attrs): + attr_string += template % {"col": params[val], + "alias_num": idx + 1} + + return attr_string + + +def query_attr_where(params, table_ref=True): + """ + Construct where conditions when building neighbors query + Create portion of WHERE clauses for weeding out NULL-valued geometries + Input: dict of params: + {'subquery': ..., + 'numerator': 'data1', + 'denominator': 'data2', + '': ...} + Output: + 'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL' + Input: + {'subquery': ..., + 'time_cols': ['time1', 'time2', 'time3'], + 
'etc': ...} + Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT + NULL AND idx_replace."time3" IS NOT NULL' + """ + attr_string = [] + template = "\"%s\" IS NOT NULL" + if table_ref: + template = "idx_replace." + template + + if ('time_cols' in params) or ('ind_vars' in params): + # markov or gwr where clauses + attrs = (params['time_cols'] if 'time_cols' in params + else params['ind_vars']) + # add values to template + for attr in attrs: + attr_string.append(template % attr) + else: + # moran where clauses + + # get keys + attrs = [k for k in params + if k not in ('id_col', 'geom_col', 'subquery', + 'num_ngbrs', 'subquery')] + + # add values to template + for attr in attrs: + attr_string.append(template % params[attr]) + + if 'denominator' in attrs: + attr_string.append( + "idx_replace.\"%s\" <> 0" % params['denominator']) + + out = " AND ".join(attr_string) + + return out + + +def knn(params): + """SQL query for k-nearest neighbors. + @param vars: dict of values to fill template + """ + + attr_select = query_attr_select(params, table_ref=True) + attr_where = query_attr_where(params, table_ref=True) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = ''' + SELECT + i."{id_col}" As id, + %(attr_select)s + (SELECT ARRAY(SELECT j."{id_col}" + FROM ({subquery}) As j + WHERE i."{id_col}" <> j."{id_col}" AND + %(attr_where_j)s AND + j."{geom_col}" IS NOT NULL + ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC + LIMIT {num_ngbrs})) As neighbors + FROM ({subquery}) As i + WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL + ORDER BY i."{id_col}" ASC; + ''' % replacements + + return query.format(**params) + + +# SQL query for finding queens neighbors (all contiguous polygons) +def queen(params): + """SQL query for queen neighbors. 
+ @param params dict: information to fill query + """ + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = ''' + SELECT + i."{id_col}" As id, + %(attr_select)s + (SELECT ARRAY(SELECT j."{id_col}" + FROM ({subquery}) As j + WHERE i."{id_col}" <> j."{id_col}" AND + ST_Touches(i."{geom_col}", j."{geom_col}") AND + %(attr_where_j)s)) As neighbors + FROM ({subquery}) As i + WHERE + %(attr_where_i)s + ORDER BY i."{id_col}" ASC; + ''' % replacements + + return query.format(**params) + + +def gwr_query(params): + """ + GWR query + """ + + replacements = {"ind_vars_select": query_attr_select(params, + table_ref=None), + "ind_vars_where": query_attr_where(params, + table_ref=None)} + + query = ''' + SELECT + array_agg(ST_X(ST_Centroid("{geom_col}"))) As x, + array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y, + array_agg("{dep_var}") As dep_var, + %(ind_vars_select)s + array_agg("{id_col}") As rowid + FROM ({subquery}) As q + WHERE + "{dep_var}" IS NOT NULL AND + %(ind_vars_where)s + ''' % replacements + + return query.format(**params).strip() + + +def gwr_predict_query(params): + """ + GWR query + """ + + replacements = {"ind_vars_select": query_attr_select(params, + table_ref=None), + "ind_vars_where": query_attr_where(params, + table_ref=None)} + + query = ''' + SELECT + array_agg(ST_X(ST_Centroid({geom_col}))) As x, + array_agg(ST_Y(ST_Centroid({geom_col}))) As y, + array_agg({dep_var}) As dep_var, + %(ind_vars_select)s + array_agg({id_col}) As rowid + FROM ({subquery}) As q + WHERE + %(ind_vars_where)s + ''' % replacements + + return query.format(**params).strip() +# to add more weight methods open a ticket or pull request + + +def get_attributes(query_res, attr_num=1): + """ + @param query_res: query results with attributes and neighbors + @param attr_num: attribute 
number (1, 2, ...) + """ + return np.array([x['attr' + str(attr_num)] for x in query_res], + dtype=np.float) diff --git a/release/python/0.7.0/crankshaft/crankshaft/random_seeds.py b/release/python/0.7.0/crankshaft/crankshaft/random_seeds.py new file mode 100644 index 0000000..c55ba14 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/random_seeds.py @@ -0,0 +1,12 @@ +"""Random seed generator used for non-deterministic functions in crankshaft""" +import random +import numpy + + +def set_random_seeds(value): + """ + Set the seeds of the RNGs (Random Number Generators) + used internally. + """ + random.seed(value) + numpy.random.seed(value) diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/regression/__init__.py new file mode 100644 index 0000000..f9d6d07 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/__init__.py @@ -0,0 +1,3 @@ +from crankshaft.regression.gwr import * +from crankshaft.regression.glm import * +from crankshaft.regression.gwr_cs import * diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb new file mode 100644 index 0000000..1b17831 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Import GLM and pysal\n", + "import os\n", + "import numpy as np\n", + "os.chdir('/Users/toshan/dev/pysal/pysal/contrib/glm')\n", + "from glm import GLM\n", + "import pysal\n", + "import pandas as pd\n", + "import statsmodels.formula.api as smf\n", + "import statsmodels.api as sm\n", + "from family import Gaussian, Binomial, Poisson, QuasiPoisson\n", + "\n", + "from statsmodels.api import families" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Prepare some test data - columbus example\n", + "db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')\n", + "y = np.array(db.by_col(\"HOVAL\"))\n", + "y = np.reshape(y, (49,1))\n", + "X = []\n", + "#X.append(np.ones(len(y)))\n", + "X.append(db.by_col(\"INC\"))\n", + "X.append(db.by_col(\"CRIME\"))\n", + "X = np.array(X).T" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 46.42818268]\n", + " [ 0.62898397]\n", + " [ -0.48488854]]\n" + ] + } + ], + "source": [ + "#First fit pysal OLS model\n", + "from pysal.spreg import ols\n", + "OLS = ols.OLS(y, X)\n", + "print OLS.betas" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "[ 46.42818268 0.62898397 -0.48488854]\n", + "[ 46.42818268 0.62898397 -0.48488854]\n" + ] + } + ], + "source": [ + "#Then fit Gaussian GLM\n", + "\n", + "#create Gaussian GLM model object\n", + "model = GLM(y, X, Gaussian())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [46.4282, 0.6290, -0.4849]\n", + "print results.params\n", + "\n", + "# Gaussian GLM results from statsmodels\n", + "sm_model = smf.GLM(y, sm.add_constant(X), family=families.Gaussian())\n", + "sm_results = sm_model.fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 2\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + 
"True\n", + "True\n", + "\n", + "\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "print results.df_model, sm_results.df_model\n", + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "[ 3.92159085 0.01183491 -0.01371397]\n", + "[ 3.92159085 0.01183491 -0.01371397]\n" + ] + } + ], + "source": [ + "#Now fit a Poisson GLM \n", + "\n", + "poisson_y = np.round(y).astype(int)\n", + "\n", + "#create Poisson GLM model object\n", + "model = GLM(poisson_y, X, Poisson())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [3.91926, 0.01198, -0.01371]\n", + "print results.params.T\n", + "\n", + "# Poisson GLM results from statsmodels\n", + "sm_results = smf.GLM(poisson_y, sm.add_constant(X), family=families.Poisson()).fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "\n", + "\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "[ 0.13049161 0.00511599 0.00193769] [ 0.13049161 0.00511599 0.00193769]\n" + ] + } + ], + "source": [ + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, 
sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n", + "print results.bse, sm_results.bse" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-5.33638276 0.0287754 ]\n", + "[-5.33638276 0.0287754 ]\n" + ] + } + ], + "source": [ + "#Now fit a binomial GLM\n", + "londonhp = pd.read_csv('/Users/toshan/projects/londonhp.csv')\n", + "#londonhp = pd.read_csv('/Users/qszhao/Dropbox/pysal/pysal/contrib/gwr/londonhp.csv')\n", + "y = londonhp['BATH2'].values\n", + "y = np.reshape(y, (316,1))\n", + "X = londonhp['FLOORSZ'].values\n", + "X = np.reshape(X, (316,1))\n", + "\n", + "#create logistic GLM model object\n", + "model = GLM(y, X, Binomial())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [-5.33638, 0.02878]\n", + "print results.params.T\n", + "\n", + "# Logistic GLM results from statsmodels\n", + "sm_results = smf.GLM(y, 
sm.add_constant(X), family=families.Binomial()).fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 1\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "print results.df_model, sm_results.df_model\n", + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, 
sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "#create QUasiPoisson GLM model object\n", + "model = GLM(poisson_y, X, QuasiPoisson())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py new file mode 100644 index 0000000..4a468d5 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py @@ -0,0 +1,4 @@ +import glm +import family +import utils +import iwls diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py new file mode 100644 index 0000000..484c1c8 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py @@ -0,0 +1,959 @@ + +from __future__ import print_function +import numpy as np +from scipy import stats +from utils import cache_readonly + +class Results(object): + """ + Class to contain model results + Parameters + ---------- + model : class instance + the previously specified 
model instance + params : array + parameter estimates from the fit model + """ + def __init__(self, model, params, **kwd): + self.__dict__.update(kwd) + self.initialize(model, params, **kwd) + self._data_attr = [] + + def initialize(self, model, params, **kwd): + self.params = params + self.model = model + if hasattr(model, 'k_constant'): + self.k_constant = model.k_constant + + def predict(self, exog=None, transform=True, *args, **kwargs): + """ + Call self.model.predict with self.params as the first argument. + Parameters + ---------- + exog : array-like, optional + The values for which you want to predict. + transform : bool, optional + If the model was fit via a formula, do you want to pass + exog through the formula. Default is True. E.g., if you fit + a model y ~ log(x1) + log(x2), and transform is True, then + you can pass a data structure that contains x1 and x2 in + their original form. Otherwise, you'd need to log the data + first. + args, kwargs : + Some models can take additional arguments or keywords, see the + predict method of the model for the details. + Returns + ------- + prediction : ndarray or pandas.Series + See self.model.predict + """ + if transform and hasattr(self.model, 'formula') and exog is not None: + from patsy import dmatrix + exog = dmatrix(self.model.data.design_info.builder, + exog) + + if exog is not None: + exog = np.asarray(exog) + if exog.ndim == 1 and (self.model.exog.ndim == 1 or + self.model.exog.shape[1] == 1): + exog = exog[:, None] + exog = np.atleast_2d(exog) # needed in count model shape[1] + + return self.model.predict(self.params, exog, *args, **kwargs) + + +#TODO: public method? +class LikelihoodModelResults(Results): + """ + Class to contain results from likelihood models + Parameters + ----------- + model : LikelihoodModel instance or subclass instance + LikelihoodModelResults holds a reference to the model that is fit. 
+ params : 1d array_like + parameter estimates from estimated model + normalized_cov_params : 2d array + Normalized (before scaling) covariance of params. (dot(X.T,X))**-1 + scale : float + For (some subset of models) scale will typically be the + mean square error from the estimated model (sigma^2) + Returns + ------- + **Attributes** + mle_retvals : dict + Contains the values returned from the chosen optimization method if + full_output is True during the fit. Available only if the model + is fit by maximum likelihood. See notes below for the output from + the different methods. + mle_settings : dict + Contains the arguments passed to the chosen optimization method. + Available if the model is fit by maximum likelihood. See + LikelihoodModel.fit for more information. + model : model instance + LikelihoodResults contains a reference to the model that is fit. + params : ndarray + The parameters estimated for the model. + scale : float + The scaling factor of the model given during instantiation. + tvalues : array + The t-values of the standard errors. + Notes + ----- + The covariance of params is given by scale times normalized_cov_params. + Return values by solver if full_output is True during fit: + 'newton' + fopt : float + The value of the (negative) loglikelihood at its + minimum. + iterations : int + Number of iterations performed. + score : ndarray + The score vector at the optimum. + Hessian : ndarray + The Hessian at the optimum. + warnflag : int + 1 if maxiter is exceeded. 0 if successful convergence. + converged : bool + True: converged. False: did not converge. + allvecs : list + List of solutions at each iteration. + 'nm' + fopt : float + The value of the (negative) loglikelihood at its + minimum. + iterations : int + Number of iterations performed. + warnflag : int + 1: Maximum number of function evaluations made. + 2: Maximum number of iterations reached. + converged : bool + True: converged. False: did not converge. 
+ allvecs : list + List of solutions at each iteration. + 'bfgs' + fopt : float + Value of the (negative) loglikelihood at its minimum. + gopt : float + Value of gradient at minimum, which should be near 0. + Hinv : ndarray + value of the inverse Hessian matrix at minimum. Note + that this is just an approximation and will often be + different from the value of the analytic Hessian. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + warnflag : int + 1: Maximum number of iterations exceeded. 2: Gradient + and/or function calls are not changing. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. + 'lbfgs' + fopt : float + Value of the (negative) loglikelihood at its minimum. + gopt : float + Value of gradient at minimum, which should be near 0. + fcalls : int + Number of calls to loglike. + warnflag : int + Warning flag: + - 0 if converged + - 1 if too many function evaluations or too many iterations + - 2 if stopped for another reason + converged : bool + True: converged. False: did not converge. + 'powell' + fopt : float + Value of the (negative) loglikelihood at its minimum. + direc : ndarray + Current direction set. + iterations : int + Number of iterations performed. + fcalls : int + Number of calls to loglike. + warnflag : int + 1: Maximum number of function evaluations. 2: Maximum number + of iterations. + converged : bool + True : converged. False: did not converge. + allvecs : list + Results at each iteration. + 'cg' + fopt : float + Value of the (negative) loglikelihood at its minimum. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + warnflag : int + 1: Maximum number of iterations exceeded. 2: Gradient and/ + or function calls not changing. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. 
+ 'ncg' + fopt : float + Value of the (negative) loglikelihood at its minimum. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + hcalls : int + Number of calls to hessian. + warnflag : int + 1: Maximum number of iterations exceeded. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. + """ + + # by default we use normal distribution + # can be overwritten by instances or subclasses + use_t = False + + def __init__(self, model, params, normalized_cov_params=None, scale=1., + **kwargs): + super(LikelihoodModelResults, self).__init__(model, params) + self.normalized_cov_params = normalized_cov_params + self.scale = scale + + # robust covariance + # We put cov_type in kwargs so subclasses can decide in fit whether to + # use this generic implementation + if 'use_t' in kwargs: + use_t = kwargs['use_t'] + if use_t is not None: + self.use_t = use_t + if 'cov_type' in kwargs: + cov_type = kwargs.get('cov_type', 'nonrobust') + cov_kwds = kwargs.get('cov_kwds', {}) + + if cov_type == 'nonrobust': + self.cov_type = 'nonrobust' + self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + 'covariance matrix of the errors is correctly ' + + 'specified.'} + else: + from statsmodels.base.covtype import get_robustcov_results + if cov_kwds is None: + cov_kwds = {} + use_t = self.use_t + # TODO: we shouldn't need use_t in get_robustcov_results + get_robustcov_results(self, cov_type=cov_type, use_self=True, + use_t=use_t, **cov_kwds) + + + def normalized_cov_params(self): + raise NotImplementedError + + + def _get_robustcov_results(self, cov_type='nonrobust', use_self=True, + use_t=None, **cov_kwds): + from statsmodels.base.covtype import get_robustcov_results + if cov_kwds is None: + cov_kwds = {} + + if cov_type == 'nonrobust': + self.cov_type = 'nonrobust' + self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + 'covariance matrix of the errors is 
correctly ' + + 'specified.'} + else: + # TODO: we shouldn't need use_t in get_robustcov_results + get_robustcov_results(self, cov_type=cov_type, use_self=True, + use_t=use_t, **cov_kwds) + + @cache_readonly + def llf(self): + return self.model.loglike(self.params) + + @cache_readonly + def bse(self): + return np.sqrt(np.diag(self.cov_params())) + + @cache_readonly + def tvalues(self): + """ + Return the t-statistic for a given parameter estimate. + """ + return self.params / self.bse + + @cache_readonly + def pvalues(self): + if self.use_t: + df_resid = getattr(self, 'df_resid_inference', self.df_resid) + return stats.t.sf(np.abs(self.tvalues), df_resid)*2 + else: + return stats.norm.sf(np.abs(self.tvalues))*2 + + + def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, + other=None): + """ + Returns the variance/covariance matrix. + The variance/covariance matrix can be of a linear contrast + of the estimates of params or all params multiplied by scale which + will usually be an estimate of sigma^2. Scale is assumed to be + a scalar. + Parameters + ---------- + r_matrix : array-like + Can be 1d, or 2d. Can be used alone or with other. + column : array-like, optional + Must be used on its own. Can be 0d or 1d see below. + scale : float, optional + Can be specified or not. Default is None, which means that + the scale argument is taken from the model. + other : array-like, optional + Can be used when r_matrix is specified. + Returns + ------- + cov : ndarray + covariance matrix of the parameter estimates or of linear + combination of parameter estimates. See Notes. + Notes + ----- + (The below are assumed to be in matrix notation.) 
+ If no argument is specified returns the covariance matrix of a model + ``(scale)*(X.T X)^(-1)`` + If contrast is specified it pre and post-multiplies as follows + ``(scale) * r_matrix (X.T X)^(-1) r_matrix.T`` + If contrast and other are specified returns + ``(scale) * r_matrix (X.T X)^(-1) other.T`` + If column is specified returns + ``(scale) * (X.T X)^(-1)[column,column]`` if column is 0d + OR + ``(scale) * (X.T X)^(-1)[column][:,column]`` if column is 1d + """ + if (hasattr(self, 'mle_settings') and + self.mle_settings['optimizer'] in ['l1', 'l1_cvxopt_cp']): + dot_fun = nan_dot + else: + dot_fun = np.dot + + if (cov_p is None and self.normalized_cov_params is None and + not hasattr(self, 'cov_params_default')): + raise ValueError('need covariance of parameters for computing ' + '(unnormalized) covariances') + if column is not None and (r_matrix is not None or other is not None): + raise ValueError('Column should be specified without other ' + 'arguments.') + if other is not None and r_matrix is None: + raise ValueError('other can only be specified with r_matrix') + + if cov_p is None: + if hasattr(self, 'cov_params_default'): + cov_p = self.cov_params_default + else: + if scale is None: + scale = self.scale + cov_p = self.normalized_cov_params * scale + + if column is not None: + column = np.asarray(column) + if column.shape == (): + return cov_p[column, column] + else: + #return cov_p[column][:, column] + return cov_p[column[:, None], column] + elif r_matrix is not None: + r_matrix = np.asarray(r_matrix) + if r_matrix.shape == (): + raise ValueError("r_matrix should be 1d or 2d") + if other is None: + other = r_matrix + else: + other = np.asarray(other) + tmp = dot_fun(r_matrix, dot_fun(cov_p, np.transpose(other))) + return tmp + else: # if r_matrix is None and column is None: + return cov_p + + #TODO: make sure this works as needed for GLMs + def t_test(self, r_matrix, cov_p=None, scale=None, + use_t=None): + """ + Compute a t-test for a each linear 
hypothesis of the form Rb = q + Parameters + ---------- + r_matrix : array-like, str, tuple + - array : If an array is given, a p x k 2d array or length k 1d + array specifying the linear restrictions. It is assumed + that the linear combination is equal to zero. + - str : The full hypotheses to test can be given as a string. + See the examples. + - tuple : A tuple of arrays in the form (R, q). If q is given, + can be either a scalar or a length p row vector. + cov_p : array-like, optional + An alternative estimate for the parameter covariance matrix. + If None is given, self.normalized_cov_params is used. + scale : float, optional + An optional `scale` to use. Default is the scale specified + by the model fit. + use_t : bool, optional + If use_t is None, then the default of the model is used. + If use_t is True, then the p-values are based on the t + distribution. + If use_t is False, then the p-values are based on the normal + distribution. + Returns + ------- + res : ContrastResults instance + The results for the test are attributes of this results instance. + The available results have the same elements as the parameter table + in `summary()`. + Examples + -------- + >>> import numpy as np + >>> import statsmodels.api as sm + >>> data = sm.datasets.longley.load() + >>> data.exog = sm.add_constant(data.exog) + >>> results = sm.OLS(data.endog, data.exog).fit() + >>> r = np.zeros_like(results.params) + >>> r[5:] = [1,-1] + >>> print(r) + [ 0. 0. 0. 0. 0. 1. -1.] + r tests that the coefficients on the 5th and 6th independent + variable are the same. 
+ >>> T_test = results.t_test(r) + >>> print(T_test) + + >>> T_test.effect + -1829.2025687192481 + >>> T_test.sd + 455.39079425193762 + >>> T_test.tvalue + -4.0167754636411717 + >>> T_test.pvalue + 0.0015163772380899498 + Alternatively, you can specify the hypothesis tests using a string + >>> from statsmodels.formula.api import ols + >>> dta = sm.datasets.longley.load_pandas().data + >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR' + >>> results = ols(formula, dta).fit() + >>> hypotheses = 'GNPDEFL = GNP, UNEMP = 2, YEAR/1829 = 1' + >>> t_test = results.t_test(hypotheses) + >>> print(t_test) + See Also + --------- + tvalues : individual t statistics + f_test : for F tests + patsy.DesignInfo.linear_constraint + """ + from patsy import DesignInfo + names = self.model.data.param_names + LC = DesignInfo(names).linear_constraint(r_matrix) + r_matrix, q_matrix = LC.coefs, LC.constants + num_ttests = r_matrix.shape[0] + num_params = r_matrix.shape[1] + + if (cov_p is None and self.normalized_cov_params is None and + not hasattr(self, 'cov_params_default')): + raise ValueError('Need covariance of parameters for computing ' + 'T statistics') + if num_params != self.params.shape[0]: + raise ValueError('r_matrix and params are not aligned') + if q_matrix is None: + q_matrix = np.zeros(num_ttests) + else: + q_matrix = np.asarray(q_matrix) + q_matrix = q_matrix.squeeze() + if q_matrix.size > 1: + if q_matrix.shape[0] != num_ttests: + raise ValueError("r_matrix and q_matrix must have the same " + "number of rows") + + if use_t is None: + #switch to use_t false if undefined + use_t = (hasattr(self, 'use_t') and self.use_t) + + _t = _sd = None + + _effect = np.dot(r_matrix, self.params) + # nan_dot multiplies with the convention nan * 0 = 0 + + # Perform the test + if num_ttests > 1: + _sd = np.sqrt(np.diag(self.cov_params( + r_matrix=r_matrix, cov_p=cov_p))) + else: + _sd = np.sqrt(self.cov_params(r_matrix=r_matrix, cov_p=cov_p)) + _t = (_effect - q_matrix) * 
def f_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None):
    """
    Run a joint linear hypothesis test using the F distribution.

    This is `wald_test` with ``use_f=True``; all other arguments are passed
    through unchanged.

    Parameters
    ----------
    r_matrix : array-like, str, or tuple
        - array : An r x k array where r is the number of restrictions to
          test and k is the number of regressors. The linear combination
          is assumed to equal zero.
        - str : The full hypotheses to test, given as a string.
          See the examples.
        - tuple : A tuple of arrays in the form (R, q), where ``q`` can be
          either a scalar or a length k row vector.
    cov_p : array-like, optional
        Alternative estimate of the parameter covariance matrix, forwarded
        to `wald_test`.
    scale : float, optional
        Forwarded to `wald_test`. Default is 1.0 for no scaling.
    invcov : array-like, optional
        A q x q inverse covariance matrix based on a restrictions matrix,
        forwarded to `wald_test`.

    Returns
    -------
    res : ContrastResults instance
        Whatever `wald_test` returns; the test results are attributes of
        that instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[1:,:]

    This tests that each coefficient is jointly statistically
    significantly different from zero.

    >>> print(results.f_test(A))

    Compare this to

    >>> results.fvalue
    330.2853392346658
    >>> results.f_pvalue
    4.98403096572e-10

    >>> B = np.array(([0,0,1,-1,0,0,0],[0,0,0,0,0,1,-1]))

    This tests that the coefficient on the 2nd and 3rd regressors are
    equal and jointly that the coefficient on the 5th and 6th regressors
    are equal.

    >>> print(results.f_test(B))

    Alternatively, you can specify the hypothesis tests using a string

    >>> from statsmodels.datasets import longley
    >>> from statsmodels.formula.api import ols
    >>> dta = longley.load_pandas().data
    >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
    >>> results = ols(formula, dta).fit()
    >>> hypotheses = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)'
    >>> f_test = results.f_test(hypotheses)
    >>> print(f_test)

    See Also
    --------
    statsmodels.stats.contrast.ContrastResults
    wald_test
    t_test
    patsy.DesignInfo.linear_constraint

    Notes
    -----
    The matrix `r_matrix` is assumed to be non-singular. More precisely,
    r_matrix (pX pX.T) r_matrix.T
    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.
    """
    # Same call as the generic Wald test, with the distribution pinned to F.
    wald_options = dict(cov_p=cov_p, scale=scale, invcov=invcov, use_f=True)
    return self.wald_test(r_matrix, **wald_options)
+ cov_p : array-like, optional + An alternative estimate for the parameter covariance matrix. + If None is given, self.normalized_cov_params is used. + scale : float, optional + Default is 1.0 for no scaling. + invcov : array-like, optional + A q x q array to specify an inverse covariance matrix based on a + restrictions matrix. + use_f : bool + If True, then the F-distribution is used. If False, then the + asymptotic distribution, chisquare is used. If use_f is None, then + the F distribution is used if the model specifies that use_t is True. + The test statistic is proportionally adjusted for the distribution + by the number of constraints in the hypothesis. + Returns + ------- + res : ContrastResults instance + The results for the test are attributes of this results instance. + See also + -------- + statsmodels.stats.contrast.ContrastResults + f_test + t_test + patsy.DesignInfo.linear_constraint + Notes + ----- + The matrix `r_matrix` is assumed to be non-singular. More precisely, + r_matrix (pX pX.T) r_matrix.T + is assumed invertible. Here, pX is the generalized inverse of the + design matrix of the model. There can be problems in non-OLS models + where the rank of the covariance of the noise is not full. 
def wald_test_terms(self, skip_single=False, extra_constraints=None,
                    combine_terms=None):
    """
    Compute a sequence of Wald tests for terms over multiple columns

    This computes joined Wald tests for the hypothesis that all
    coefficients corresponding to a `term` are zero.
    `Terms` are defined by the underlying formula or, when no formula
    information is attached to the model, by the exog names.

    Parameters
    ----------
    skip_single : boolean
        If true, then terms that consist only of a single column and,
        therefore, refer only to a single parameter are skipped.
        If false, then all terms are included. Without formula
        information every term is a single column, so nothing but the
        combined and extra constraints remains.
    extra_constraints : sequence of (name, constraint) pairs, optional
        Additional constraints appended after the term and combined
        constraints. Marked "not tested yet" by the original authors.
    combine_terms : None or list of strings
        Each string in this list is matched to the name of the terms or
        the name of the exogenous variables. All columns whose name
        includes that string are combined in one joint test.

    Returns
    -------
    test_result : result instance
        The result instance contains `table` which is a pandas DataFrame
        with the test results: test statistic, pvalue and degrees of
        freedom of the constraint (plus ``df_denom`` when ``use_t`` is
        set on the results instance).

    Raises
    ------
    ValueError
        If an entry of `combine_terms` does not match any term name.

    Examples
    --------
    >>> res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
                      data).fit()
    >>> res_ols.wald_test_terms()

                                              F                P>F  df constraint  df denom
    Intercept                        279.754525  2.37985521351e-22              1        51
    C(Duration, Sum)                   5.367071    0.0245738436636              1        51
    C(Weight, Sum)                    12.432445  3.99943118767e-05              2        51
    C(Duration, Sum):C(Weight, Sum)    0.176002      0.83912310946              2        51

    >>> res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)",
                                       data).fit(cov_type='HC0')
    >>> wt = res_poi.wald_test_terms(skip_single=False,
                                     combine_terms=['Duration', 'Weight'])
    >>> print(wt)
                                chi2             P>chi2  df constraint
    Intercept              15.695625  7.43960374424e-05              1
    C(Weight)              16.132616  0.000313940174705              2
    C(Duration)             1.009147     0.315107378931              1
    C(Weight):C(Duration)   0.216694     0.897315972824              2
    Duration               11.187849     0.010752286833              3
    Weight                 30.263368  4.32586407145e-06              4
    """
    # lazy import
    from collections import defaultdict

    result = self
    # list() keeps the concatenation below valid for any sequence of pairs
    # and avoids aliasing the caller's object. The former
    # "extra_constraints is None" guard after this point could never fire
    # and has been dropped.
    if extra_constraints is None:
        extra_constraints = []
    else:
        extra_constraints = list(extra_constraints)
    if combine_terms is None:
        combine_terms = []
    design_info = getattr(result.model.data.orig_exog, 'design_info', None)

    identity = np.eye(len(result.params))
    if design_info is not None:
        named_rows = [(term.name(), identity[design_info.slice(term)])
                      for term in design_info.terms]
    else:
        # No formula info: one term per exog/params name. Slice instead of
        # index so each constraint stays 2-D (1 x k); otherwise shape[0]
        # would report k and not 1 as df_constraint.
        named_rows = [(name, identity[col:col + 1])
                      for col, name in enumerate(result.model.exog_names)]

    constraints = []
    combined = defaultdict(list)
    for name, constraint_matrix in named_rows:
        # collect rows for the joint tests before deciding whether the
        # individual term is reported
        for cname in combine_terms:
            if cname in name:
                combined[cname].append(constraint_matrix)

        if skip_single and constraint_matrix.shape[0] == 1:
            continue
        constraints.append((name, constraint_matrix))

    combined_constraints = []
    for cname in combine_terms:
        if not combined.get(cname):
            # np.vstack([]) would fail with a message that does not name
            # the offending entry
            raise ValueError('combine_terms entry %r does not match any '
                             'term' % (cname,))
        combined_constraints.append((cname, np.vstack(combined[cname])))

    use_t = result.use_t
    distribution = 'F' if use_t else 'chi2'

    all_constraints = constraints + combined_constraints + extra_constraints
    res_wald = []
    index = []
    for name, constraint in all_constraints:
        wt = result.wald_test(constraint)
        row = [wt.statistic.item(), wt.pvalue, constraint.shape[0]]
        if use_t:
            row.append(wt.df_denom)
        res_wald.append(row)
        index.append(name)

    # distribution neutral names
    col_names = ['statistic', 'pvalue', 'df_constraint']
    if use_t:
        col_names.append('df_denom')
    # TODO: maybe move DataFrame creation to results class
    from pandas import DataFrame
    table = DataFrame(res_wald, index=index, columns=col_names)
    res = WaldTestResults(None, distribution, None, table=table)
    # TODO: remove temp again, added for testing
    res.temp = all_constraints
    return res
fitted parameters. + Parameters + ---------- + alpha : float, optional + The significance level for the confidence interval. + ie., The default `alpha` = .05 returns a 95% confidence interval. + cols : array-like, optional + `cols` specifies which confidence intervals to return + method : string + Not Implemented Yet + Method to estimate the confidence_interval. + "Default" : uses self.bse which is based on inverse Hessian for MLE + "hjjh" : + "jac" : + "boot-bse" + "boot_quant" + "profile" + Returns + -------- + conf_int : array + Each row contains [lower, upper] limits of the confidence interval + for the corresponding parameter. The first column contains all + lower, the second column contains all upper limits. + Examples + -------- + >>> import statsmodels.api as sm + >>> data = sm.datasets.longley.load() + >>> data.exog = sm.add_constant(data.exog) + >>> results = sm.OLS(data.endog, data.exog).fit() + >>> results.conf_int() + array([[-5496529.48322745, -1467987.78596704], + [ -177.02903529, 207.15277984], + [ -0.1115811 , 0.03994274], + [ -3.12506664, -0.91539297], + [ -1.5179487 , -0.54850503], + [ -0.56251721, 0.460309 ], + [ 798.7875153 , 2859.51541392]]) + >>> results.conf_int(cols=(2,3)) + array([[-0.1115811 , 0.03994274], + [-3.12506664, -0.91539297]]) + Notes + ----- + The confidence interval is based on the standard normal distribution. + Models wish to use a different distribution should overwrite this + method. 
def remove_data(self):
    '''remove data arrays, all nobs arrays from result and model

    This reduces the size of the instance, so it can be pickled with less
    memory. Currently tested for use with predict from an unpickled
    results and model instance.

    .. warning:: Since data and some intermediate results have been removed
       calculating new statistics that require them will raise exceptions.
       The exception will occur the first time an attribute is accessed
       that has been set to None.

    Not fully tested for time series models, tsa, and might delete too much
    for prediction or not all that would be possible.

    The list of arrays to delete is maintained as an attribute of the
    result and model instance (``_data_attr``), except for cached values,
    which are listed in the optional ``data_in_cache`` attribute. These
    lists could be changed before calling remove_data; this method does
    not modify them.

    Returns
    -------
    None
        Attributes and cache entries are set to None in place.
    '''
    # reduce is not a builtin on Python 3; functools has it on both 2 and 3
    from functools import reduce

    def wipe(obj, att):
        # walk the dotted path up to the object owning the last attribute
        path = att.split('.')
        leaf = path.pop(-1)
        try:
            owner = reduce(getattr, [obj] + path)
            if hasattr(owner, leaf):
                setattr(owner, leaf, None)
        except AttributeError:
            # best effort: a missing or read-only attribute is skipped
            pass

    model_attr = ['model.' + i for i in self.model._data_attr]
    for att in self._data_attr + model_attr:
        wipe(self, att)

    # Copy before extending: "+=" on the attribute itself would grow
    # self.data_in_cache on every call (and for every instance if it is
    # defined on the class).
    data_in_cache = list(getattr(self, 'data_in_cache', []))
    data_in_cache += ['fittedvalues', 'resid', 'wresid']
    for key in data_in_cache:
        try:
            self._cache[key] = None
        except (AttributeError, KeyError):
            pass
def _setlink(self, link):
    """
    Helper method to set the link for a family.

    Parameters
    ----------
    link : L.Link instance
        The link to store on the family. It must be an instance of one of
        the classes listed in the family's ``links`` attribute.

    Raises
    ------
    TypeError
        If `link` is not an ``L.Link`` instance.
    ValueError
        If `link` is not an instance of any class in ``self.links``
        (including the case where ``self.links`` is empty). Note that
        the error message might not be that informative because it tells
        you that the link should be in the base class for the link
        function.

    Notes
    -----
    The link is stored only after it passed validation, so a rejected
    link never ends up on the instance.

    See glm.GLM for a list of appropriate links for each family but note
    that not all of these are currently available.
    """
    # TODO: change the links class attribute in the families to hold
    # meaningful information instead of a list of links instances
    if not isinstance(link, L.Link):
        raise TypeError("The input should be a valid Link object.")
    if hasattr(self, "links"):
        # any() is simply False for an empty list, where max([]) raised an
        # unrelated "empty sequence" ValueError
        validlink = any(isinstance(link, allowed) for allowed in self.links)
        if not validlink:
            errmsg = "Invalid link for family, should be in %s. (got %s)"
            raise ValueError(errmsg % (repr(self.links), link))
    self._link = link
def weights(self, mu):
    r"""
    Compute the weights used in an IRLS step.

    Parameters
    ----------
    mu : array-like
        The transformed mean response variable in the exponential family

    Returns
    -------
    w : array
        Reciprocal of the squared link derivative times the variance
        function, both evaluated at `mu`.

    Notes
    -----
    .. math::

       w = 1 / (g'(\mu)^2 * Var(\mu))
    """
    link_slope = self.link.deriv(mu)
    spread = self.variance(mu)
    return 1. / (link_slope**2 * spread)
def fitted(self, lin_pred):
    """
    Map linear predictors to fitted mean values.

    Parameters
    -----------
    lin_pred : array
        Values of the linear predictor of the model.
        dot(X,beta) in a classical linear model.

    Returns
    --------
    mu : array
        The mean response variables given by the inverse of the link
        function applied to `lin_pred`.
    """
    return self.link.inverse(lin_pred)
def resid_dev(self, endog, mu, scale=1.):
    r"""Poisson deviance residual

    Parameters
    ----------
    endog : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable
    scale : float, optional
        An optional argument to divide the residuals by scale. The default
        is 1.

    Returns
    -------
    resid_dev : array
        Deviance residuals as defined below

    Notes
    -----
    .. math::

       resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 *
                      (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale

    The ratio ``endog / mu`` is passed through ``self._clean`` before the
    logarithm is taken. The term under the square root is non-negative
    mathematically, but rounding can make it slightly negative when
    ``endog`` is very close to ``mu``; it is clamped at zero so that such
    observations yield 0 instead of NaN.
    """
    endog_mu = self._clean(endog / mu)
    radicand = 2 * (endog * np.log(endog_mu) - (endog - mu))
    # np.maximum propagates NaN inputs, so only rounding noise is removed
    radicand = np.maximum(radicand, 0.)
    return np.sign(endog - mu) * np.sqrt(radicand) / scale
def resid_anscombe(self, endog, mu):
    r"""
    Anscombe residuals for the Poisson exponential family distribution

    Parameters
    ----------
    endog : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable

    Returns
    -------
    resid_anscombe : array
        The Anscombe residuals for the Poisson family defined below

    Notes
    -----
    .. math::

       resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6}
    """
    power_gap = endog**(2/3.) - mu**(2 / 3.)
    denominator = mu**(1 / 6.)
    # same evaluation order as the formula: scale the gap, then divide
    return (3 / 2.) * power_gap / denominator
+ """ + return np.clip(x, FLOAT_EPS, np.inf) + + def resid_dev(self, endog, mu, scale=1.): + r"""Poisson deviance residual + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 * + (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale + """ + endog_mu = self._clean(endog / mu) + return (np.sign(endog - mu) * + np.sqrt(2 * (endog * np.log(endog_mu) - (endog - mu))) / scale) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r''' + Poisson deviance function + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + The deviance function at (endog,mu,freq_weights,scale) as defined + below. + + Notes + ----- + If a constant term is included it is defined as + + .. math:: + + D = 2 * \sum_i (freq\_weights_i * Y_i * \log(Y_i / \mu_i))/ scale + ''' + endog_mu = self._clean(endog / mu) + return 2 * np.sum(endog * freq_weights * np.log(endog_mu)) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. 
+ + Returns NaN for QuasiPoisson + + Returns + ------- + nan : float, not applicable for QuasiPoisson + """ + return np.nan + + def resid_anscombe(self, endog, mu): + r""" + Anscombe residuals for the Poisson exponential family distribution + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the Poisson family defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6} + """ + return (3 / 2.) * (endog**(2/3.) - mu**(2 / 3.)) / mu**(1 / 6.) + +class Gaussian(Family): + """ + Gaussian exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Gaussian family is the identity link. + Available links are log, identity, and inverse. + See statsmodels.family.links for more information. + + Attributes + ---------- + Gaussian.link : a link instance + The link function of the Gaussian instance + Gaussian.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.constant + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.inverse_power] + variance = V.constant + safe_links = links + + def __init__(self, link=L.identity): + self.variance = Gaussian.variance + self.link = link() + + def resid_dev(self, endog, mu, scale=1.): + r""" + Gaussian deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + -------- + ..
math:: + + resid\_dev_i = (Y_i - \mu_i) / \sqrt{Var(\mu_i)} / scale + """ + + return (endog - mu) / np.sqrt(self.variance(mu)) / scale + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Gaussian deviance function + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + The deviance function at (endog,mu,freq_weights,scale) + as defined below. + + Notes + -------- + .. math:: + + D = \sum_i freq\_weights_i * (Y_i - \mu_i)^2 / scale + """ + return np.sum((freq_weights * (endog - mu)**2)) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + Scales the loglikelihood function. The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + If the link is the identity link function then the + loglikelihood function is the same as the classical OLS model. + + .. math:: + + llf = -nobs / 2 * (\log(SSR) + (1 + \log(2 \pi / nobs))) + + where + + .. math:: + SSR = \sum_i (Y_i - g^{-1}(\mu_i))^2 + + If the links is not the identity link then the loglikelihood + function is defined as + + .. 
math:: + + llf = \sum_i freq\_weights_i * ((Y_i * \mu_i - \mu_i^2 / 2) / scale- + Y^2 / (2 * scale) - (1/2) * \log(2 * \pi * scale)) + """ + if isinstance(self.link, L.Power) and self.link.power == 1: + # This is just the loglikelihood for classical OLS + nobs2 = endog.shape[0] / 2. + SSR = np.sum((endog-self.fitted(mu))**2, axis=0) + llf = -np.log(SSR) * nobs2 + llf -= (1+np.log(np.pi/nobs2))*nobs2 + return llf + else: + return np.sum(freq_weights * ((endog * mu - mu**2/2)/scale - + endog**2/(2 * scale) - .5*np.log(2 * np.pi * scale))) + + def resid_anscombe(self, endog, mu): + r""" + The Anscombe residuals for the Gaussian exponential family distribution + + Parameters + ---------- + endog : array + Endogenous response variable + mu : array + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the Gaussian family defined below + + Notes + -------- + .. math:: + + resid\_anscombe_i = Y_i - \mu_i + """ + return endog - mu + + +class Gamma(Family): + """ + Gamma exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Gamma family is the inverse link. + Available links are log, identity, and inverse. + See statsmodels.family.links for more information. 
+ + Attributes + ---------- + Gamma.link : a link instance + The link function of the Gamma instance + Gamma.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.mu_squared + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.inverse_power] + variance = V.mu_squared + safe_links = [L.Log, ] + + def __init__(self, link=L.inverse_power): + self.variance = Gamma.variance + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, FLOAT_EPS, np.inf) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Gamma deviance function + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + .. math:: + + D = 2 * \sum_i freq\_weights_i * ((Y_i - \mu_i)/\mu_i - \log(Y_i / + \mu_i)) + """ + endog_mu = self._clean(endog/mu) + return 2*np.sum(freq_weights*((endog-mu)/mu-np.log(endog_mu))) + + def resid_dev(self, endog, mu, scale=1.): + r""" + Gamma deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. 
math:: + + resid\_dev_i = sign(Y_i - \mu_i) \sqrt{-2 * + (-(Y_i - \mu_i) / \mu_i + \log(Y_i / \mu_i))} + """ + endog_mu = self._clean(endog / mu) + return np.sign(endog - mu) * np.sqrt(-2 * (-(endog - mu)/mu + + np.log(endog_mu))) + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + -------- + .. math:: + + llf = -1 / scale * \sum_i *(Y_i / \mu_i+ \log(\mu_i)+ + (scale -1) * \log(Y) + \log(scale) + scale * + \ln \Gamma(1 / scale)) + """ + return - 1./scale * np.sum((endog/mu + np.log(mu) + (scale - 1) * + np.log(endog) + np.log(scale) + scale * + special.gammaln(1./scale)) * freq_weights) + + # in Stata scale is set to equal 1 for reporting llf + # in R it's the dispersion, though there is a loss of precision vs. + # our results due to an assumed difference in implementation + + def resid_anscombe(self, endog, mu): + r""" + The Anscombe residuals for Gamma exponential family distribution + + Parameters + ---------- + endog : array + Endogenous response variable + mu : array + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the Gamma family defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = 3 * (Y_i^{1/3} - \mu_i^{1/3}) / \mu_i^{1/3} + """ + return 3 * (endog**(1/3.) - mu**(1/3.)) / mu**(1/3.) + + +class Binomial(Family): + """ + Binomial exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Binomial family is the logit link. 
+ Available links are logit, probit, cauchy, log, and cloglog. + See statsmodels.family.links for more information. + + Attributes + ---------- + Binomial.link : a link instance + The link function of the Binomial instance + Binomial.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.binary + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + endog for Binomial can be specified in one of three ways. + + """ + + links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog, L.identity] + variance = V.binary # this is not used below in an effort to include n + + # Other safe links, e.g. cloglog and probit are subclasses + safe_links = [L.Logit, L.CDFLink] + + def __init__(self, link=L.logit): # , n=1.): + # TODO: it *should* work for a constant n>1 actually, if freq_weights + # is equal to n + self.n = 1 + # overwritten by initialize if needed but always used to initialize + # variance since endog is assumed/forced to be (0,1) + self.variance = V.Binomial(n=self.n) + self.link = link() + + def starting_mu(self, y): + """ + The starting values for the IRLS algorithm for the Binomial family. + A good choice for the binomial family is :math:`\mu_0 = (Y_i + 0.5)/2` + """ + return (y + .5)/2 + + def initialize(self, endog, freq_weights): + ''' + Initialize the response variable. + + Parameters + ---------- + endog : array + Endogenous response variable + + Returns + -------- + If `endog` is binary, returns `endog` + + If `endog` is a 2d array, then the input is assumed to be in the format + (successes, failures) and + successes/(success + failures) is returned. And n is set to + successes + failures. 
+ ''' + # if not np.all(np.asarray(freq_weights) == 1): + # self.variance = V.Binomial(n=freq_weights) + if (endog.ndim > 1 and endog.shape[1] > 1): + y = endog[:, 0] + # overwrite self.freq_weights for deviance below + self.n = endog.sum(1) + return y*1./self.n, self.n + else: + return endog, np.ones(endog.shape[0]) + + def deviance(self, endog, mu, freq_weights=1, scale=1., axis=None): + r''' + Deviance function for either Bernoulli or Binomial data. + + Parameters + ---------- + endog : array-like + Endogenous response variable (already transformed to a probability + if appropriate). + mu : array + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + -------- + deviance : float + The deviance function as defined below + + Notes + ----- + If the endogenous variable is binary: + + .. math:: + + D = -2 * \sum_i freq\_weights * (I_{1,i} * \log(\mu_i) + I_{0,i} * + \log(1 - \mu_i)) + + where :math:`I_{1,i}` is an indicator function that evaluates to 1 if + :math:`Y_i = 1`. and :math:`I_{0,i}` is an indicator function that + evaluates to 1 if :math:`Y_i = 0`. + + If the model is binomial: + + .. math:: + + D = 2 * \sum_i freq\_weights * (\log(Y_i / \mu_i) + (n_i - Y_i) * + \log((n_i - Y_i) / n_i - \mu_i)) + + where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize.
+ ''' + if np.shape(self.n) == () and self.n == 1: + one = np.equal(endog, 1) + return -2 * np.sum((one * np.log(mu + 1e-200) + (1-one) * + np.log(1 - mu + 1e-200)) * freq_weights, axis=axis) + + else: + return 2 * np.sum(self.n * freq_weights * + (endog * np.log(endog/mu + 1e-200) + + (1 - endog) * np.log((1 - endog) / + (1 - mu) + 1e-200)), axis=axis) + + def resid_dev(self, endog, mu, scale=1.): + r""" + Binomial deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + If the endogenous variable is binary: + + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{-2 * + \log(I_{1,i} * \mu_i + I_{0,i} * (1 - \mu_i))} + + where :math:`I_{1,i}` is an indicator function that evaluates to 1 if + :math:`Y_i = 1`. and :math:`I_{0,i}` is an indicator function that + evaluates to 1 if :math:`Y_i = 0`. + + If the endogenous variable is binomial: + + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) \sqrt{2 * n_i * + (Y_i * \log(Y_i / \mu_i) + (1 - Y_i) * + \log((1 - Y_i)/(1 - \mu_i)))} + + where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize. + """ + + mu = self.link._clean(mu) + if np.shape(self.n) == () and self.n == 1: + one = np.equal(endog, 1) + return np.sign(endog-mu)*np.sqrt(-2 * + np.log(one * mu + (1 - one) * + (1 - mu)))/scale + else: + return (np.sign(endog - mu) * + np.sqrt(2 * self.n * + (endog * np.log(endog/mu + 1e-200) + + (1 - endog) * np.log((1 - endog)/(1 - mu) + 1e-200)))/scale) + + def loglike(self, endog, mu, freq_weights=1, scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response.
+ + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + Not used for the Binomial GLM. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + -------- + If the endogenous variable is binary: + + .. math:: + + llf = scale * \sum_i (y_i * \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)) * + freq\_weights_i + + If the endogenous variable is binomial: + + .. math:: + + llf = scale * \sum_i freq\_weights_i * (\ln \Gamma(n+1) - + \ln \Gamma(y_i + 1) - \ln \Gamma(n_i - y_i +1) + y_i * + \log(\mu_i / (1 - \mu_i)) + n * \log(1 - \mu_i)) + + where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as + defined in Binomial initialize. This simply makes :math:`y_i` the + original number of successes. + """ + + if np.shape(self.n) == () and self.n == 1: + return scale * np.sum((endog * np.log(mu/(1 - mu) + 1e-200) + + np.log(1 - mu)) * freq_weights) + else: + y = endog * self.n # convert back to successes + return scale * np.sum((special.gammaln(self.n + 1) - + special.gammaln(y + 1) - + special.gammaln(self.n - y + 1) + y * + np.log(mu/(1 - mu)) + self.n * + np.log(1 - mu)) * freq_weights) + + def resid_anscombe(self, endog, mu): + ''' + The Anscombe residuals + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals as defined below. + + Notes + ----- + sqrt(n)*(cox_snell(endog)-cox_snell(mu))/(mu**(1/6.)*(1-mu)**(1/6.)) + + where cox_snell is defined as + cox_snell(x) = betainc(2/3., 2/3., x)*betainc(2/3.,2/3.) 
+ where betainc is the incomplete beta function + + The name 'cox_snell' is idiosyncratic and is simply used for + convenience following the approach suggested in Cox and Snell (1968). + Further note that + cox_snell(x) = x**(2/3.)/(2/3.)*hyp2f1(2/3.,1/3.,5/3.,x) + where hyp2f1 is the hypergeometric 2f1 function. The Anscombe + residuals are sometimes defined in the literature using the + hyp2f1 formulation. Both betainc and hyp2f1 can be found in scipy. + + References + ---------- + Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's + paper." Journal of the Royal Statistical Society B. 15, 229-30. + + Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals." + Journal of the Royal Statistical Society B. 30, 248-75. + + ''' + cox_snell = lambda x: (special.betainc(2/3., 2/3., x) + * special.beta(2/3., 2/3.)) + return np.sqrt(self.n) * ((cox_snell(endog) - cox_snell(mu)) / + (mu**(1/6.) * (1 - mu)**(1/6.))) + + +class InverseGaussian(Family): + """ + InverseGaussian exponential family. + + Parameters + ---------- + link : a link instance, optional + The default link for the inverse Gaussian family is the + inverse squared link. + Available links are inverse_squared, inverse, log, and identity. + See statsmodels.family.links for more information. + + Attributes + ---------- + InverseGaussian.link : a link instance + The link function of the inverse Gaussian instance + InverseGaussian.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.mu_cubed + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + The inverse Gaussian distribution is sometimes referred to in the + literature as the Wald distribution.
+ + """ + + links = [L.inverse_squared, L.inverse_power, L.identity, L.log] + variance = V.mu_cubed + safe_links = [L.inverse_squared, L.Log, ] + + def __init__(self, link=L.inverse_squared): + self.variance = InverseGaussian.variance + self.link = link() + + def resid_dev(self, endog, mu, scale=1.): + r""" + Returns the deviance residuals for the inverse Gaussian family. + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * + \sqrt {(Y_i - \mu_i)^2 / (Y_i * \mu_i^2)} / scale + """ + return np.sign(endog-mu) * np.sqrt((endog-mu)**2/(endog*mu**2))/scale + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Inverse Gaussian deviance function + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + .. math:: + + D = \sum_i freq\_weights_i * ((Y_i - \mu_i)^2 / (Y_i *\mu_i^2)) / + scale + """ + return np.sum(freq_weights*(endog-mu)**2/(endog*mu**2))/scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. 
+ scale : float, optional + The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + .. math:: + + llf = -1/2 * \sum_i freq\_weights_i * ((Y_i - \mu_i)^2 / (Y_i * + \mu_i^2 * scale) + \log(scale * Y_i^3) + \log(2 * \pi)) + """ + return -.5 * np.sum(((endog - mu)**2/(endog * mu**2 * scale) + + np.log(scale * endog**3) + np.log(2 * np.pi)) * + freq_weights) + + def resid_anscombe(self, endog, mu): + r""" + The Anscombe residuals for the inverse Gaussian distribution + + Parameters + ---------- + endog : array + Endogenous response variable + mu : array + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the inverse Gaussian distribution as + defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = \log(Y_i / \mu_i) / \sqrt{\mu_i} + """ + return np.log(endog / mu) / np.sqrt(mu) + + +class NegativeBinomial(Family): + """ + Negative Binomial exponential family. + + Parameters + ---------- + link : a link instance, optional + The default link for the negative binomial family is the log link. + Available links are log, cloglog, identity, nbinom and power. + See statsmodels.family.links for more information. + alpha : float, optional + The ancillary parameter for the negative binomial distribution. + For now `alpha` is assumed to be nonstochastic. The default value + is 1. Permissible values are usually assumed to be between .01 and 2. + + + Attributes + ---------- + NegativeBinomial.link : a link instance + The link function of the negative binomial instance + NegativeBinomial.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.nbinom + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + Power link functions are not yet supported.
+ + """ + links = [L.log, L.cloglog, L.identity, L.nbinom, L.Power] + # TODO: add the ability to use the power links with an if test + # similar to below + variance = V.nbinom + safe_links = [L.Log, ] + + def __init__(self, link=L.log, alpha=1.): + self.alpha = 1. * alpha # make it at least float + self.variance = V.NegativeBinomial(alpha=self.alpha) + if isinstance(link, L.NegativeBinomial): + self.link = link(alpha=self.alpha) + else: + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, FLOAT_EPS, np.inf) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Returns the value of the deviance function. + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. 
+ + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + :math:`D = \sum_i piecewise_i` where :math:`piecewise_i` is defined as: + + If :math:`Y_{i} = 0`: + + :math:`piecewise_i = 2* \log(1 + \alpha * \mu_i) / \alpha` + + If :math:`Y_{i} > 0`: + + :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) * + (1 + \alpha * Y_i) * \ln(1 + \alpha * Y_i) / (1 + \alpha * \mu_i)` + """ + iszero = np.equal(endog, 0) + notzero = 1 - iszero + endog_mu = self._clean(endog/mu) + tmp = iszero * 2 * np.log(1 + self.alpha * mu) / self.alpha + tmp += notzero * (2 * endog * np.log(endog_mu) - 2 / self.alpha * + (1 + self.alpha * endog) * + np.log((1 + self.alpha * endog) / + (1 + self.alpha * mu))) + return np.sum(freq_weights * tmp) / scale + + def resid_dev(self, endog, mu, scale=1.): + r""" + Negative Binomial Deviance Residual + + Parameters + ---------- + endog : array-like + `endog` is the response variable + mu : array-like + `mu` is the fitted value of the model + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. 
+ + Returns + -------- + resid_dev : array + The array of deviance residuals + + Notes + ----- + :math:`resid\_dev_i = sign(Y_i-\mu_i) * \sqrt{piecewise_i}` + + where :math:`piecewise_i` is defined as + + If :math:`Y_i = 0`: + + :math:`piecewise_i = 2 * \log(1 + \alpha * \mu_i)/ \alpha` + + If :math:`Y_i > 0`: + + :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) * + (1 + \alpha * Y_i) * \log((1 + \alpha * Y_i) / (1 + \alpha * \mu_i))` + """ + iszero = np.equal(endog, 0) + notzero = 1 - iszero + endog_mu = self._clean(endog / mu) + tmp = iszero * 2 * np.log(1 + self.alpha * mu) / self.alpha + tmp += notzero * (2 * endog * np.log(endog_mu) - 2 / self.alpha * + (1 + self.alpha * endog) * + np.log((1 + self.alpha * endog) / + (1 + self.alpha * mu))) + return np.sign(endog - mu) * np.sqrt(tmp) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + The fitted mean response values + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float + The scale parameter. The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + Defined as: + + .. math:: + + llf = \sum_i freq\_weights_i * (Y_i * \log{(\alpha * e^{\eta_i} / + (1 + \alpha * e^{\eta_i}))} - \log{(1 + \alpha * e^{\eta_i})}/ + \alpha + Constant) + + where :math:`Constant` is defined as: + + .. 
math:: + + Constant = \ln \Gamma{(Y_i + 1/ \alpha )} - \ln \Gamma(Y_i + 1) - + \ln \Gamma{(1/ \alpha )} + """ + lin_pred = self._link(mu) + constant = (special.gammaln(endog + 1 / self.alpha) - + special.gammaln(endog+1)-special.gammaln(1/self.alpha)) + exp_lin_pred = np.exp(lin_pred) + return np.sum((endog * np.log(self.alpha * exp_lin_pred / + (1 + self.alpha * exp_lin_pred)) - + np.log(1 + self.alpha * exp_lin_pred) / + self.alpha + constant) * freq_weights) + + def resid_anscombe(self, endog, mu): + """ + The Anscombe residuals for the negative binomial family + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals as defined below. + + Notes + ----- + `resid_anscombe` = (hyp2f1(-alpha*endog)-hyp2f1(-alpha*mu)+\ + 1.5*(endog**(2/3.)-mu**(2/3.)))/(mu+alpha*mu**2)**(1/6.) + + where hyp2f1 is the hypergeometric 2f1 function parameterized as + hyp2f1(x) = hyp2f1(2/3.,1/3.,5/3.,x) + """ + + hyp2f1 = lambda x : special.hyp2f1(2 / 3., 1 / 3., 5 / 3., x) + return ((hyp2f1(-self.alpha * endog) - hyp2f1(-self.alpha * mu) + + 1.5 * ( endog**(2 / 3.) - mu**(2 / 3.))) / + (mu + self.alpha * mu**2)**(1 / 6.)) + + +class Tweedie(Family): + """ + Tweedie family. + + Parameters + ---------- + link : a link instance, optional + The default link for the Tweedie family is the log link when the + link_power is 0. Otherwise, the power link is default. + Available links are log and Power. + var_power : float, optional + The variance power. + link_power : float, optional + The link power. + + Attributes + ---------- + Tweedie.link : a link instance + The link function of the Tweedie instance + Tweedie.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.Power + Tweedie.link_power : float + The power of the link function, or 0 if its a log link. 
+ Tweedie.var_power : float + The power of the variance function. + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + Loglikelihood function not implemented because of the complexity of + calculating an infinite series of summations. The variance power can be + estimated using the `estimate_tweedie_power` function that is part of the + `GLM` class. + """ + links = [L.log, L.Power] + variance = V.Power + safe_links = [L.log, L.Power] + + def __init__(self, link=None, var_power=1., link_power=0): + self.var_power = var_power + self.link_power = link_power + self.variance = V.Power(power=var_power * 1.) + if link_power != 0 and not ((link is L.Power) or (link is None)): + msg = 'link_power of {} not supported specified link' + msg = msg.format(link_power) + raise ValueError(msg) + if (link_power == 0) and ((link is None) or (link is L.Log)): + self.link = L.log() + elif link_power != 0: + self.link = L.Power(power=link_power * 1.) + else: + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that it is in [0,inf) + + Notes + ----- + The need for this function was discovered through usage and it's + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, 0, np.inf) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Returns the value of the deviance function. + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + When :math:`p = 1`, + + .. math:: + + resid\_dev_i = \mu + + when :math:`endog = 0` and + + .. math:: + + resid\_dev_i = endog * \log(endog / \mu) + (\mu - endog) + + otherwise.
def resid_dev(self, endog, mu, scale=1.):
    r"""
    Signed deviance residuals for the Tweedie family.

    Parameters
    ----------
    endog : array-like
        Response variable.
    mu : array-like
        Fitted mean values.
    scale : float, optional
        Accepted for interface compatibility; this implementation does
        not use it. The default is 1.

    Returns
    -------
    resid_dev : array
        :math:`sign(endog - \mu) * \sqrt{2 * d_i}` where :math:`d_i` is
        the unit deviance defined below.

    Notes
    -----
    With :math:`p` the variance power (``self.var_power``):

    * :math:`p = 1`: :math:`d_i = \mu` where :math:`endog = 0`, otherwise
      :math:`d_i = endog * \log(endog / \mu) + (\mu - endog)`.
    * :math:`p = 2`: :math:`d_i = (endog - \mu) / \mu - \log(endog / \mu)`,
      with `endog` clipped below at ``FLOAT_EPS`` inside the logarithm.
    * otherwise:
      :math:`d_i = endog^{2 - p} / ((1 - p)(2 - p)) -
      endog * \mu^{1 - p} / (1 - p) + \mu^{2 - p} / (2 - p)`.
    """
    power = self.var_power
    if power == 2:
        # Clip only inside the log so a zero response stays finite.
        safe_endog = np.clip(endog, FLOAT_EPS, np.inf)
        unit_dev = ((endog - mu) / mu) - np.log(safe_endog / mu)
    elif power == 1:
        nonzero_case = endog * np.log(endog / mu) + (mu - endog)
        unit_dev = np.where(endog == 0, mu, nonzero_case)
    else:
        lead = endog ** (2 - power) / ((1 - power) * (2 - power))
        cross = endog * mu ** (1 - power) / (1 - power)
        tail = mu ** (2 - power) / (2 - power)
        unit_dev = lead - cross + tail
    return np.sign(endog - mu) * np.sqrt(2 * unit_dev)
class GLM(RegressionPropsY):
    """
    Generalised linear models. Can currently estimate Gaussian, Poisson and
    Logistic regression coefficients. The GLM object prepares model input;
    the `fit` method performs estimation and returns a GLMResults object.

    Parameters
    ----------
    y             : array
                    n*1, dependent variable.
    X             : array
                    n*k, independent variables, excluding the constant.
    family        : family object
                    Distribution family instance, e.g. family.Gaussian(),
                    family.Poisson() or family.Binomial(). The default
                    Gaussian instance is created once, when the class is
                    defined, and shared by every GLM that relies on it.
    constant      : boolean
                    If True (default) X is passed through
                    USER.check_constant before being stored; if False X is
                    stored unchanged.

    Attributes
    ----------
    y             : array
                    n*1, dependent variable.
    X             : array
                    n*k, independent variables as stored (see `constant`).
    family        : family object
                    Distribution family used for estimation.
    n             : integer
                    Number of observations, as returned by
                    USER.check_arrays.
    k             : integer
                    Number of columns of the stored X.
    df_model      : float
                    k-1, where k is the number of columns of the stored X.
    df_resid      : float
                    n - df_model - 1.
    fit_params    : dict
                    Parameters passed into the fit method to define the
                    estimation routine, plus 'n_iter' after fitting.
    """
    def __init__(self, y, X, family=family.Gaussian(), constant=True):
        """
        Validate the inputs and store them on the model.
        """
        self.n = USER.check_arrays(y, X)
        USER.check_y(y, self.n)
        self.y = y
        if constant:
            self.X = USER.check_constant(X)
        else:
            self.X = X
        self.family = family
        self.k = self.X.shape[1]
        self.fit_params = {}

    def fit(self, ini_betas=None, tol=1.0e-6, max_iter=200, solve='iwls'):
        """
        Method that fits a model with a particular estimation routine.

        Parameters
        ----------
        ini_betas     : array
                        k*1, initial coefficient values, including constant.
                        Default is None, which calculates initial values
                        during estimation.
        tol           : float
                        Tolerance for estimation convergence.
        max_iter      : integer
                        Maximum number of iterations if convergence is not
                        achieved.
        solve         : string
                        Technique to solve MLE equations.
                        'iwls' = iteratively (re)weighted least squares
                        (default, and the only technique implemented).

        Returns
        -------
        GLMResults
            Results object built from the estimated coefficients, the
            fitted values and the final weighted design matrix.

        Raises
        ------
        ValueError
            If `solve` names anything other than 'iwls'.
        """
        # Reject unknown solvers before touching any state; previously this
        # fell through to the return statement and failed on an unbound
        # local name.
        if solve.lower() != 'iwls':
            raise ValueError(
                "Unsupported solve method {!r}; only 'iwls' is "
                "available".format(solve))

        self.fit_params['ini_betas'] = ini_betas
        self.fit_params['tol'] = tol
        self.fit_params['max_iter'] = max_iter
        self.fit_params['solve'] = solve

        params, predy, w, n_iter = iwls(self.y, self.X, self.family,
                                        ini_betas=ini_betas, tol=tol,
                                        max_iter=max_iter)
        self.fit_params['n_iter'] = n_iter
        return GLMResults(self, params.flatten(), predy, w)

    @cache_readonly
    def df_model(self):
        """Model degrees of freedom: columns of X minus one."""
        return self.X.shape[1] - 1

    @cache_readonly
    def df_resid(self):
        """Residual degrees of freedom: n - df_model - 1."""
        return self.n - self.df_model - 1
+ x : array + n*k, independent variable, including constant. + family : string + Model type: 'Gaussian', 'Poisson', 'Logistic' + n : integer + Number of observations + k : integer + Number of independent variables + df_model : float + k-1, where k is the number of variables (including + intercept) + df_residual : float + observations minus variables (n-k) + fit_params : dict + parameters passed into fit method to define estimation + routine. + scale : float + sigma squared used for subsequent computations. + params : array + n*k, estimared beta coefficients + w : array + n*1, final weight values of x + mu : array + n*1, predicted value of y (i.e., fittedvalues) + cov_params : array + Variance covariance matrix (kxk) of betas which has been + appropriately scaled by sigma-squared + bse : array + k*1, standard errors of betas + pvalues : array + k*1, two-tailed pvalues of parameters + tvalues : array + k*1, the tvalues of the standard errors + null : array + n*1, predicted values of y for null model + deviance : float + value of the deviance function evalued at params; + see family.py for distribution-specific deviance + null_deviance : float + value of the deviance function for the model fit with + a constant as the only regressor + llf : float + value of the loglikelihood function evalued at params; + see family.py for distribution-specific loglikelihoods + llnull : float + value of log-likelihood function evaluated at null + aic : float + AIC + bic : float + BIC + D2 : float + percent deviance explained + adj_D2 : float + adjusted percent deviance explained + pseudo_R2 : float + McFadden's pseudo R2 (coefficient of determination) + adj_pseudoR2 : float + adjusted McFadden's pseudo R2 + resid_response : array + response residuals; defined as y-mu + resid_pearson : array + Pearson residuals; defined as (y-mu)/sqrt(VAR(mu)) + where VAR is the distribution specific variance + function; see family.py and varfuncs.py for more information. 
@cache_readonly
def null(self):
    """
    Fitted values of the null model.

    The null model regresses `y` on a single column of ones with the same
    family as the fitted model; it is fitted with the default settings of
    `GLM.fit`.

    Returns
    -------
    array
        The `mu` attribute of the fitted null model.
    """
    y_column = np.reshape(self.y, (-1, 1))
    ones = np.ones((len(y_column), 1))
    # constant=False: the design matrix already is the constant column.
    null_model = GLM(y_column, ones, family=self.family, constant=False)
    return null_model.fit().mu
def scale(self):
    """
    Scale (dispersion) estimate used for subsequent computations.

    Returns
    -------
    float
        1.0 for Binomial and Poisson families; otherwise the sum of the
        squared response residuals divided by the family variance at `mu`,
        divided by the residual degrees of freedom.
    """
    unit_scale_families = (family.Binomial, family.Poisson)
    if not isinstance(self.family, unit_scale_families):
        squared_resid = np.power(self.resid_response, 2)
        scaled = squared_resid / self.family.variance(self.mu)
        return scaled.sum() / (self.df_resid)
    return 1.
def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=None):
    """
    Iteratively re-weighted least squares estimation routine

    Parameters
    ----------
    y         : array
                Dependent variable.
    x         : array or sparse matrix
                Design matrix; anything that is not exactly a numpy
                ndarray is handled through the sparse code path.
    family    : family object
                Must provide `starting_mu`, `predict`, `weights`, `fitted`
                and `link.deriv`.
    offset    : float or array
                Used for Poisson families only: `y` is divided by it to
                build the starting linear predictor and the fitted mean is
                multiplied by it on every iteration. Default is 1.0.
    ini_betas : array
                k*1 initial coefficients. They only enter the first
                convergence check. Default is None (a k*1 array of zeros).
    tol       : float
                The loop stops once the convergence measure is not greater
                than this value.
    max_iter  : integer
                Maximum number of iterations.
    wi        : array
                Optional weights passed to `_compute_betas_gwr`; when None
                `_compute_betas` is used instead.

    Returns
    -------
    tuple
        `(betas, mu, wx, n_iter)` when `wi` is None, otherwise
        `(betas, mu, v, w, z, xtx_inv_xt, n_iter)`.

    Raises
    ------
    ValueError
        If no iteration is performed (`max_iter` < 1 or `tol` at least as
        large as the initial difference of 1.0e6); there would be nothing
        to return.
    """
    n_iter = 0
    diff = 1.0e6
    if ini_betas is None:
        # Builtin float: the `np.float` alias was removed from NumPy.
        betas = np.zeros((x.shape[1], 1), float)
    else:
        betas = ini_betas

    is_poisson = isinstance(family, Poisson)
    if isinstance(family, Binomial):
        y = family.link._clean(y)
    if is_poisson:
        y_off = y/offset
        y_off = family.starting_mu(y_off)
        v = family.predict(y_off)
        mu = family.starting_mu(y)
    else:
        mu = family.starting_mu(y)
        v = family.predict(mu)

    # Exact type check kept on purpose: ndarray subclasses take the sparse
    # path, exactly as before.
    dense_input = type(x) is np.ndarray

    while diff > tol and n_iter < max_iter:
        n_iter += 1
        w = family.weights(mu)
        z = v + (family.link.deriv(mu)*(y-mu))
        w = np.sqrt(w)
        if not dense_input:
            w = sp.csr_matrix(w)
            z = sp.csr_matrix(z)
        wx = spmultiply(x, w, array_out=False)
        wz = spmultiply(z, w, array_out=False)
        if wi is None:
            n_betas = _compute_betas(wz, wx)
        else:
            n_betas, xtx_inv_xt = _compute_betas_gwr(wz, wx, wi)
        v = spdot(x, n_betas)
        mu = family.fitted(v)
        if is_poisson:
            mu = mu * offset
        # NOTE(review): convergence is declared as soon as the *smallest*
        # coefficient change drops below tol; confirm min (not max) is
        # intended before changing it, since results depend on it.
        diff = min(abs(n_betas-betas))
        betas = n_betas

    if n_iter == 0:
        # Without at least one pass wx, w, z and xtx_inv_xt do not exist.
        raise ValueError('iwls performed no iterations; max_iter must be '
                         'at least 1 and tol smaller than 1.0e6')

    if wi is None:
        return betas, mu, wx, n_iter
    else:
        return betas, mu, v, w, z, xtx_inv_xt, n_iter
class Link(object):
    """
    A generic link function for one-parameter exponential family.

    `Link` does nothing by itself, but lays out the methods expected of any
    subclass. The three abstract methods raise NotImplementedError so that
    a subclass which forgets to override one fails loudly instead of
    silently handing the exception class back to numerical code.
    """

    def __call__(self, p):
        """
        Return the value of the link function. This is just a placeholder.

        Parameters
        ----------
        p : array-like
            Probabilities

        Returns
        -------
        g(p) : array-like
            The value of the link function g(p) = z

        Raises
        ------
        NotImplementedError
            Always; subclasses must override.
        """
        raise NotImplementedError

    def inverse(self, z):
        """
        Inverse of the link function. Just a placeholder.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor of the transformed variable
            in the IRLS algorithm for GLM.

        Returns
        -------
        g^(-1)(z) : array
            The value of the inverse of the link function g^(-1)(z) = p

        Raises
        ------
        NotImplementedError
            Always; subclasses must override.
        """
        raise NotImplementedError

    def deriv(self, p):
        """
        Derivative of the link function g'(p). Just a placeholder.

        Parameters
        ----------
        p : array-like

        Returns
        -------
        g'(p) : array
            The value of the derivative of the link function g'(p)

        Raises
        ------
        NotImplementedError
            Always; subclasses must override.
        """
        raise NotImplementedError

    def deriv2(self, p):
        """
        Second derivative of the link function g''(p)

        Implemented through numerical differentiation of `deriv`.

        Notes
        -----
        Imports `statsmodels` lazily, so that package is only required when
        this fallback is actually called.
        """
        from statsmodels.tools.numdiff import approx_fprime_cs
        # TODO: workaround problem with numdiff for 1d
        return np.diag(approx_fprime_cs(p, self.deriv))

    def inverse_deriv(self, z):
        """
        Derivative of the inverse link function g^(-1)(z).

        Notes
        -----
        This reference implementation computes 1 / g'(g^(-1)(z)) from the
        subclass's `inverse` and `deriv`; it can be overridden in
        subclasses with a direct formula.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor for a GLM or GEE model.

        Returns
        -------
        g'^(-1)(z) : array
            The value of the derivative of the inverse of the link function
        """
        return 1 / self.deriv(self.inverse(z))
class Power(Link):
    """
    The power transform g(p) = p**power.

    Parameters
    ----------
    power : float
        The exponent of the power transform. Default is 1.

    Notes
    -----
    Aliases of Power:
    inverse = Power(power=-1)
    sqrt = Power(power=.5)
    inverse_squared = Power(power=-2.)
    identity = Power(power=1.)
    """

    def __init__(self, power=1.):
        self.power = power

    def __call__(self, p):
        """
        Power transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        z : array-like
            `p` raised to `power`
        """
        return np.power(p, self.power)

    def inverse(self, z):
        """
        Inverse of the power transform link function.

        Parameters
        ----------
        z : array-like
            Value of the transformed mean parameters

        Returns
        -------
        p : array
            Mean parameters, `z`**(1/`power`)
        """
        return np.power(z, 1. / self.power)

    def deriv(self, p):
        """
        Derivative of the power transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g'(p) : array
            `power` * `p`**(`power` - 1)
        """
        exponent = self.power
        return exponent * np.power(p, exponent - 1)

    def deriv2(self, p):
        """
        Second derivative of the power transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g''(p) : array
            `power` * (`power` - 1) * `p`**(`power` - 2)
        """
        exponent = self.power
        return exponent * (exponent - 1) * np.power(p, exponent - 2)

    def inverse_deriv(self, z):
        """
        Derivative of the inverse of the power transform.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor for a GLM or GEE model.

        Returns
        -------
        g^(-1)'(z) : array
            `z`**((1 - `power`) / `power`) / `power`
        """
        exponent = self.power
        return np.power(z, (1 - exponent)/exponent) / exponent
class Log(Link):
    """
    The log transform g(p) = log(p).

    Notes
    -----
    The link, its derivative and its second derivative first pass their
    argument through the private `_clean` method, which clips it below at
    machine epsilon (``FLOAT_EPS``) so it is strictly positive. `log` is
    an alias of `Log`.
    """

    def _clean(self, x):
        """Clip `x` to the range [FLOAT_EPS, inf)."""
        return np.clip(x, FLOAT_EPS, np.inf)

    def __call__(self, p, **extra):
        """
        Log transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters
        **extra
            Accepted and ignored.

        Returns
        -------
        z : array
            log of the clipped `p`
        """
        return np.log(self._clean(p))

    def inverse(self, z):
        """
        Inverse of the log transform link function.

        Parameters
        ----------
        z : array
            Value of the link function

        Returns
        -------
        p : array
            exp(z)
        """
        return np.exp(z)

    def deriv(self, p):
        """
        Derivative of the log transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g'(p) : array
            1 / p, evaluated on the clipped `p`
        """
        clipped = self._clean(p)
        return 1. / clipped

    def deriv2(self, p):
        """
        Second derivative of the log transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g''(p) : array
            -1 / p^2, evaluated on the clipped `p`
        """
        clipped = self._clean(p)
        return -1. / clipped**2

    def inverse_deriv(self, z):
        """
        Derivative of the inverse of the log transform link function.

        Parameters
        ----------
        z : array
            Value of the link function

        Returns
        -------
        g^(-1)'(z) : array
            exp(z), since the exponential is its own derivative
        """
        return np.exp(z)
+ """ + pass + + +# TODO: the CDFLink is untested +class CDFLink(Logit): + """ + The use the CDF of a scipy.stats distribution + + CDFLink is a subclass of logit in order to use its _clean method + for the link and its derivative. + + Parameters + ---------- + dbn : scipy.stats distribution + Default is dbn=scipy.stats.norm + + Notes + ----- + The CDF link is untested. + """ + + def __init__(self, dbn=scipy.stats.norm): + self.dbn = dbn + + def __call__(self, p): + """ + CDF link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + z : array + (ppf) inverse of CDF transform of p + + Notes + ----- + g(`p`) = `dbn`.ppf(`p`) + """ + p = self._clean(p) + return self.dbn.ppf(p) + + def inverse(self, z): + """ + The inverse of the CDF link + + Parameters + ---------- + z : array-like + The value of the inverse of the link function at `p` + + Returns + ------- + p : array + Mean probabilities. The value of the inverse of CDF link of `z` + + Notes + ----- + g^(-1)(`z`) = `dbn`.cdf(`z`) + """ + return self.dbn.cdf(z) + + def deriv(self, p): + """ + Derivative of CDF link + + Parameters + ---------- + p : array-like + mean parameters + + Returns + ------- + g'(p) : array + The derivative of CDF transform at `p` + + Notes + ----- + g'(`p`) = 1./ `dbn`.pdf(`dbn`.ppf(`p`)) + """ + p = self._clean(p) + return 1. 
class CLogLog(Logit):
    """
    The complementary log-log transform g(p) = log(-log(1 - p)).

    CLogLog inherits from Logit in order to reuse its `_clean` method,
    which clips probabilities to [FLOAT_EPS, 1 - FLOAT_EPS], for the link
    and its derivatives.

    Notes
    -----
    CLogLog is untested.
    """
    def __call__(self, p):
        """
        C-Log-Log transform link function.

        Parameters
        ----------
        p : array
            Mean parameters

        Returns
        -------
        z : array
            log(-log(1 - p)) evaluated on the clipped `p`
        """
        clipped = self._clean(p)
        return np.log(-np.log(1 - clipped))

    def inverse(self, z):
        """
        Inverse of C-Log-Log transform link function.

        Parameters
        ----------
        z : array-like
            Value of the CLogLog link function

        Returns
        -------
        p : array
            Mean parameters, 1 - exp(-exp(z))
        """
        return 1 - np.exp(-np.exp(z))

    def deriv(self, p):
        """
        Derivative of C-Log-Log transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g'(p) : array
            1 / ((p - 1) * log(1 - p)) evaluated on the clipped `p`
        """
        clipped = self._clean(p)
        return 1. / ((clipped - 1) * (np.log(1 - clipped)))

    def deriv2(self, p):
        """
        Second derivative of the C-Log-Log link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g''(p) : array
            -(1 + 1 / log(1 - p)) / ((1 - p)^2 * log(1 - p)) evaluated on
            the clipped `p`
        """
        clipped = self._clean(p)
        log_q = np.log(1 - clipped)
        leading = -1 / ((1 - clipped)**2 * log_q)
        return leading * (1 + 1 / log_q)

    def inverse_deriv(self, z):
        """
        Derivative of the inverse of the C-Log-Log transform link function.

        Parameters
        ----------
        z : array-like
            Value of the CLogLog link function

        Returns
        -------
        g^(-1)'(z) : array
            exp(z - exp(z))
        """
        return np.exp(z - np.exp(z))
class NegativeBinomial(object):
    '''
    The negative binomial link function g(p) = log(p / (p + 1/alpha)).

    Parameters
    ----------
    alpha : float, optional
        Ancillary parameter of the Negative Binomial link function. It is
        assumed to be nonstochastic. The default value is 1. Permissible
        values are usually assumed to be in (.01, 2).
    '''

    def __init__(self, alpha=1.):
        self.alpha = alpha

    def _clean(self, x):
        '''Clip `x` to the range [FLOAT_EPS, inf).'''
        return np.clip(x, FLOAT_EPS, np.inf)

    def __call__(self, p):
        '''
        Negative Binomial transform link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        z : array
            log(p / (p + 1/alpha)) evaluated on the clipped `p`
        '''
        mean = self._clean(p)
        return np.log(mean/(mean + 1/self.alpha))

    def inverse(self, z):
        '''
        Inverse of the negative binomial transform.

        Parameters
        ----------
        z : array-like
            Value of the negative binomial link function

        Returns
        -------
        p : array
            Mean parameters, -1 / (alpha * (1 - exp(-z))), which equals
            exp(z) / (alpha * (1 - exp(z)))
        '''
        shrink = 1 - np.exp(-z)
        return -1/(self.alpha * shrink)

    def deriv(self, p):
        '''
        Derivative of the negative binomial transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g'(p) : array
            1 / (p + alpha * p^2); `p` is not clipped here
        '''
        spread = p + self.alpha * p**2
        return 1/spread

    def deriv2(self, p):
        '''
        Second derivative of the negative binomial link function.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        g''(p) : array
            -(1 + 2 * alpha * p) / (p + alpha * p^2)^2
        '''
        top = -(1 + 2 * self.alpha * p)
        bottom = (p + self.alpha * p**2)**2
        return top / bottom

    def inverse_deriv(self, z):
        '''
        Derivative of the inverse of the negative binomial transform.

        Parameters
        ----------
        z : array-like
            Usually the linear predictor for a GLM or GEE model

        Returns
        -------
        g^(-1)'(z) : array
            exp(z) / (alpha * (1 - exp(z))^2)
        '''
        grown = np.exp(z)
        return grown / (self.alpha * (1-grown)**2)
+""" + +__author__ = 'Taylor Oshan tayoshan@gmail.com' + +from pysal.contrib.glm.glm import GLM +from pysal.contrib.glm.family import Gaussian, Poisson, Binomial, QuasiPoisson +import numpy as np +import pysal +import unittest +import math + + +class TestGaussian(unittest.TestCase): + """ + Tests for Poisson GLM + """ + + def setUp(self): + db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + self.y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X.append(db.by_col("CRIME")) + self.X = np.array(X).T + + def testIWLS(self): + model = GLM(self.y, self.X, family=Gaussian()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertEqual(results.aic, 408.73548964604873) + self.assertEqual(results.bic, 10467.991340493107) + self.assertEqual(results.deviance, 10647.015074206196) + self.assertEqual(results.llf, -201.36774482302437) + self.assertEqual(results.null_deviance, 16367.794631703124) + self.assertEqual(results.scale, 231.45684943926514) + np.testing.assert_allclose(results.params, [ 46.42818268, 0.62898397, + -0.48488854]) + np.testing.assert_allclose(results.bse, [ 13.19175703, 0.53591045, + 0.18267291]) + np.testing.assert_allclose(results.cov_params(), + [[ 1.74022453e+02, -6.52060364e+00, -2.15109867e+00], + [ -6.52060364e+00, 2.87200008e-01, 6.80956787e-02], + [ -2.15109867e+00, 6.80956787e-02, 3.33693910e-02]]) + np.testing.assert_allclose(results.tvalues, [ 3.51948437, 1.17367365, + -2.65440864]) + np.testing.assert_allclose(results.pvalues, [ 0.00043239, 0.24052577, + 0.00794475], atol=1.0e-8) + np.testing.assert_allclose(results.conf_int(), + [[ 20.57281401, 72.28355135], + [ -0.42138121, 1.67934915], + [ -0.84292086, -0.12685622]]) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 7.51857004e-01, -2.81720055e-02, -9.29373521e-03], + [ -2.81720055e-02, 1.24083607e-03, 2.94204638e-04], + 
[ -9.29373521e-03, 2.94204638e-04, 1.44171110e-04]]) + np.testing.assert_allclose(results.mu, + [ 51.08752105, 50.66601521, 41.61367567, 33.53969014, + 28.90638232, 43.87074227, 51.64910882, 34.92671563, + 42.69267622, 38.49449134, 20.92815471, 25.25228436, + 29.78223486, 25.02403635, 29.07959539, 24.63352275, + 34.71372149, 33.40443052, 27.29864225, 65.86219802, + 33.69854751, 37.44976435, 50.01304928, 36.81219959, + 22.02674837, 31.64775955, 27.63563294, 23.7697291 , + 22.43119725, 21.76987089, 48.51169321, 49.05891819, + 32.31656426, 44.20550354, 35.49244888, 51.27811308, + 36.55047181, 27.37048914, 48.78812922, 57.31744163, + 51.22914162, 54.70515578, 37.06622277, 44.5075759 , + 41.24328983, 49.93821824, 44.85644299, 40.93838609, 47.32045464]) + self.assertEqual(results.pearson_chi2, 10647.015074206196) + np.testing.assert_allclose(results.resid_response, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_working, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 
0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_pearson, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_anscombe, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_deviance, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 
7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.null, + [ 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, 38.43622447]) + self.assertAlmostEqual(results.D2, .349514377851) + self.assertAlmostEqual(results.adj_D2, 0.32123239427957673) + +class TestPoisson(unittest.TestCase): + + def setUp(self): + db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + self.y = np.round(y).astype(int) + X = [] + X.append(db.by_col("INC")) + X.append(db.by_col("CRIME")) + self.X = np.array(X).T + + def testIWLS(self): + model = GLM(self.y, self.X, family=Poisson()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertAlmostEqual(results.aic, 500.85184179938756) + self.assertAlmostEqual(results.bic, 51.436404535087661) + self.assertAlmostEqual(results.deviance, 230.46013824817649) + 
self.assertAlmostEqual(results.llf, -247.42592089969378) + self.assertAlmostEqual(results.null_deviance, 376.97293610347361) + self.assertEqual(results.scale, 1.0) + np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491, + -0.01371397], atol=1.0e-8) + np.testing.assert_allclose(results.bse, [ 0.13049161, 0.00511599, + 0.00193769], atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.tvalues, [ 30.0524361 , 2.31331634, + -7.07748998]) + np.testing.assert_allclose(results.pvalues, [ 2.02901657e-198, + 2.07052532e-002, 1.46788805e-012]) + np.testing.assert_allclose(results.conf_int(), + [[ 3.66583199e+00, 4.17734972e+00], + [ 1.80774841e-03, 2.18620753e-02], + [ -1.75117666e-02, -9.91616901e-03]]) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.mu, + [ 51.26831574, 50.15022766, 40.06142973, 34.13799739, + 28.76119226, 42.6836241 , 55.64593703, 34.08277997, + 40.90389582, 37.19727958, 23.47459217, 26.12384057, + 29.78303507, 25.96888223, 29.14073823, 26.04369592, + 34.18996367, 32.28924005, 27.42284396, 72.69207879, + 33.05316347, 36.52276972, 49.2551479 , 35.33439632, + 24.07252457, 31.67153709, 27.81699478, 25.38021219, + 24.31759259, 23.13586161, 48.40724678, 48.57969818, + 31.92596006, 43.3679231 , 34.32925819, 51.78908089, + 34.49778584, 27.56236198, 48.34273194, 57.50829097, + 50.66038226, 54.68701352, 35.77103116, 43.21886784, + 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294]) + self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221) + np.testing.assert_allclose(results.resid_response, + [ 28.73168426, 
-5.15022766, -14.06142973, -1.13799739, + -5.76119226, -13.6836241 , 19.35406297, 2.91722003, + 12.09610418, 58.80272042, -3.47459217, -6.12384057, + 12.21696493, 17.03111777, -11.14073823, -7.04369592, + 7.81003633, 27.71075995, 3.57715604, 8.30792121, + -13.05316347, -6.52276972, -1.2551479 , 17.66560368, + -6.07252457, -11.67153709, 6.18300522, -2.38021219, + 7.68240741, -1.13586161, -16.40724678, -8.57969818, + -7.92596006, -15.3679231 , -7.32925819, -15.78908089, + 8.50221416, -4.56236198, -8.34273194, 4.49170903, + -8.66038226, -10.68701352, -9.77103116, -9.21886784, + -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294]) + np.testing.assert_allclose(results.resid_working, + [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192, + -165.69875817, -584.06666725, 1076.97496919, 99.42696848, + 494.77778514, 2187.30123163, -81.56463405, -159.97823479, + 363.858295 , 442.27909165, -324.64933645, -183.44387481, + 267.02485844, 894.75938 , 98.09579187, 603.9200634 , + -431.44834594, -238.2296165 , -61.82249568, 624.20344168, + -146.18099686, -369.65551968, 171.99262399, -60.41029031, + 186.81765356, -26.27913713, -794.22964417, -416.79914795, + -253.04388425, -666.47490701, -251.6079969 , -817.70198717, + 293.30756327, -125.74947222, -403.31045369, 258.31051005, + -438.73827602, -584.440853 , -349.51985996, -398.42903071, + -483.96599444, 1300.32189904, -48.89309853, -535.19735391, + -476.27334527]) + np.testing.assert_allclose(results.resid_pearson, + [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881, + -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836, + -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 , + -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591, + -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025, + -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708, + -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503, + -2.19400568, 1.44755952, -0.8690235 , 
-1.19989348, 0.59230634, + -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306, + 3.67934693, -0.17259375, -2.09312684, -1.51230062]) + np.testing.assert_allclose(results.resid_anscombe, + [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751, + -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452, + -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611, + -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923, + -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591, + -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278, + -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484, + -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202, + -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267, + 3.41729922, -0.17335867, -2.22921828, -1.57470549]) + np.testing.assert_allclose(results.resid_deviance, + [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765, + -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525, + -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376, + -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662, + -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865, + -2.22612429, 1.13247453, -0.48015254, 1.48508549, -0.23812 , + -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892, + -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655, + -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949, + 3.41437156, -0.1733581 , -2.22765605, -1.57426046]) + np.testing.assert_allclose(results.null, + [ 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 
38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143]) + self.assertAlmostEqual(results.D2, .388656011675) + self.assertAlmostEqual(results.adj_D2, 0.36207583826952761)#.375648692774) + + def testQuasi(self): + model = GLM(self.y, self.X, family=QuasiPoisson()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertTrue(math.isnan(results.aic)) + self.assertAlmostEqual(results.bic, 51.436404535087661) + self.assertAlmostEqual(results.deviance, 230.46013824817649) + self.assertTrue(math.isnan(results.llf)) + self.assertAlmostEqual(results.null_deviance, 376.97293610347361) + self.assertAlmostEqual(results.scale, 5.7526658548022223) + np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491, + -0.01371397], atol=1.0e-8) + np.testing.assert_allclose(results.bse, [ 0.31298042, 0.01227057, + 0.00464749], atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 9.79567451e-02, -3.55876238e-03, -1.27356524e-03], + [ -3.55876238e-03, 1.50566777e-04, 3.89741067e-05], + [ -1.27356524e-03, 3.89741067e-05, 2.15991606e-05]]) + np.testing.assert_allclose(results.tvalues, [ 12.52982796, 0.96449604, + -2.95083339]) + np.testing.assert_allclose(results.pvalues, [ 5.12737770e-36, + 3.34797291e-01, 3.16917819e-03]) + np.testing.assert_allclose(results.conf_int(), + [[ 3.3081605 , 4.53502121], + [-0.01221495, 0.03588478], + [-0.02282288, -0.00460506]], atol=1.0e-8) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.mu, + [ 51.26831574, 50.15022766, 40.06142973, 34.13799739, + 
28.76119226, 42.6836241 , 55.64593703, 34.08277997, + 40.90389582, 37.19727958, 23.47459217, 26.12384057, + 29.78303507, 25.96888223, 29.14073823, 26.04369592, + 34.18996367, 32.28924005, 27.42284396, 72.69207879, + 33.05316347, 36.52276972, 49.2551479 , 35.33439632, + 24.07252457, 31.67153709, 27.81699478, 25.38021219, + 24.31759259, 23.13586161, 48.40724678, 48.57969818, + 31.92596006, 43.3679231 , 34.32925819, 51.78908089, + 34.49778584, 27.56236198, 48.34273194, 57.50829097, + 50.66038226, 54.68701352, 35.77103116, 43.21886784, + 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294]) + self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221) + np.testing.assert_allclose(results.resid_response, + [ 28.73168426, -5.15022766, -14.06142973, -1.13799739, + -5.76119226, -13.6836241 , 19.35406297, 2.91722003, + 12.09610418, 58.80272042, -3.47459217, -6.12384057, + 12.21696493, 17.03111777, -11.14073823, -7.04369592, + 7.81003633, 27.71075995, 3.57715604, 8.30792121, + -13.05316347, -6.52276972, -1.2551479 , 17.66560368, + -6.07252457, -11.67153709, 6.18300522, -2.38021219, + 7.68240741, -1.13586161, -16.40724678, -8.57969818, + -7.92596006, -15.3679231 , -7.32925819, -15.78908089, + 8.50221416, -4.56236198, -8.34273194, 4.49170903, + -8.66038226, -10.68701352, -9.77103116, -9.21886784, + -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294]) + np.testing.assert_allclose(results.resid_working, + [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192, + -165.69875817, -584.06666725, 1076.97496919, 99.42696848, + 494.77778514, 2187.30123163, -81.56463405, -159.97823479, + 363.858295 , 442.27909165, -324.64933645, -183.44387481, + 267.02485844, 894.75938 , 98.09579187, 603.9200634 , + -431.44834594, -238.2296165 , -61.82249568, 624.20344168, + -146.18099686, -369.65551968, 171.99262399, -60.41029031, + 186.81765356, -26.27913713, -794.22964417, -416.79914795, + -253.04388425, -666.47490701, -251.6079969 , -817.70198717, + 
293.30756327, -125.74947222, -403.31045369, 258.31051005, + -438.73827602, -584.440853 , -349.51985996, -398.42903071, + -483.96599444, 1300.32189904, -48.89309853, -535.19735391, + -476.27334527]) + np.testing.assert_allclose(results.resid_pearson, + [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881, + -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836, + -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 , + -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591, + -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025, + -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708, + -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503, + -2.19400568, 1.44755952, -0.8690235 , -1.19989348, 0.59230634, + -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306, + 3.67934693, -0.17259375, -2.09312684, -1.51230062]) + np.testing.assert_allclose(results.resid_anscombe, + [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751, + -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452, + -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611, + -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923, + -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591, + -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278, + -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484, + -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202, + -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267, + 3.41729922, -0.17335867, -2.22921828, -1.57470549]) + np.testing.assert_allclose(results.resid_deviance, + [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765, + -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525, + -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376, + -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662, + -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865, + -2.22612429, 1.13247453, 
-0.48015254, 1.48508549, -0.23812 , + -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892, + -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655, + -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949, + 3.41437156, -0.1733581 , -2.22765605, -1.57426046]) + np.testing.assert_allclose(results.null, + [ 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143]) + self.assertAlmostEqual(results.D2, .388656011675) + self.assertAlmostEqual(results.adj_D2, 0.36207583826952761) + +class TestBinomial(unittest.TestCase): + + def setUp(self): + #London house price data + #y: 'BATH2' + y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.y = y.reshape((316,1)) + #X: 'FLOORSZ' + X = np.array([ 77, 75, 64, 95, 107, 100, 81, 151, 98, 260, 171, 161, 91, + 80, 50, 85, 52, 69, 60, 84, 155, 97, 69, 126, 90, 43, + 51, 41, 140, 80, 52, 86, 66, 60, 40, 155, 138, 97, 115, + 148, 206, 60, 53, 96, 88, 160, 31, 43, 154, 60, 131, 60, + 46, 61, 125, 150, 76, 92, 96, 100, 105, 72, 48, 41, 72, + 65, 60, 65, 98, 33, 144, 111, 91, 108, 38, 48, 95, 63, + 98, 129, 108, 51, 131, 66, 48, 127, 76, 68, 52, 64, 57, + 121, 67, 76, 112, 96, 90, 53, 93, 64, 97, 58, 44, 157, + 53, 70, 71, 167, 47, 70, 96, 77, 75, 71, 67, 47, 71, + 90, 69, 64, 65, 95, 60, 60, 65, 54, 121, 105, 50, 85, + 69, 69, 62, 65, 93, 93, 70, 62, 155, 68, 117, 80, 80, + 75, 98, 114, 86, 70, 50, 51, 163, 124, 59, 95, 51, 63, + 85, 53, 46, 102, 114, 83, 47, 40, 63, 123, 100, 63, 110, + 79, 98, 99, 120, 52, 48, 37, 81, 30, 88, 50, 35, 116, + 67, 45, 80, 86, 109, 59, 75, 60, 71, 141, 121, 50, 168, + 90, 51, 133, 75, 133, 127, 37, 68, 105, 61, 123, 151, 110, + 77, 220, 94, 77, 70, 100, 98, 126, 55, 105, 60, 176, 104, + 68, 62, 70, 48, 102, 80, 97, 66, 80, 102, 160, 55, 60, + 71, 125, 85, 85, 190, 137, 48, 41, 42, 51, 57, 60, 114, + 88, 84, 108, 66, 85, 42, 98, 90, 127, 100, 55, 76, 82, + 63, 80, 71, 76, 121, 109, 92, 160, 109, 185, 100, 90, 90, + 86, 88, 95, 116, 135, 61, 74, 60, 235, 76, 66, 100, 49, + 50, 37, 100, 88, 90, 52, 95, 81, 79, 96, 75, 91, 86, + 83, 180, 108, 80, 96, 49, 117, 117, 86, 46, 66, 95, 57, + 120, 137, 68, 240]) + self.X = X.reshape((316,1)) + + def testIWLS(self): + model = GLM(self.y, self.X, family=Binomial()) + results = model.fit() + self.assertEqual(results.n, 316) + self.assertEqual(results.df_model, 1) + self.assertEqual(results.df_resid, 314) + self.assertEqual(results.aic, 155.19347530342466) + self.assertEqual(results.bic, 
-1656.1095797628657) + self.assertEqual(results.deviance, 151.19347530342466) + self.assertEqual(results.llf, -75.596737651712331) + self.assertEqual(results.null_deviance, 189.16038985881212) + self.assertEqual(results.scale, 1.0) + np.testing.assert_allclose(results.params, [-5.33638276, 0.0287754 ]) + np.testing.assert_allclose(results.bse, [ 0.64499904, 0.00518312], + atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 4.16023762e-01, -3.14338457e-03], + [ -3.14338457e-03, 2.68646833e-05]]) + np.testing.assert_allclose(results.tvalues, [-8.27347396, 5.55175826]) + np.testing.assert_allclose(results.pvalues, [ 1.30111233e-16, + 2.82810512e-08]) + np.testing.assert_allclose(results.conf_int(), + [[-6.60055765, -4.07220787], + [ 0.01861668, 0.03893412]], atol=1.0e-8) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 4.16023762e-01, -3.14338457e-03], + [ -3.14338457e-03, 2.68646833e-05]]) + np.testing.assert_allclose(results.mu, + [ 0.04226237, 0.03999333, 0.02946178, 0.0689636 , 0.09471181, + 0.07879431, 0.04717464, 0.27065598, 0.07471691, 0.89522144, + 0.39752487, 0.33102718, 0.06192993, 0.04589793, 0.01988679, + 0.0526265 , 0.02104007, 0.03386636, 0.02634295, 0.05121018, + 0.29396682, 0.07275173, 0.03386636, 0.15307528, 0.06027915, + 0.01631789, 0.02045547, 0.01541937, 0.2128508 , 0.04589793, + 0.02104007, 0.05407977, 0.0311527 , 0.02634295, 0.01498855, + 0.29396682, 0.20336776, 0.07275173, 0.11637537, 0.25395607, + 0.64367488, 0.02634295, 0.02164101, 0.07083428, 0.05710047, + 0.32468619, 0.01160845, 0.01631789, 0.28803008, 0.02634295, + 0.17267234, 0.02634295, 0.01776301, 0.02709115, 0.14938186, + 0.26501331, 0.04111287, 0.06362285, 0.07083428, 0.07879431, + 0.08989109, 0.03680743, 0.0187955 , 0.01541937, 0.03680743, + 0.03029581, 0.02634295, 0.03029581, 0.07471691, 0.01228768, + 0.23277197, 0.10505173, 0.06192993, 0.09720799, 0.01416217, + 0.0187955 , 0.0689636 , 0.02865003, 0.07471691, 0.16460503, + 0.09720799, 0.02045547, 
0.17267234, 0.0311527 , 0.0187955 , + 0.15684317, 0.04111287, 0.03293737, 0.02104007, 0.02946178, + 0.02421701, 0.1353385 , 0.03203302, 0.04111287, 0.10778798, + 0.07083428, 0.06027915, 0.02164101, 0.06535882, 0.02946178, + 0.07275173, 0.02490638, 0.01678627, 0.30605146, 0.02164101, + 0.03482061, 0.03580075, 0.37030921, 0.0182721 , 0.03482061, + 0.07083428, 0.04226237, 0.03999333, 0.03580075, 0.03203302, + 0.0182721 , 0.03580075, 0.06027915, 0.03386636, 0.02946178, + 0.03029581, 0.0689636 , 0.02634295, 0.02634295, 0.03029581, + 0.02225873, 0.1353385 , 0.08989109, 0.01988679, 0.0526265 , + 0.03386636, 0.03386636, 0.02786 , 0.03029581, 0.06535882, + 0.06535882, 0.03482061, 0.02786 , 0.29396682, 0.03293737, + 0.12242534, 0.04589793, 0.04589793, 0.03999333, 0.07471691, + 0.11344884, 0.05407977, 0.03482061, 0.01988679, 0.02045547, + 0.34389327, 0.14576223, 0.02561486, 0.0689636 , 0.02045547, + 0.02865003, 0.0526265 , 0.02164101, 0.01776301, 0.08307425, + 0.11344884, 0.04982997, 0.0182721 , 0.01498855, 0.02865003, + 0.14221564, 0.07879431, 0.02865003, 0.10237696, 0.04465416, + 0.07471691, 0.07673078, 0.13200634, 0.02104007, 0.0187955 , + 0.01376599, 0.04717464, 0.01128289, 0.05710047, 0.01988679, + 0.01300612, 0.11936722, 0.03203302, 0.01726786, 0.04589793, + 0.05407977, 0.09976271, 0.02561486, 0.03999333, 0.02634295, + 0.03580075, 0.21771181, 0.1353385 , 0.01988679, 0.37704374, + 0.06027915, 0.02045547, 0.18104935, 0.03999333, 0.18104935, + 0.15684317, 0.01376599, 0.03293737, 0.08989109, 0.02709115, + 0.14221564, 0.27065598, 0.10237696, 0.04226237, 0.72991785, + 0.06713876, 0.04226237, 0.03482061, 0.07879431, 0.07471691, + 0.15307528, 0.02289366, 0.08989109, 0.02634295, 0.43243779, + 0.08756457, 0.03293737, 0.02786 , 0.03482061, 0.0187955 , + 0.08307425, 0.04589793, 0.07275173, 0.0311527 , 0.04589793, + 0.08307425, 0.32468619, 0.02289366, 0.02634295, 0.03580075, + 0.14938186, 0.0526265 , 0.0526265 , 0.53268924, 0.19874565, + 0.0187955 , 0.01541937, 0.01586237, 
0.02045547, 0.02421701, + 0.02634295, 0.11344884, 0.05710047, 0.05121018, 0.09720799, + 0.0311527 , 0.0526265 , 0.01586237, 0.07471691, 0.06027915, + 0.15684317, 0.07879431, 0.02289366, 0.04111287, 0.04848506, + 0.02865003, 0.04589793, 0.03580075, 0.04111287, 0.1353385 , + 0.09976271, 0.06362285, 0.32468619, 0.09976271, 0.49676673, + 0.07879431, 0.06027915, 0.06027915, 0.05407977, 0.05710047, + 0.0689636 , 0.11936722, 0.18973955, 0.02709115, 0.03890304, + 0.02634295, 0.80625182, 0.04111287, 0.0311527 , 0.07879431, + 0.0193336 , 0.01988679, 0.01376599, 0.07879431, 0.05710047, + 0.06027915, 0.02104007, 0.0689636 , 0.04717464, 0.04465416, + 0.07083428, 0.03999333, 0.06192993, 0.05407977, 0.04982997, + 0.46087756, 0.09720799, 0.04589793, 0.07083428, 0.0193336 , + 0.12242534, 0.12242534, 0.05407977, 0.01776301, 0.0311527 , + 0.0689636 , 0.02421701, 0.13200634, 0.19874565, 0.03293737, + 0.82774282], atol=1.0e-8) + self.assertAlmostEqual(results.pearson_chi2, 271.21110541713801) + np.testing.assert_allclose(results.resid_response, + [-0.04226237, -0.03999333, -0.02946178, -0.0689636 , -0.09471181, + -0.07879431, -0.04717464, -0.27065598, -0.07471691, 0.10477856, + -0.39752487, 0.66897282, -0.06192993, -0.04589793, -0.01988679, + -0.0526265 , -0.02104007, -0.03386636, -0.02634295, -0.05121018, + -0.29396682, 0.92724827, -0.03386636, -0.15307528, -0.06027915, + -0.01631789, -0.02045547, -0.01541937, -0.2128508 , -0.04589793, + -0.02104007, -0.05407977, -0.0311527 , -0.02634295, -0.01498855, + -0.29396682, 0.79663224, -0.07275173, -0.11637537, 0.74604393, + -0.64367488, -0.02634295, -0.02164101, -0.07083428, -0.05710047, + -0.32468619, -0.01160845, -0.01631789, -0.28803008, -0.02634295, + -0.17267234, -0.02634295, -0.01776301, -0.02709115, 0.85061814, + 0.73498669, -0.04111287, -0.06362285, -0.07083428, -0.07879431, + 0.91010891, -0.03680743, -0.0187955 , -0.01541937, -0.03680743, + -0.03029581, -0.02634295, -0.03029581, -0.07471691, -0.01228768, + 0.76722803, -0.10505173, 
-0.06192993, -0.09720799, -0.01416217, + -0.0187955 , -0.0689636 , -0.02865003, -0.07471691, -0.16460503, + -0.09720799, -0.02045547, 0.82732766, -0.0311527 , -0.0187955 , + -0.15684317, -0.04111287, -0.03293737, -0.02104007, -0.02946178, + -0.02421701, -0.1353385 , -0.03203302, -0.04111287, -0.10778798, + -0.07083428, -0.06027915, -0.02164101, -0.06535882, -0.02946178, + -0.07275173, -0.02490638, -0.01678627, -0.30605146, -0.02164101, + -0.03482061, -0.03580075, 0.62969079, -0.0182721 , -0.03482061, + -0.07083428, -0.04226237, -0.03999333, -0.03580075, -0.03203302, + -0.0182721 , -0.03580075, -0.06027915, -0.03386636, -0.02946178, + -0.03029581, -0.0689636 , -0.02634295, -0.02634295, -0.03029581, + -0.02225873, -0.1353385 , -0.08989109, -0.01988679, -0.0526265 , + -0.03386636, -0.03386636, -0.02786 , -0.03029581, -0.06535882, + -0.06535882, -0.03482061, -0.02786 , -0.29396682, -0.03293737, + -0.12242534, -0.04589793, -0.04589793, -0.03999333, -0.07471691, + -0.11344884, -0.05407977, -0.03482061, -0.01988679, -0.02045547, + 0.65610673, 0.85423777, -0.02561486, -0.0689636 , -0.02045547, + -0.02865003, -0.0526265 , -0.02164101, -0.01776301, -0.08307425, + -0.11344884, -0.04982997, -0.0182721 , -0.01498855, -0.02865003, + -0.14221564, -0.07879431, -0.02865003, -0.10237696, -0.04465416, + -0.07471691, -0.07673078, -0.13200634, -0.02104007, -0.0187955 , + -0.01376599, -0.04717464, -0.01128289, 0.94289953, -0.01988679, + -0.01300612, -0.11936722, -0.03203302, -0.01726786, -0.04589793, + -0.05407977, -0.09976271, -0.02561486, -0.03999333, -0.02634295, + -0.03580075, -0.21771181, 0.8646615 , -0.01988679, 0.62295626, + -0.06027915, -0.02045547, -0.18104935, 0.96000667, -0.18104935, + -0.15684317, -0.01376599, -0.03293737, -0.08989109, -0.02709115, + -0.14221564, 0.72934402, -0.10237696, -0.04226237, -0.72991785, + -0.06713876, -0.04226237, -0.03482061, -0.07879431, -0.07471691, + -0.15307528, 0.97710634, 0.91010891, -0.02634295, -0.43243779, + -0.08756457, -0.03293737, 
-0.02786 , -0.03482061, -0.0187955 , + 0.91692575, -0.04589793, -0.07275173, -0.0311527 , -0.04589793, + -0.08307425, 0.67531381, -0.02289366, -0.02634295, -0.03580075, + -0.14938186, -0.0526265 , -0.0526265 , 0.46731076, -0.19874565, + -0.0187955 , -0.01541937, -0.01586237, -0.02045547, -0.02421701, + -0.02634295, -0.11344884, -0.05710047, -0.05121018, -0.09720799, + 0.9688473 , -0.0526265 , -0.01586237, -0.07471691, -0.06027915, + -0.15684317, -0.07879431, -0.02289366, -0.04111287, -0.04848506, + -0.02865003, -0.04589793, -0.03580075, -0.04111287, -0.1353385 , + -0.09976271, -0.06362285, 0.67531381, -0.09976271, -0.49676673, + -0.07879431, -0.06027915, -0.06027915, -0.05407977, -0.05710047, + -0.0689636 , -0.11936722, -0.18973955, -0.02709115, -0.03890304, + -0.02634295, 0.19374818, -0.04111287, -0.0311527 , -0.07879431, + -0.0193336 , -0.01988679, -0.01376599, -0.07879431, 0.94289953, + -0.06027915, -0.02104007, -0.0689636 , -0.04717464, -0.04465416, + 0.92916572, -0.03999333, -0.06192993, -0.05407977, -0.04982997, + -0.46087756, -0.09720799, -0.04589793, -0.07083428, -0.0193336 , + -0.12242534, -0.12242534, -0.05407977, -0.01776301, -0.0311527 , + -0.0689636 , -0.02421701, -0.13200634, -0.19874565, -0.03293737, + -0.82774282], atol=1.0e-8) + np.testing.assert_allclose(results.resid_working, + [ -1.71062283e-03, -1.53549840e-03, -8.42423701e-04, + -4.42798906e-03, -8.12073047e-03, -5.71934606e-03, + -2.12046213e-03, -5.34278480e-02, -5.16550074e-03, + 9.82823035e-03, -9.52067472e-02, 1.48142818e-01, + -3.59779501e-03, -2.00993083e-03, -3.87619325e-04, + -2.62379729e-03, -4.33370579e-04, -1.10808799e-03, + -6.75670103e-04, -2.48818484e-03, -6.10129090e-02, + 6.25511612e-02, -1.10808799e-03, -1.98451739e-02, + -3.41454749e-03, -2.61928659e-04, -4.09867263e-04, + -2.34090923e-04, -3.56621577e-02, -2.00993083e-03, + -4.33370579e-04, -2.76645832e-03, -9.40257152e-04, + -6.75670103e-04, -2.21289369e-04, -6.10129090e-02, + 1.29061842e-01, -4.90775251e-03, 
-1.19671283e-02, + 1.41347263e-01, -1.47631680e-01, -6.75670103e-04, + -4.58198217e-04, -4.66208406e-03, -3.07429001e-03, + -7.11923401e-02, -1.33191898e-04, -2.61928659e-04, + -5.90659690e-02, -6.75670103e-04, -2.46673839e-02, + -6.75670103e-04, -3.09919962e-04, -7.14047519e-04, + 1.08085429e-01, 1.43161630e-01, -1.62077632e-03, + -3.79032977e-03, -4.66208406e-03, -5.71934606e-03, + 7.44566288e-02, -1.30492035e-03, -3.46630910e-04, + -2.34090923e-04, -1.30492035e-03, -8.90029618e-04, + -6.75670103e-04, -8.90029618e-04, -5.16550074e-03, + -1.49131762e-04, 1.37018624e-01, -9.87652847e-03, + -3.59779501e-03, -8.53083698e-03, -1.97726627e-04, + -3.46630910e-04, -4.42798906e-03, -7.97307494e-04, + -5.16550074e-03, -2.26348718e-02, -8.53083698e-03, + -4.09867263e-04, 1.18189219e-01, -9.40257152e-04, + -3.46630910e-04, -2.07414715e-02, -1.62077632e-03, + -1.04913757e-03, -4.33370579e-04, -8.42423701e-04, + -5.72261321e-04, -1.58375811e-02, -9.93244730e-04, + -1.62077632e-03, -1.03659408e-02, -4.66208406e-03, + -3.41454749e-03, -4.58198217e-04, -3.99257703e-03, + -8.42423701e-04, -4.90775251e-03, -6.04877746e-04, + -2.77048947e-04, -6.50004229e-02, -4.58198217e-04, + -1.17025566e-03, -1.23580799e-03, 1.46831486e-01, + -3.27769165e-04, -1.17025566e-03, -4.66208406e-03, + -1.71062283e-03, -1.53549840e-03, -1.23580799e-03, + -9.93244730e-04, -3.27769165e-04, -1.23580799e-03, + -3.41454749e-03, -1.10808799e-03, -8.42423701e-04, + -8.90029618e-04, -4.42798906e-03, -6.75670103e-04, + -6.75670103e-04, -8.90029618e-04, -4.84422741e-04, + -1.58375811e-02, -7.35405096e-03, -3.87619325e-04, + -2.62379729e-03, -1.10808799e-03, -1.10808799e-03, + -7.54555329e-04, -8.90029618e-04, -3.99257703e-03, + -3.99257703e-03, -1.17025566e-03, -7.54555329e-04, + -6.10129090e-02, -1.04913757e-03, -1.31530576e-02, + -2.00993083e-03, -2.00993083e-03, -1.53549840e-03, + -5.16550074e-03, -1.14104800e-02, -2.76645832e-03, + -1.17025566e-03, -3.87619325e-04, -4.09867263e-04, + 1.48037813e-01, 
1.06365931e-01, -6.39314594e-04, + -4.42798906e-03, -4.09867263e-04, -7.97307494e-04, + -2.62379729e-03, -4.58198217e-04, -3.09919962e-04, + -6.32800839e-03, -1.14104800e-02, -2.35929680e-03, + -3.27769165e-04, -2.21289369e-04, -7.97307494e-04, + -1.73489362e-02, -5.71934606e-03, -7.97307494e-04, + -9.40802551e-03, -1.90495384e-03, -5.16550074e-03, + -5.43585191e-03, -1.51253748e-02, -4.33370579e-04, + -3.46630910e-04, -1.86893696e-04, -2.12046213e-03, + -1.25867293e-04, 5.07657192e-02, -3.87619325e-04, + -1.66959104e-04, -1.25477263e-02, -9.93244730e-04, + -2.93030065e-04, -2.00993083e-03, -2.76645832e-03, + -8.95970087e-03, -6.39314594e-04, -1.53549840e-03, + -6.75670103e-04, -1.23580799e-03, -3.70792339e-02, + 1.01184411e-01, -3.87619325e-04, 1.46321062e-01, + -3.41454749e-03, -4.09867263e-04, -2.68442736e-02, + 3.68583645e-02, -2.68442736e-02, -2.07414715e-02, + -1.86893696e-04, -1.04913757e-03, -7.35405096e-03, + -7.14047519e-04, -1.73489362e-02, 1.43973473e-01, + -9.40802551e-03, -1.71062283e-03, -1.43894386e-01, + -4.20497779e-03, -1.71062283e-03, -1.17025566e-03, + -5.71934606e-03, -5.16550074e-03, -1.98451739e-02, + 2.18574168e-02, 7.44566288e-02, -6.75670103e-04, + -1.06135519e-01, -6.99614755e-03, -1.04913757e-03, + -7.54555329e-04, -1.17025566e-03, -3.46630910e-04, + 6.98449121e-02, -2.00993083e-03, -4.90775251e-03, + -9.40257152e-04, -2.00993083e-03, -6.32800839e-03, + 1.48072729e-01, -5.12120512e-04, -6.75670103e-04, + -1.23580799e-03, -1.89814939e-02, -2.62379729e-03, + -2.62379729e-03, 1.16328328e-01, -3.16494123e-02, + -3.46630910e-04, -2.34090923e-04, -2.47623705e-04, + -4.09867263e-04, -5.72261321e-04, -6.75670103e-04, + -1.14104800e-02, -3.07429001e-03, -2.48818484e-03, + -8.53083698e-03, 2.92419496e-02, -2.62379729e-03, + -2.47623705e-04, -5.16550074e-03, -3.41454749e-03, + -2.07414715e-02, -5.71934606e-03, -5.12120512e-04, + -1.62077632e-03, -2.23682205e-03, -7.97307494e-04, + -2.00993083e-03, -1.23580799e-03, -1.62077632e-03, + 
-1.58375811e-02, -8.95970087e-03, -3.79032977e-03, + 1.48072729e-01, -8.95970087e-03, -1.24186489e-01, + -5.71934606e-03, -3.41454749e-03, -3.41454749e-03, + -2.76645832e-03, -3.07429001e-03, -4.42798906e-03, + -1.25477263e-02, -2.91702648e-02, -7.14047519e-04, + -1.45456868e-03, -6.75670103e-04, 3.02653681e-02, + -1.62077632e-03, -9.40257152e-04, -5.71934606e-03, + -3.66561274e-04, -3.87619325e-04, -1.86893696e-04, + -5.71934606e-03, 5.07657192e-02, -3.41454749e-03, + -4.33370579e-04, -4.42798906e-03, -2.12046213e-03, + -1.90495384e-03, 6.11546973e-02, -1.53549840e-03, + -3.59779501e-03, -2.76645832e-03, -2.35929680e-03, + -1.14513988e-01, -8.53083698e-03, -2.00993083e-03, + -4.66208406e-03, -3.66561274e-04, -1.31530576e-02, + -1.31530576e-02, -2.76645832e-03, -3.09919962e-04, + -9.40257152e-04, -4.42798906e-03, -5.72261321e-04, + -1.51253748e-02, -3.16494123e-02, -1.04913757e-03, + -1.18023417e-01]) + np.testing.assert_allclose(results.resid_pearson, + [-0.21006498, -0.20410641, -0.17423009, -0.27216147, -0.3234511 , + -0.29246179, -0.22250903, -0.60917574, -0.28416602, 0.3421141 , + -0.81229277, 1.42158361, -0.25694055, -0.21933056, -0.142444 , + -0.23569027, -0.14660243, -0.18722578, -0.16448609, -0.2323235 , + -0.64526275, 3.57006696, -0.18722578, -0.42513819, -0.25327023, + -0.12879668, -0.14450826, -0.12514332, -0.5200069 , -0.21933056, + -0.14660243, -0.23910582, -0.17931646, -0.16448609, -0.12335569, + -0.64526275, 1.97919183, -0.28010679, -0.36290807, 1.71396874, + -1.3440334 , -0.16448609, -0.14872695, -0.27610555, -0.24608613, + -0.69339243, -0.1083734 , -0.12879668, -0.63604537, -0.16448609, + -0.45684893, -0.16448609, -0.13447767, -0.16686977, 2.3862634 , + 1.66535145, -0.20706426, -0.26066405, -0.27610555, -0.29246179, + 3.18191348, -0.19548397, -0.13840353, -0.12514332, -0.19548397, + -0.17675498, -0.16448609, -0.17675498, -0.28416602, -0.11153719, + 1.81550268, -0.34261205, -0.25694055, -0.32813846, -0.11985666, + -0.13840353, -0.27216147, 
-0.17174127, -0.28416602, -0.44389026, + -0.32813846, -0.14450826, 2.18890738, -0.17931646, -0.13840353, + -0.43129917, -0.20706426, -0.18455132, -0.14660243, -0.17423009, + -0.1575374 , -0.39562855, -0.18191506, -0.20706426, -0.34757708, + -0.27610555, -0.25327023, -0.14872695, -0.26444152, -0.17423009, + -0.28010679, -0.15982038, -0.13066317, -0.66410018, -0.14872695, + -0.189939 , -0.19269154, 1.30401147, -0.13642648, -0.189939 , + -0.27610555, -0.21006498, -0.20410641, -0.19269154, -0.18191506, + -0.13642648, -0.19269154, -0.25327023, -0.18722578, -0.17423009, + -0.17675498, -0.27216147, -0.16448609, -0.16448609, -0.17675498, + -0.15088226, -0.39562855, -0.3142763 , -0.142444 , -0.23569027, + -0.18722578, -0.18722578, -0.169288 , -0.17675498, -0.26444152, + -0.26444152, -0.189939 , -0.169288 , -0.64526275, -0.18455132, + -0.3735026 , -0.21933056, -0.21933056, -0.20410641, -0.28416602, + -0.35772404, -0.23910582, -0.189939 , -0.142444 , -0.14450826, + 1.38125991, 2.42084442, -0.16213645, -0.27216147, -0.14450826, + -0.17174127, -0.23569027, -0.14872695, -0.13447767, -0.30099975, + -0.35772404, -0.22900483, -0.13642648, -0.12335569, -0.17174127, + -0.4071783 , -0.29246179, -0.17174127, -0.33771794, -0.21619749, + -0.28416602, -0.28828407, -0.38997712, -0.14660243, -0.13840353, + -0.11814455, -0.22250903, -0.10682532, 4.06361781, -0.142444 , + -0.11479334, -0.36816723, -0.18191506, -0.1325567 , -0.21933056, + -0.23910582, -0.33289374, -0.16213645, -0.20410641, -0.16448609, + -0.19269154, -0.52754269, 2.52762346, -0.142444 , 1.28538406, + -0.25327023, -0.14450826, -0.47018591, 4.89940505, -0.47018591, + -0.43129917, -0.11814455, -0.18455132, -0.3142763 , -0.16686977, + -0.4071783 , 1.64156241, -0.33771794, -0.21006498, -1.6439517 , + -0.26827373, -0.21006498, -0.189939 , -0.29246179, -0.28416602, + -0.42513819, 6.53301013, 3.18191348, -0.16448609, -0.87288109, + -0.30978696, -0.18455132, -0.169288 , -0.189939 , -0.13840353, + 3.32226189, -0.21933056, -0.28010679, 
-0.17931646, -0.21933056, + -0.30099975, 1.44218477, -0.1530688 , -0.16448609, -0.19269154, + -0.41906522, -0.23569027, -0.23569027, 0.93662539, -0.4980393 , + -0.13840353, -0.12514332, -0.12695686, -0.14450826, -0.1575374 , + -0.16448609, -0.35772404, -0.24608613, -0.2323235 , -0.32813846, + 5.57673284, -0.23569027, -0.12695686, -0.28416602, -0.25327023, + -0.43129917, -0.29246179, -0.1530688 , -0.20706426, -0.22573357, + -0.17174127, -0.21933056, -0.19269154, -0.20706426, -0.39562855, + -0.33289374, -0.26066405, 1.44218477, -0.33289374, -0.99355423, + -0.29246179, -0.25327023, -0.25327023, -0.23910582, -0.24608613, + -0.27216147, -0.36816723, -0.48391225, -0.16686977, -0.20119082, + -0.16448609, 0.49021146, -0.20706426, -0.17931646, -0.29246179, + -0.14040923, -0.142444 , -0.11814455, -0.29246179, 4.06361781, + -0.25327023, -0.14660243, -0.27216147, -0.22250903, -0.21619749, + 3.6218033 , -0.20410641, -0.25694055, -0.23910582, -0.22900483, + -0.92458976, -0.32813846, -0.21933056, -0.27610555, -0.14040923, + -0.3735026 , -0.3735026 , -0.23910582, -0.13447767, -0.17931646, + -0.27216147, -0.1575374 , -0.38997712, -0.4980393 , -0.18455132, + -2.19209332]) + np.testing.assert_allclose(results.resid_anscombe, + [-0.31237627, -0.3036605 , -0.25978208, -0.40240831, -0.47552289, + -0.43149255, -0.33053793, -0.85617194, -0.41962951, 0.50181328, + -1.0954382 , 1.66940149, -0.38048321, -0.3259044 , -0.21280762, + -0.34971301, -0.21896842, -0.27890356, -0.2454118 , -0.34482158, + -0.90063409, 2.80452413, -0.27890356, -0.61652596, -0.37518169, + -0.19255932, -0.2158664 , -0.18713159, -0.74270558, -0.3259044 , + -0.21896842, -0.35467084, -0.2672722 , -0.2454118 , -0.18447466, + -0.90063409, 2.05763941, -0.41381347, -0.53089521, 1.88552083, + -1.60654218, -0.2454118 , -0.22211425, -0.40807333, -0.3647888 , + -0.95861559, -0.16218047, -0.19255932, -0.88935802, -0.2454118 , + -0.65930821, -0.2454118 , -0.20099345, -0.24892975, 2.28774016, + 1.85167195, -0.30798858, -0.38585584, 
-0.40807333, -0.43149255, + 2.65398426, -0.2910267 , -0.20681747, -0.18713159, -0.2910267 , + -0.26350118, -0.2454118 , -0.26350118, -0.41962951, -0.16689207, + 1.95381191, -0.50251231, -0.38048321, -0.48214234, -0.17927213, + -0.20681747, -0.40240831, -0.25611424, -0.41962951, -0.64189694, + -0.48214234, -0.2158664 , 2.18071204, -0.2672722 , -0.20681747, + -0.62488429, -0.30798858, -0.27497271, -0.21896842, -0.25978208, + -0.23514749, -0.57618899, -0.27109582, -0.30798858, -0.50947546, + -0.40807333, -0.37518169, -0.22211425, -0.39130036, -0.25978208, + -0.41381347, -0.2385213 , -0.19533116, -0.92350689, -0.22211425, + -0.28288904, -0.28692985, 1.5730846 , -0.20388497, -0.28288904, + -0.40807333, -0.31237627, -0.3036605 , -0.28692985, -0.27109582, + -0.20388497, -0.28692985, -0.37518169, -0.27890356, -0.25978208, + -0.26350118, -0.40240831, -0.2454118 , -0.2454118 , -0.26350118, + -0.22530448, -0.57618899, -0.46253505, -0.21280762, -0.34971301, + -0.27890356, -0.27890356, -0.25249702, -0.26350118, -0.39130036, + -0.39130036, -0.28288904, -0.25249702, -0.90063409, -0.27497271, + -0.5456246 , -0.3259044 , -0.3259044 , -0.3036605 , -0.41962951, + -0.52366614, -0.35467084, -0.28288904, -0.21280762, -0.2158664 , + 1.63703418, 2.30570989, -0.24194253, -0.40240831, -0.2158664 , + -0.25611424, -0.34971301, -0.22211425, -0.20099345, -0.44366892, + -0.52366614, -0.33999576, -0.20388497, -0.18447466, -0.25611424, + -0.59203547, -0.43149255, -0.25611424, -0.49563627, -0.32133344, + -0.41962951, -0.42552227, -0.56840788, -0.21896842, -0.20681747, + -0.17672552, -0.33053793, -0.15987433, 2.9768074 , -0.21280762, + -0.17173916, -0.53821445, -0.27109582, -0.19814236, -0.3259044 , + -0.35467084, -0.48884654, -0.24194253, -0.3036605 , -0.2454118 , + -0.28692985, -0.75249089, 2.35983933, -0.21280762, 1.55726719, + -0.37518169, -0.2158664 , -0.67712261, 3.23165236, -0.67712261, + -0.62488429, -0.17672552, -0.27497271, -0.46253505, -0.24892975, + -0.59203547, 1.83482464, -0.49563627, 
-0.31237627, -1.83652534, + -0.39681759, -0.31237627, -0.28288904, -0.43149255, -0.41962951, + -0.61652596, 3.63983609, 2.65398426, -0.2454118 , -1.16171662, + -0.45616505, -0.27497271, -0.25249702, -0.28288904, -0.20681747, + 2.71015945, -0.3259044 , -0.41381347, -0.2672722 , -0.3259044 , + -0.44366892, 1.68567947, -0.22853969, -0.2454118 , -0.28692985, + -0.60826548, -0.34971301, -0.34971301, 1.2290223 , -0.71397735, + -0.20681747, -0.18713159, -0.1898263 , -0.2158664 , -0.23514749, + -0.2454118 , -0.52366614, -0.3647888 , -0.34482158, -0.48214234, + 3.41271513, -0.34971301, -0.1898263 , -0.41962951, -0.37518169, + -0.62488429, -0.43149255, -0.22853969, -0.30798858, -0.3352348 , + -0.25611424, -0.3259044 , -0.28692985, -0.30798858, -0.57618899, + -0.48884654, -0.38585584, 1.68567947, -0.48884654, -1.28709718, + -0.43149255, -0.37518169, -0.37518169, -0.35467084, -0.3647888 , + -0.40240831, -0.53821445, -0.69534436, -0.24892975, -0.29939131, + -0.2454118 , 0.70366797, -0.30798858, -0.2672722 , -0.43149255, + -0.2097915 , -0.21280762, -0.17672552, -0.43149255, 2.9768074 , + -0.37518169, -0.21896842, -0.40240831, -0.33053793, -0.32133344, + 2.82351017, -0.3036605 , -0.38048321, -0.35467084, -0.33999576, + -1.21650102, -0.48214234, -0.3259044 , -0.40807333, -0.2097915 , + -0.5456246 , -0.5456246 , -0.35467084, -0.20099345, -0.2672722 , + -0.40240831, -0.23514749, -0.56840788, -0.71397735, -0.27497271, + -2.18250381]) + np.testing.assert_allclose(results.resid_deviance, + [-0.29387552, -0.2857098 , -0.24455876, -0.37803944, -0.44609851, + -0.40514674, -0.31088148, -0.79449324, -0.39409528, 0.47049798, + -1.00668653, 1.48698001, -0.35757692, -0.30654405, -0.20043547, + -0.32882173, -0.20622595, -0.26249995, -0.23106769, -0.32424676, + -0.83437766, 2.28941155, -0.26249995, -0.57644334, -0.35262564, + -0.18139734, -0.20331052, -0.17629229, -0.69186337, -0.30654405, + -0.20622595, -0.33345774, -0.251588 , -0.23106769, -0.17379306, + -0.83437766, 1.78479093, -0.38867448, 
-0.4974393 , 1.65565332, + -1.43660134, -0.23106769, -0.20918228, -0.38332275, -0.34291558, + -0.88609006, -0.15281596, -0.18139734, -0.82428104, -0.23106769, + -0.61571821, -0.23106769, -0.18932865, -0.234371 , 1.94999969, + 1.62970871, -0.2897651 , -0.36259328, -0.38332275, -0.40514674, + 2.19506559, -0.27386827, -0.19480442, -0.17629229, -0.27386827, + -0.24804925, -0.23106769, -0.24804925, -0.39409528, -0.15725009, + 1.7074519 , -0.47114617, -0.35757692, -0.4522457 , -0.16889886, + -0.19480442, -0.37803944, -0.24111595, -0.39409528, -0.59975102, + -0.4522457 , -0.20331052, 1.87422489, -0.251588 , -0.19480442, + -0.5841272 , -0.2897651 , -0.25881274, -0.20622595, -0.24455876, + -0.22142749, -0.53929061, -0.25517563, -0.2897651 , -0.47760126, + -0.38332275, -0.35262564, -0.20918228, -0.36767536, -0.24455876, + -0.38867448, -0.2245965 , -0.18400413, -0.85481866, -0.20918228, + -0.26623785, -0.27002708, 1.40955093, -0.19204738, -0.26623785, + -0.38332275, -0.29387552, -0.2857098 , -0.27002708, -0.25517563, + -0.19204738, -0.27002708, -0.35262564, -0.26249995, -0.24455876, + -0.24804925, -0.37803944, -0.23106769, -0.23106769, -0.24804925, + -0.21218006, -0.53929061, -0.43402996, -0.20043547, -0.32882173, + -0.26249995, -0.26249995, -0.23772023, -0.24804925, -0.36767536, + -0.36767536, -0.26623785, -0.23772023, -0.83437766, -0.25881274, + -0.51106408, -0.30654405, -0.30654405, -0.2857098 , -0.39409528, + -0.49074728, -0.33345774, -0.26623785, -0.20043547, -0.20331052, + 1.46111186, 1.96253843, -0.22780971, -0.37803944, -0.20331052, + -0.24111595, -0.32882173, -0.20918228, -0.18932865, -0.41648237, + -0.49074728, -0.31973217, -0.19204738, -0.17379306, -0.24111595, + -0.55389988, -0.40514674, -0.24111595, -0.46476893, -0.30226435, + -0.39409528, -0.39958581, -0.53211065, -0.20622595, -0.19480442, + -0.16650295, -0.31088148, -0.15064545, 2.39288231, -0.20043547, + -0.16181126, -0.5042114 , -0.25517563, -0.18664773, -0.30654405, + -0.33345774, -0.45846897, -0.22780971, 
-0.2857098 , -0.23106769, + -0.27002708, -0.7007597 , 1.99998811, -0.20043547, 1.39670618, + -0.35262564, -0.20331052, -0.63203077, 2.53733821, -0.63203077, + -0.5841272 , -0.16650295, -0.25881274, -0.43402996, -0.234371 , + -0.55389988, 1.61672923, -0.46476893, -0.29387552, -1.61804148, + -0.37282386, -0.29387552, -0.26623785, -0.40514674, -0.39409528, + -0.57644334, 2.74841605, 2.19506559, -0.23106769, -1.06433539, + -0.42810736, -0.25881274, -0.23772023, -0.26623785, -0.19480442, + 2.23070414, -0.30654405, -0.38867448, -0.251588 , -0.30654405, + -0.41648237, 1.49993075, -0.21521982, -0.23106769, -0.27002708, + -0.5688444 , -0.32882173, -0.32882173, 1.12233423, -0.66569789, + -0.19480442, -0.17629229, -0.17882689, -0.20331052, -0.22142749, + -0.23106769, -0.49074728, -0.34291558, -0.32424676, -0.4522457 , + 2.63395309, -0.32882173, -0.17882689, -0.39409528, -0.35262564, + -0.5841272 , -0.40514674, -0.21521982, -0.2897651 , -0.3152773 , + -0.24111595, -0.30654405, -0.27002708, -0.2897651 , -0.53929061, + -0.45846897, -0.36259328, 1.49993075, -0.45846897, -1.17192274, + -0.40514674, -0.35262564, -0.35262564, -0.33345774, -0.34291558, + -0.37803944, -0.5042114 , -0.64869028, -0.234371 , -0.28170899, + -0.23106769, 0.65629132, -0.2897651 , -0.251588 , -0.40514674, + -0.19760028, -0.20043547, -0.16650295, -0.40514674, 2.39288231, + -0.35262564, -0.20622595, -0.37803944, -0.31088148, -0.30226435, + 2.30104857, -0.2857098 , -0.35757692, -0.33345774, -0.31973217, + -1.11158678, -0.4522457 , -0.30654405, -0.38332275, -0.19760028, + -0.51106408, -0.51106408, -0.33345774, -0.18932865, -0.251588 , + -0.37803944, -0.22142749, -0.53211065, -0.66569789, -0.25881274, + -1.87550882]) + np.testing.assert_allclose(results.null, + [ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 
0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 
0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759]) + self.assertAlmostEqual(results.D2, .200712816165) + self.assertAlmostEqual(results.adj_D2, 0.19816731557930456) + + + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py new file 
mode 100644 index 0000000..0789675 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py @@ -0,0 +1,350 @@ + +from __future__ import absolute_import, print_function +import numpy as np +import warnings + + +def _bit_length_26(x): + if x == 0: + return 0 + elif x == 1: + return 1 + else: + return len(bin(x)) - 2 + + +try: + from scipy.lib._version import NumpyVersion +except ImportError: + import re + string_types = basestring + + class NumpyVersion(): + """Parse and compare numpy version strings. + Numpy has the following versioning scheme (numbers given are examples; they + can be >9) in principle): + - Released version: '1.8.0', '1.8.1', etc. + - Alpha: '1.8.0a1', '1.8.0a2', etc. + - Beta: '1.8.0b1', '1.8.0b2', etc. + - Release candidates: '1.8.0rc1', '1.8.0rc2', etc. + - Development versions: '1.8.0.dev-f1234afa' (git commit hash appended) + - Development versions after a1: '1.8.0a1.dev-f1234afa', + '1.8.0b2.dev-f1234afa', + '1.8.1rc1.dev-f1234afa', etc. + - Development versions (no git hash available): '1.8.0.dev-Unknown' + Comparing needs to be done against a valid version string or other + `NumpyVersion` instance. + Parameters + ---------- + vstring : str + Numpy version string (``np.__version__``). + Notes + ----- + All dev versions of the same (pre-)release compare equal. + Examples + -------- + >>> from scipy.lib._version import NumpyVersion + >>> if NumpyVersion(np.__version__) < '1.7.0': + ... 
print('skip') + skip + >>> NumpyVersion('1.7') # raises ValueError, add ".0" + """ + + def __init__(self, vstring): + self.vstring = vstring + ver_main = re.match(r'\d[.]\d+[.]\d+', vstring) + if not ver_main: + raise ValueError("Not a valid numpy version string") + + self.version = ver_main.group() + self.major, self.minor, self.bugfix = [int(x) for x in + self.version.split('.')] + if len(vstring) == ver_main.end(): + self.pre_release = 'final' + else: + alpha = re.match(r'a\d', vstring[ver_main.end():]) + beta = re.match(r'b\d', vstring[ver_main.end():]) + rc = re.match(r'rc\d', vstring[ver_main.end():]) + pre_rel = [m for m in [alpha, beta, rc] if m is not None] + if pre_rel: + self.pre_release = pre_rel[0].group() + else: + self.pre_release = '' + + self.is_devversion = bool(re.search(r'.dev-', vstring)) + + def _compare_version(self, other): + """Compare major.minor.bugfix""" + if self.major == other.major: + if self.minor == other.minor: + if self.bugfix == other.bugfix: + vercmp = 0 + elif self.bugfix > other.bugfix: + vercmp = 1 + else: + vercmp = -1 + elif self.minor > other.minor: + vercmp = 1 + else: + vercmp = -1 + elif self.major > other.major: + vercmp = 1 + else: + vercmp = -1 + + return vercmp + + def _compare_pre_release(self, other): + """Compare alpha/beta/rc/final.""" + if self.pre_release == other.pre_release: + vercmp = 0 + elif self.pre_release == 'final': + vercmp = 1 + elif other.pre_release == 'final': + vercmp = -1 + elif self.pre_release > other.pre_release: + vercmp = 1 + else: + vercmp = -1 + + return vercmp + + def _compare(self, other): + if not isinstance(other, (string_types, NumpyVersion)): + raise ValueError("Invalid object to compare with NumpyVersion.") + + if isinstance(other, string_types): + other = NumpyVersion(other) + + vercmp = self._compare_version(other) + if vercmp == 0: + # Same x.y.z version, check for alpha/beta/rc + vercmp = self._compare_pre_release(other) + if vercmp == 0: + # Same version and same pre-release, 
check if dev version + if self.is_devversion is other.is_devversion: + vercmp = 0 + elif self.is_devversion: + vercmp = -1 + else: + vercmp = 1 + + return vercmp + + def __lt__(self, other): + return self._compare(other) < 0 + + def __le__(self, other): + return self._compare(other) <= 0 + + def __eq__(self, other): + return self._compare(other) == 0 + + def __ne__(self, other): + return self._compare(other) != 0 + + def __gt__(self, other): + return self._compare(other) > 0 + + def __ge__(self, other): + return self._compare(other) >= 0 + + def __repr(self): + return "NumpyVersion(%s)" % self.vstring + + +def _next_regular(target): + """ + Find the next regular number greater than or equal to target. + Regular numbers are composites of the prime factors 2, 3, and 5. + Also known as 5-smooth numbers or Hamming numbers, these are the optimal + size for inputs to FFTPACK. + Target must be a positive integer. + """ + if target <= 6: + return target + + # Quickly check if it's already a power of 2 + if not (target & (target - 1)): + return target + + match = float('inf') # Anything found will be smaller + p5 = 1 + while p5 < target: + p35 = p5 + while p35 < target: + # Ceiling integer division, avoiding conversion to float + # (quotient = ceil(target / p35)) + quotient = -(-target // p35) + # Quickly find next power of 2 >= quotient + try: + p2 = 2 ** ((quotient - 1).bit_length()) + except AttributeError: + # Fallback for Python <2.7 + p2 = 2 ** _bit_length_26(quotient - 1) + + N = p2 * p35 + if N == target: + return N + elif N < match: + match = N + p35 *= 3 + if p35 == target: + return p35 + if p35 < match: + match = p35 + p5 *= 5 + if p5 == target: + return p5 + if p5 < match: + match = p5 + return match +if NumpyVersion(np.__version__) >= '1.7.1': + np_matrix_rank = np.linalg.matrix_rank +else: + def np_matrix_rank(M, tol=None): + """ + Return matrix rank of array using SVD method + Rank of the array is the number of SVD singular values of the array that are + 
greater than `tol`. + Parameters + ---------- + M : {(M,), (M, N)} array_like + array of <=2 dimensions + tol : {None, float}, optional + threshold below which SVD values are considered zero. If `tol` is + None, and ``S`` is an array with singular values for `M`, and + ``eps`` is the epsilon value for datatype of ``S``, then `tol` is + set to ``S.max() * max(M.shape) * eps``. + Notes + ----- + The default threshold to detect rank deficiency is a test on the magnitude + of the singular values of `M`. By default, we identify singular values less + than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with + the symbols defined above). This is the algorithm MATLAB uses [1]. It also + appears in *Numerical recipes* in the discussion of SVD solutions for linear + least squares [2]. + This default threshold is designed to detect rank deficiency accounting for + the numerical errors of the SVD computation. Imagine that there is a column + in `M` that is an exact (in floating point) linear combination of other + columns in `M`. Computing the SVD on `M` will not produce a singular value + exactly equal to 0 in general: any difference of the smallest SVD value from + 0 will be caused by numerical imprecision in the calculation of the SVD. + Our threshold for small SVD values takes this numerical imprecision into + account, and the default threshold will detect such numerical rank + deficiency. The threshold may declare a matrix `M` rank deficient even if + the linear combination of some columns of `M` is not exactly equal to + another column of `M` but only numerically very close to another column of + `M`. + We chose our default threshold because it is in wide use. Other thresholds + are possible. For example, elsewhere in the 2007 edition of *Numerical + recipes* there is an alternative threshold of ``S.max() * + np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe + this threshold as being based on "expected roundoff error" (p 71). 
+ The thresholds above deal with floating point roundoff error in the + calculation of the SVD. However, you may have more information about the + sources of error in `M` that would make you consider other tolerance values + to detect *effective* rank deficiency. The most useful measure of the + tolerance depends on the operations you intend to use on your matrix. For + example, if your data come from uncertain measurements with uncertainties + greater than floating point epsilon, choosing a tolerance near that + uncertainty may be preferable. The tolerance may be absolute if the + uncertainties are absolute rather than relative. + References + ---------- + .. [1] MATLAB reference documention, "Rank" + http://www.mathworks.com/help/techdoc/ref/rank.html + .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery, + "Numerical Recipes (3rd edition)", Cambridge University Press, 2007, + page 795. + Examples + -------- + >>> from numpy.linalg import matrix_rank + >>> matrix_rank(np.eye(4)) # Full rank matrix + 4 + >>> I=np.eye(4); I[-1,-1] = 0. 
# rank deficient matrix + >>> matrix_rank(I) + 3 + >>> matrix_rank(np.ones((4,))) # 1 dimension - rank 1 unless all 0 + 1 + >>> matrix_rank(np.zeros((4,))) + 0 + """ + M = np.asarray(M) + if M.ndim > 2: + raise TypeError('array should have 2 or fewer dimensions') + if M.ndim < 2: + return int(not all(M == 0)) + S = np.linalg.svd(M, compute_uv=False) + if tol is None: + tol = S.max() * max(M.shape) * np.finfo(S.dtype).eps + return np.sum(S > tol) + + + +class CacheWriteWarning(UserWarning): + pass + +class CachedAttribute(object): + + def __init__(self, func, cachename=None, resetlist=None): + self.fget = func + self.name = func.__name__ + self.cachename = cachename or '_cache' + self.resetlist = resetlist or () + + def __get__(self, obj, type=None): + if obj is None: + return self.fget + # Get the cache or set a default one if needed + _cachename = self.cachename + _cache = getattr(obj, _cachename, None) + if _cache is None: + setattr(obj, _cachename, resettable_cache()) + _cache = getattr(obj, _cachename) + # Get the name of the attribute to set and cache + name = self.name + _cachedval = _cache.get(name, None) + # print("[_cachedval=%s]" % _cachedval) + if _cachedval is None: + # Call the "fget" function + _cachedval = self.fget(obj) + # Set the attribute in obj + # print("Setting %s in cache to %s" % (name, _cachedval)) + try: + _cache[name] = _cachedval + except KeyError: + setattr(_cache, name, _cachedval) + # Update the reset list if needed (and possible) + resetlist = self.resetlist + if resetlist is not (): + try: + _cache._resetdict[name] = self.resetlist + except AttributeError: + pass + # else: + # print("Reading %s from cache (%s)" % (name, _cachedval)) + return _cachedval + + def __set__(self, obj, value): + errmsg = "The attribute '%s' cannot be overwritten" % self.name + warnings.warn(errmsg, CacheWriteWarning) + + +class _cache_readonly(object): + """ + Decorator for CachedAttribute + """ + + def __init__(self, cachename=None, resetlist=None): + 
self.func = None + self.cachename = cachename + self.resetlist = resetlist or None + + def __call__(self, func): + return CachedAttribute(func, + cachename=self.cachename, + resetlist=self.resetlist) +cache_readonly = _cache_readonly() + + diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/varfuncs.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/varfuncs.py new file mode 100644 index 0000000..af66d8c --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/varfuncs.py @@ -0,0 +1,284 @@ +""" +Variance functions for use with the link functions in statsmodels.family.links +""" + +__docformat__ = 'restructuredtext' + +import numpy as np +FLOAT_EPS = np.finfo(float).eps + +class VarianceFunction(object): + """ + Relates the variance of a random variable to its mean. Defaults to 1. + + Methods + ------- + call + Returns an array of ones that is the same shape as `mu` + + Notes + ----- + After a variance function is initialized, its call method can be used. + + Alias for VarianceFunction: + constant = VarianceFunction() + + See also + -------- + statsmodels.family.family + """ + + def __call__(self, mu): + """ + Default variance function + + Parameters + ----------- + mu : array-like + mean parameters + + Returns + ------- + v : array + ones(mu.shape) + """ + mu = np.asarray(mu) + return np.ones(mu.shape, np.float64) + + + def deriv(self, mu): + """ + Derivative of the variance function v'(mu) + """ + from statsmodels.tools.numdiff import approx_fprime_cs + # TODO: diag workaround proplem with numdiff for 1d + return np.diag(approx_fprime_cs(mu, self)) + + +constant = VarianceFunction() +constant.__doc__ = """ +The call method of constant returns a constant variance, i.e., a vector of ones. 
+ +constant is an alias of VarianceFunction() +""" + +class Power(object): + """ + Power variance function + + Parameters + ---------- + power : float + exponent used in power variance function + + Methods + ------- + call + Returns the power variance + + Formulas + -------- + V(mu) = numpy.fabs(mu)**power + + Notes + ----- + Aliases for Power: + mu = Power() + mu_squared = Power(power=2) + mu_cubed = Power(power=3) + """ + + def __init__(self, power=1.): + self.power = power + + def __call__(self, mu): + """ + Power variance function + + Parameters + ---------- + mu : array-like + mean parameters + + Returns + ------- + variance : array + numpy.fabs(mu)**self.power + """ + return np.power(np.fabs(mu), self.power) + + + def deriv(self, mu): + """ + Derivative of the variance function v'(mu) + """ + from statsmodels.tools.numdiff import approx_fprime_cs, approx_fprime + #return approx_fprime_cs(mu, self) # TODO fix breaks in `fabs + # TODO: diag is workaround problem with numdiff for 1d + return np.diag(approx_fprime(mu, self)) + + +mu = Power() +mu.__doc__ = """ +Returns np.fabs(mu) + +Notes +----- +This is an alias of Power() +""" +mu_squared = Power(power=2) +mu_squared.__doc__ = """ +Returns np.fabs(mu)**2 + +Notes +----- +This is an alias of statsmodels.family.links.Power(power=2) +""" +mu_cubed = Power(power=3) +mu_cubed.__doc__ = """ +Returns np.fabs(mu)**3 + +Notes +----- +This is an alias of statsmodels.family.links.Power(power=3) +""" + +class Binomial(object): + """ + Binomial variance function + + Parameters + ---------- + n : int, optional + The number of trials for a binomial variable. 
The default is 1 for + p in (0,1) + + Methods + ------- + call + Returns the binomial variance + + Formulas + -------- + V(mu) = p * (1 - p) * n + + where p = mu / n + + Notes + ----- + Alias for Binomial: + binary = Binomial() + + A private method _clean trims the data by machine epsilon so that p is + in (0,1) + """ + + def __init__(self, n=1): + self.n = n + + def _clean(self, p): + return np.clip(p, FLOAT_EPS, 1 - FLOAT_EPS) + + def __call__(self, mu): + """ + Binomial variance function + + Parameters + ----------- + mu : array-like + mean parameters + + Returns + ------- + variance : array + variance = mu/n * (1 - mu/n) * self.n + """ + p = self._clean(mu / self.n) + return p * (1 - p) * self.n + + #TODO: inherit from super + def deriv(self, mu): + """ + Derivative of the variance function v'(mu) + """ + from statsmodels.tools.numdiff import approx_fprime_cs, approx_fprime + # TODO: diag workaround proplem with numdiff for 1d + return np.diag(approx_fprime_cs(mu, self)) + + +binary = Binomial() +binary.__doc__ = """ +The binomial variance function for n = 1 + +Notes +----- +This is an alias of Binomial(n=1) +""" + +class NegativeBinomial(object): + ''' + Negative binomial variance function + + Parameters + ---------- + alpha : float + The ancillary parameter for the negative binomial variance function. + `alpha` is assumed to be nonstochastic. The default is 1. 
+ + Methods + ------- + call + Returns the negative binomial variance + + Formulas + -------- + V(mu) = mu + alpha*mu**2 + + Notes + ----- + Alias for NegativeBinomial: + nbinom = NegativeBinomial() + + A private method _clean trims the data by machine epsilon so that p is + in (0,inf) + ''' + + def __init__(self, alpha=1.): + self.alpha = alpha + + def _clean(self, p): + return np.clip(p, FLOAT_EPS, np.inf) + + def __call__(self, mu): + """ + Negative binomial variance function + + Parameters + ---------- + mu : array-like + mean parameters + + Returns + ------- + variance : array + variance = mu + alpha*mu**2 + """ + p = self._clean(mu) + return p + self.alpha*p**2 + + def deriv(self, mu): + """ + Derivative of the negative binomial variance function. + """ + + p = self._clean(mu) + return 1 + 2 * self.alpha * p + +nbinom = NegativeBinomial() +nbinom.__doc__ = """ +Negative Binomial variance function. + +Notes +----- +This is an alias of NegativeBinomial(alpha=1.) +""" diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/__init__.py new file mode 100644 index 0000000..f7a77b2 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/__init__.py @@ -0,0 +1 @@ +from base import * diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/__init__.py new file mode 100644 index 0000000..eeb63b3 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/__init__.py @@ -0,0 +1,4 @@ +import gwr +import sel_bw +import diagnostics +import kernels diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py new file mode 100644 index 0000000..7fbcdc4 --- /dev/null +++ 
b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py @@ -0,0 +1,81 @@ +""" +Diagnostics for estimated gwr modesl +""" +__author__ = "Taylor Oshan tayoshan@gmail.com" + +import numpy as np +from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial + +def get_AICc(gwr): + """ + Get AICc value + + Gaussian: p61, (2.33), Fotheringham, Brunsdon and Charlton (2002) + + GWGLM: AICc=AIC+2k(k+1)/(n-k-1), Nakaya et al. (2005): p2704, (36) + + """ + n = gwr.n + k = gwr.tr_S + if isinstance(gwr.family, Gaussian): + aicc = -2.0*gwr.llf + 2.0*n*(k + 1.0)/(n-k-2.0) + elif isinstance(gwr.family, (Poisson, Binomial)): + aicc = get_AIC(gwr) + 2.0 * k * (k+1.0) / (n - k - 1.0) + return aicc + +def get_AIC(gwr): + """ + Get AIC calue + + Gaussian: p96, (4.22), Fotheringham, Brunsdon and Charlton (2002) + + GWGLM: AIC(G)=D(G) + 2K(G), where D and K denote the deviance and the effective + number of parameters in the model with bandwidth G, respectively. + + """ + k = gwr.tr_S + #deviance = -2*log-likelihood + y = gwr.y + mu = gwr.mu + if isinstance(gwr.family, Gaussian): + aic = -2.0 * gwr.llf + 2.0 * (k+1) + elif isinstance(gwr.family, (Poisson, Binomial)): + aic = np.sum(gwr.family.resid_dev(y, mu)**2) + 2.0 * k + return aic + +def get_BIC(gwr): + """ + Get BIC value + + Gaussian: p61 (2.34), Fotheringham, Brunsdon and Charlton (2002) + BIC = -2log(L)+klog(n) + + GWGLM: BIC = dev + tr_S * log(n) + + """ + n = gwr.n # (scalar) number of observations + k = gwr.tr_S + y = gwr.y + mu = gwr.mu + if isinstance(gwr.family, Gaussian): + bic = -2.0 * gwr.llf + (k+1) * np.log(n) + elif isinstance(gwr.family, (Poisson, Binomial)): + bic = np.sum(gwr.family.resid_dev(y, mu)**2) + k * np.log(n) + return bic + +def get_CV(gwr): + """ + Get CV value + + Gaussian only + + Methods: p60, (2.31) or p212 (9.4) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying relationships. 
+ Modification: sum of residual squared is divided by n according to GWR4 results + + """ + aa = gwr.resid_response.reshape((-1,1))/(1.0-gwr.influ) + cv = np.sum(aa**2)/gwr.n + return cv + diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/gwr.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/gwr.py new file mode 100644 index 0000000..6d5257f --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/gwr.py @@ -0,0 +1,1086 @@ +#Main GWR classes + +#Offset does not yet do anyhting and needs to be implemented + +__author__ = "Taylor Oshan Tayoshan@gmail.com" + +import numpy as np +import numpy.linalg as la +from scipy.stats import t +from kernels import * +from diagnostics import get_AIC, get_AICc, get_BIC +import pysal.spreg.user_output as USER +from crankshaft.regression.glm.family import Gaussian, Binomial, Poisson +from crankshaft.regression.glm.glm import GLM, GLMResults +from crankshaft.regression.glm.iwls import iwls +from crankshaft.regression.glm.utils import cache_readonly + +fk = {'gaussian': fix_gauss, 'bisquare': fix_bisquare, 'exponential': fix_exp} +ak = {'gaussian': adapt_gauss, 'bisquare': adapt_bisquare, 'exponential': adapt_exp} + +class GWR(GLM): + """ + Geographically weighted regression. Can currently estimate Gaussian, + Poisson, and logistic models(built on a GLM framework). GWR object prepares + model input. Fit method performs estimation and returns a GWRResults object. 
+ + Parameters + ---------- + coords : array-like + n*2, collection of n sets of (x,y) coordinates of + observatons; also used as calibration locations is + 'points' is set to None + + y : array + n*1, dependent variable + + X : array + n*k, independent variable, exlcuding the constant + + points : array-like + n*2, collection of n sets of (x,y) coordinates used for + calibration locations; default is set to None, which + uses every observation as a calibration point + + bw : scalar + bandwidth value consisting of either a distance or N + nearest neighbors; user specified or obtained using + Sel_BW + + family : family object + underlying probability model; provides + distribution-specific calculations + + offset : array + n*1, the offset variable at the ith location. For Poisson model + this term is often the size of the population at risk or + the expected size of the outcome in spatial epidemiology + Default is None where Ni becomes 1.0 for all locations; + only for Poisson models + + sigma2_v1 : boolean + specify sigma squared, True to use n as denominator; + default is False which uses n-k + + kernel : string + type of kernel function used to weight observations; + available options: + 'gaussian' + 'bisquare' + 'exponential' + + fixed : boolean + True for distance based kernel function and False for + adaptive (nearest neighbor) kernel function (default) + + constant : boolean + True to include intercept (default) in model and False to exclude + intercept. 
+ + Attributes + ---------- + coords : array-like + n*2, collection of n sets of (x,y) coordinates used for + calibration locations + + y : array + n*1, dependent variable + + X : array + n*k, independent variable, exlcuding the constant + + bw : scalar + bandwidth value consisting of either a distance or N + nearest neighbors; user specified or obtained using + Sel_BW + + family : family object + underlying probability model; provides + distribution-specific calculations + + offset : array + n*1, the offset variable at the ith location. For Poisson model + this term is often the size of the population at risk or + the expected size of the outcome in spatial epidemiology + Default is None where Ni becomes 1.0 for all locations + + sigma2_v1 : boolean + specify sigma squared, True to use n as denominator; + default is False which uses n-k + + kernel : string + type of kernel function used to weight observations; + available options: + 'gaussian' + 'bisquare' + 'exponential' + + fixed : boolean + True for distance based kernel function and False for + adaptive (nearest neighbor) kernel function (default) + + constant : boolean + True to include intercept (default) in model and False to exclude + intercept + + n : integer + number of observations + + k : integer + number of independent variables + + mean_y : float + mean of y + + std_y : float + standard deviation of y + + fit_params : dict + parameters passed into fit method to define estimation + routine + + W : array + n*n, spatial weights matrix for weighting all + observations from each calibration point + """ + def __init__(self, coords, y, X, bw, family=Gaussian(), offset=None, + sigma2_v1=False, kernel='bisquare', fixed=False, constant=True): + """ + Initialize class + """ + GLM.__init__(self, y, X, family, constant=constant) + self.constant = constant + self.sigma2_v1 = sigma2_v1 + self.coords = coords + self.bw = bw + self.kernel = kernel + self.fixed = fixed + if offset is None: + self.offset = 
np.ones((self.n, 1)) + else: + self.offset = offset * 1.0 + self.fit_params = {} + self.W = self._build_W(fixed, kernel, coords, bw) + self.points = None + self.exog_scale = None + self.exog_resid = None + self.P = None + + def _build_W(self, fixed, kernel, coords, bw, points=None): + if fixed: + try: + W = fk[kernel](coords, bw, points) + except: + raise TypeError('Unsupported kernel function ', kernel) + else: + try: + W = ak[kernel](coords, bw, points) + except: + raise TypeError('Unsupported kernel function ', kernel) + + return W + + def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'): + """ + Method that fits a model with a particular estimation routine. + + Parameters + ---------- + + ini_betas : array + k*1, initial coefficient values, including constant. + Default is None, which calculates initial values during + estimation + tol: float + Tolerence for estimation convergence + max_iter : integer + Maximum number of iterations if convergence not + achieved + solve : string + Technique to solve MLE equations. 
+ 'iwls' = iteratively (re)weighted least squares (default) + """ + self.fit_params['ini_params'] = ini_params + self.fit_params['tol'] = tol + self.fit_params['max_iter'] = max_iter + self.fit_params['solve']= solve + if solve.lower() == 'iwls': + m = self.W.shape[0] + params = np.zeros((m, self.k)) + predy = np.zeros((m, 1)) + v = np.zeros((m, 1)) + w = np.zeros((m, 1)) + z = np.zeros((self.n, self.n)) + S = np.zeros((self.n, self.n)) + R = np.zeros((self.n, self.n)) + CCT = np.zeros((m, self.k)) + #f = np.zeros((n, n)) + p = np.zeros((m, 1)) + for i in range(m): + wi = self.W[i].reshape((-1,1)) + rslt = iwls(self.y, self.X, self.family, self.offset, + ini_params, tol, max_iter, wi=wi) + params[i,:] = rslt[0].T + predy[i] = rslt[1][i] + v[i] = rslt[2][i] + w[i] = rslt[3][i] + z[i] = rslt[4].flatten() + R[i] = np.dot(self.X[i], rslt[5]) + ri = np.dot(self.X[i], rslt[5]) + S[i] = ri*np.reshape(rslt[4].flatten(), (1,-1)) + #dont need unless f is explicitly passed for + #prediction of non-sampled points + #cf = rslt[5] - np.dot(rslt[5], f) + #CCT[i] = np.diag(np.dot(cf, cf.T/rslt[3])) + CCT[i] = np.diag(np.dot(rslt[5], rslt[5].T)) + S = S * (1.0/z) + return GWRResults(self, params, predy, S, CCT, w) + + def predict(self, points, P, exog_scale=None, exog_resid=None, fit_params={}): + """ + Method that predicts values of the dependent variable at un-sampled + locations + + Parameters + ---------- + points : array-like + n*2, collection of n sets of (x,y) coordinates used for + calibration prediction locations + P : array + n*k, independent variables used to make prediction; + exlcuding the constant + exog_scale : scalar + estimated scale using sampled locations; defualt is None + which estimates a model using points from "coords" + exog_resid : array-like + estimated residuals using sampled locations; defualt is None + which estimates a model using points from "coords"; if + given it must be n*1 where n is the length of coords + fit_params : dict + key-value pairs of 
parameters that will be passed into fit method to define estimation + routine; see fit method for more details + + """ + if (exog_scale is None) & (exog_resid is None): + train_gwr = self.fit(**fit_params) + self.exog_scale = train_gwr.scale + self.exog_resid = train_gwr.resid_response + elif (exog_scale is not None) & (exog_resid is not None): + self.exog_scale = exog_scale + self.exog_resid = exog_resid + else: + raise InputError('exog_scale and exog_resid must both either be' + 'None or specified') + self.points = points + if self.constant: + P = np.hstack([np.ones((len(P),1)), P]) + self.P = P + else: + self.P = P + self.W = self._build_W(self.fixed, self.kernel, self.coords, self.bw, points) + gwr = self.fit(**fit_params) + + return gwr + + @cache_readonly + def df_model(self): + raise NotImplementedError('Only computed for fitted model in GWRResults') + + @cache_readonly + def df_resid(self): + raise NotImplementedError('Only computed for fitted model in GWRResults') + +class GWRResults(GLMResults): + """ + Basic class including common properties for all GWR regression models + + Parameters + ---------- + model : GWR object + pointer to GWR object with estimation parameters + + params : array + n*k, estimated coefficients + + predy : array + n*1, predicted y values + + w : array + n*1, final weight used for iteratively re-weighted least + sqaures; default is None + + S : array + n*n, hat matrix + + CCT : array + n*k, scaled variance-covariance matrix + + Attributes + ---------- + model : GWR Object + points to GWR object for which parameters have been + estimated + + params : array + n*k, parameter estimates + + predy : array + n*1, predicted value of y + + y : array + n*1, dependent variable + + X : array + n*k, independent variable, including constant + + family : family object + underlying probability model; provides + distribution-specific calculations + + n : integer + number of observations + + k : integer + number of independent variables + + df_model 
: integer + model degrees of freedom + + df_resid : integer + residual degrees of freedom + + offset : array + n*1, the offset variable at the ith location. + For Poisson model this term is often the size of + the population at risk or the expected size of + the outcome in spatial epidemiology; Default is + None where Ni becomes 1.0 for all locations + + scale : float + sigma squared used for subsequent computations + + w : array + n*1, final weights from iteratively re-weighted least + sqaures routine + + resid_response : array + n*1, residuals of the repsonse + + resid_ss : scalar + residual sum of sqaures + + W : array + n*n; spatial weights for each observation from each + calibration point + + S : array + n*n, hat matrix + + CCT : array + n*k, scaled variance-covariance matrix + + tr_S : float + trace of S (hat) matrix + + tr_STS : float + trace of STS matrix + + tr_SWSTW : float + trace of weighted STS matrix; weights are those output + from iteratively weighted least sqaures (not spatial + weights) + + y_bar : array + n*1, weighted mean value of y + + TSS : array + n*1, geographically weighted total sum of squares + + RSS : array + n*1, geographically weighted residual sum of squares + + localR2 : array + n*1, local R square + + sigma2_v1 : float + sigma squared, use (n-v1) as denominator + + sigma2_v1v2 : float + sigma squared, use (n-2v1+v2) as denominator + + sigma2_ML : float + sigma squared, estimated using ML + + std_res : array + n*1, standardised residuals + + bse : array + n*k, standard errors of parameters (betas) + + influ : array + n*1, leading diagonal of S matrix + + CooksD : array + n*1, Cook's D + + tvalues : array + n*k, local t-statistics + + adj_alpha : array + 3*1, corrected alpha values to account for multiple + hypothesis testing for the 90%, 95%, and 99% confidence + levels; tvalues with an absolute value larger than the + corrected alpha are considered statistically + significant. 
+ + deviance : array + n*1, local model deviance for each calibration point + + resid_deviance : array + n*1, local sum of residual deviance for each + calibration point + + llf : scalar + log-likelihood of the full model; see + pysal.contrib.glm.family for damily-sepcific + log-likelihoods + + pDev : float + local percent of deviation accounted for; analogous to + r-squared for GLM's + + mu : array + n*, flat one dimensional array of predicted mean + response value from estimator + + fit_params : dict + parameters passed into fit method to define estimation + routine + """ + def __init__(self, model, params, predy, S, CCT, w=None): + GLMResults.__init__(self, model, params, predy, w) + self.W = model.W + self.offset = model.offset + if w is not None: + self.w = w + self.predy = predy + self.S = S + self.CCT = self.cov_params(CCT, model.exog_scale) + self._cache = {} + + @cache_readonly + def resid_ss(self): + u = self.resid_response.flatten() + return np.dot(u, u.T) + + @cache_readonly + def scale(self, scale=None): + if isinstance(self.family, Gaussian): + if self.model.sigma2_v1: + scale = self.sigma2_v1 + else: + scale = self.sigma2_v1v2 + else: + scale = 1.0 + return scale + + def cov_params(self, cov, exog_scale=None): + """ + Returns scaled covariance parameters + Parameters + ---------- + cov : array + estimated covariance parameters + + Returns + ------- + Scaled covariance parameters + + """ + if exog_scale is not None: + return cov*exog_scale + else: + return cov*self.scale + + @cache_readonly + def tr_S(self): + """ + trace of S (hat) matrix + """ + return np.trace(self.S*self.w) + + @cache_readonly + def tr_STS(self): + """ + trace of STS matrix + """ + return np.trace(np.dot(self.S.T*self.w,self.S*self.w)) + + @cache_readonly + def y_bar(self): + """ + weighted mean of y + """ + if self.model.points is not None: + n = len(self.model.points) + else: + n = self.n + off = self.offset.reshape((-1,1)) + arr_ybar = np.zeros(shape=(self.n,1)) + for i in 
range(n): + w_i= np.reshape(np.array(self.W[i]), (-1, 1)) + sum_yw = np.sum(self.y.reshape((-1,1)) * w_i) + arr_ybar[i] = 1.0 * sum_yw / np.sum(w_i*off) + return arr_ybar + + @cache_readonly + def TSS(self): + """ + geographically weighted total sum of squares + + Methods: p215, (9.9) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + + """ + if self.model.points is not None: + n = len(self.model.points) + else: + n = self.n + TSS = np.zeros(shape=(n,1)) + for i in range(n): + TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) * + (self.y.reshape((-1,1)) - self.y_bar[i])**2) + return TSS + + @cache_readonly + def RSS(self): + """ + geographically weighted residual sum of squares + + Methods: p215, (9.10) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + """ + if self.model.points is not None: + n = len(self.model.points) + resid = self.model.exog_resid.reshape((-1,1)) + else: + n = self.n + resid = self.resid_response.reshape((-1,1)) + RSS = np.zeros(shape=(n,1)) + for i in range(n): + RSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) + * resid**2) + return RSS + + @cache_readonly + def localR2(self): + """ + local R square + + Methods: p215, (9.8) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + """ + if isinstance(self.family, Gaussian): + return (self.TSS - self.RSS)/self.TSS + else: + raise NotImplementedError('Only applicable to Gaussian') + + @cache_readonly + def sigma2_v1(self): + """ + residual variance + + Methods: p214, (9.6), + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. 
+ + only use v1 + """ + return (self.resid_ss/(self.n-self.tr_S)) + + @cache_readonly + def sigma2_v1v2(self): + """ + residual variance + + Methods: p55 (2.16)-(2.18) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + + use v1 and v2 #used in GWR4 + """ + if isinstance(self.family, (Poisson, Binomial)): + return self.resid_ss/(self.n - 2.0*self.tr_S + + self.tr_STS) #could be changed to SWSTW - nothing to test against + else: + return self.resid_ss/(self.n - 2.0*self.tr_S + + self.tr_STS) #could be changed to SWSTW - nothing to test against + @cache_readonly + def sigma2_ML(self): + """ + residual variance + + Methods: maximum likelihood + """ + return self.resid_ss/self.n + + @cache_readonly + def std_res(self): + """ + standardized residuals + + Methods: p215, (9.7) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + """ + return self.resid_response.reshape((-1,1))/(np.sqrt(self.scale * (1.0 - self.influ))) + + @cache_readonly + def bse(self): + """ + standard errors of Betas + + Methods: p215, (2.15) and (2.21) + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. + """ + return np.sqrt(self.CCT) + + @cache_readonly + def influ(self): + """ + Influence: leading diagonal of S Matrix + """ + return np.reshape(np.diag(self.S),(-1,1)) + + @cache_readonly + def cooksD(self): + """ + Influence: leading diagonal of S Matrix + + Methods: p216, (9.11), + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying + relationships. 
+ Note: in (9.11), p should be tr(S), that is, the effective number of parameters + """ + return self.std_res**2 * self.influ / (self.tr_S * (1.0-self.influ)) + + @cache_readonly + def deviance(self): + off = self.offset.reshape((-1,1)).T + y = self.y + ybar = self.y_bar + if isinstance(self.family, Gaussian): + raise NotImplementedError('deviance not currently used for Gaussian') + elif isinstance(self.family, Poisson): + dev = np.sum(2.0*self.W*(y*np.log(y/(ybar*off))-(y-ybar*off)),axis=1) + elif isinstance(self.family, Binomial): + dev = self.family.deviance(self.y, self.y_bar, self.W, axis=1) + return dev.reshape((-1,1)) + + @cache_readonly + def resid_deviance(self): + if isinstance(self.family, Gaussian): + raise NotImplementedError('deviance not currently used for Gaussian') + else: + off = self.offset.reshape((-1,1)).T + y = self.y + ybar = self.y_bar + global_dev_res = ((self.family.resid_dev(self.y, self.mu))**2) + dev_res = np.repeat(global_dev_res.flatten(),self.n) + dev_res = dev_res.reshape((self.n, self.n)) + dev_res = np.sum(dev_res * self.W.T, axis=0) + return dev_res.reshape((-1,1)) + + @cache_readonly + def pDev(self): + """ + Local percentage of deviance accounted for. Described in the GWR4 + manual. Equivalent to 1 - (deviance/null deviance) + """ + if isinstance(self.family, Gaussian): + raise NotImplementedError('Not implemented for Gaussian') + else: + return 1.0 - (self.resid_deviance/self.deviance) + + @cache_readonly + def adj_alpha(self): + """ + Corrected alpha (critical) values to account for multiple testing during hypothesis + testing. Includes corrected value for 90% (.1), 95% (.05), and 99% + (.01) confidence levels. Correction comes from: + + da Silva, A. R., & Fotheringham, A. S. (2015). The Multiple Testing Issue in + Geographically Weighted Regression. Geographical Analysis. 
+ + """ + alpha = np.array([.1, .05, .001]) + pe = (2.0 * self.tr_S) - self.tr_STS + p = self.k + return (alpha*p)/pe + + def filter_tvals(self, alpha): + """ + Utility function to set tvalues with an absolute value smaller than the + absolute value of the alpha (critical) value to 0 + + Parameters + ---------- + alpha : scalar + critical value to determine which tvalues are + associated with statistically significant parameter + estimates + + Returns + ------- + filtered : array + n*k; new set of n tvalues for each of k variables + where absolute tvalues less than the absolute value of + alpha have been set to 0. + """ + alpha = np.abs(alpha)/2.0 + n = self.n + critical = t.ppf(1-alpha, n-1) + subset = (self.tvalues < critical) & (self.tvalues > -1.0*critical) + tvalues = self.tvalues.copy() + tvalues[subset] = 0 + return tvalues + + @cache_readonly + def df_model(self): + return self.n - self.tr_S + + @cache_readonly + def df_resid(self): + return self.n - 2.0*self.tr_S + self.tr_STS + + @cache_readonly + def normalized_cov_params(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def resid_pearson(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def resid_working(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def resid_anscombe(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def pearson_chi2(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def null(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def llnull(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def null_deviance(self): + raise NotImplementedError('Not implemented for GWR') + + @cache_readonly + def aic(self): + return get_AIC(self) + + @cache_readonly + def aicc(self): + return get_AICc(self) + + @cache_readonly + def bic(self): + return get_BIC(self) + + 
class FBGWR(GWR):
    """
    Flexible-bandwidth GWR: one bandwidth per column of X.

    Parameters
    ----------
    coords        : array-like
                    n*2, collection of n sets of (x,y) coordinates of
                    observations
    y             : array
                    n*1, dependent variable
    X             : array
                    n*k, independent variables, excluding the constant
    bws           : array-like
                    collection of bandwidth values consisting of either a
                    distance or N nearest neighbors; user specified or
                    obtained using Sel_BW with fb=True. Order of values
                    should be the same as the order of columns associated
                    with X
    XB            : array
                    n*k, product of temporary X and params obtained as
                    through-put from the backfitting algorithm used to select
                    flexible bandwidths; product of the Sel_BW class
    err           : array
                    n*1, temporary residuals associated with the predicted
                    values from the backfitting algorithm used to select
                    flexible bandwidths; product of the Sel_BW class
    family        : family object
                    underlying probability model; provides
                    distribution-specific calculations
    offset        : array
                    n*1, the offset variable at the ith location. For Poisson
                    model this term is often the size of the population at
                    risk or the expected size of the outcome in spatial
                    epidemiology. Default is None.
    sigma2_v1     : boolean
                    specify sigma squared, True to use n as denominator;
                    default is False which uses n-k
    kernel        : string
                    type of kernel function used to weight observations;
                    available options: 'gaussian', 'bisquare', 'exponential'
    fixed         : boolean
                    True for distance based kernel function and False for
                    adaptive (nearest neighbor) kernel function (default)
    constant      : boolean
                    True to include intercept (default) in model and False to
                    exclude intercept.

    Attributes
    ----------
    Every constructor argument is stored under the same name. When
    ``constant`` is True, ``X`` is the result of ``USER.check_constant(X)``.

    Examples
    --------
    TODO

    """
    def __init__(self, coords, y, X, bws, XB, err, family=Gaussian(), offset=None,
            sigma2_v1=False, kernel='bisquare', fixed=False, constant=True):
        """
        Initialize class
        """
        self.coords = coords
        self.y = y
        self.X = X
        self.XB = XB
        self.err = err
        self.bws = bws
        self.family = family
        self.offset = offset
        self.sigma2_v1 = sigma2_v1
        self.kernel = kernel
        self.fixed = fixed
        self.constant = constant
        if constant:
            self.X = USER.check_constant(self.X)

    def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
        """
        Method that fits a model with a particular estimation routine.

        One single-covariate GWR (without constant) is fitted per bandwidth
        in ``self.bws``; the response for column i is ``XB[:, i]`` plus the
        residuals carried over from the previous column.

        Parameters
        ----------
        ini_params    : array
                        k*1, initial coefficient values, including constant.
                        Default is None, which calculates initial values
                        during estimation
        tol           : float
                        Tolerance for estimation convergence
        max_iter      : integer
                        Maximum number of iterations if convergence not
                        achieved
        solve         : string
                        Technique to solve MLE equations.
                        'iwls' = iteratively (re)weighted least squares
                        (default)

        Returns
        -------
        FBGWRResults holding the n*k matrix of local estimates.
        """
        # Always allocate floats: np.zeros_like(self.X) inherits an integer
        # dtype from an integer design matrix and would truncate estimates.
        params = np.zeros(np.shape(self.X), dtype=float)
        err = self.err
        for i, bw in enumerate(self.bws):
            # NOTE(review): W is built but never used below -- kept so the
            # call (and any side effect of _build_W) is preserved.
            W = self._build_W(self.fixed, self.kernel, self.coords, bw)
            X = self.X[:,i].reshape((-1,1))
            y = self.XB[:,i].reshape((-1,1)) + err
            model = GWR(self.coords, y, X, bw, self.family, self.offset,
                    self.sigma2_v1, self.kernel, self.fixed, constant=False)
            results = model.fit(ini_params, tol, max_iter, solve)
            params[:,i] = results.params.flatten()
            err = results.resid_response.reshape((-1,1))
        return FBGWRResults(self, params)


class FBGWRResults(object):
    """
    Results of a flexible-bandwidth GWR fit.

    Parameters
    ----------
    model               : FBGWR object
                          pointer to FBGWR object with estimation parameters
    params              : array
                          n*k, estimated coefficients

    Attributes
    ----------
    model               : FBGWR object
                          points to FBGWR object for which parameters have
                          been estimated
    params              : array
                          n*k, parameter estimates
    predy               : array
                          n*1, predicted value of y
    y                   : array
                          n*1, dependent variable
    X                   : array
                          n*k, independent variable, including constant
    resid_response      : array
                          n*1, residuals of response
    resid_ss            : scalar
                          residual sum of squares

    Examples
    --------
    TODO

    """
    def __init__(self, model, params):
        """
        Initialize class
        """
        self.model = model
        self.params = params
        self.X = model.X
        self.y = model.y
        self._cache = {}

    @cache_readonly
    def predy(self):
        return np.sum(np.multiply(self.params, self.X), axis=1).reshape((-1,1))

    @cache_readonly
    def resid_response(self):
        return (self.y - self.predy).reshape((-1,1))

    @cache_readonly
    def resid_ss(self):
        u = self.resid_response.flatten()
        return np.dot(u, u.T)
resid_ss(self): + u = self.resid_response.flatten() + return np.dot(u, u.T) diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/kernels.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/kernels.py new file mode 100644 index 0000000..bdf246d --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/kernels.py @@ -0,0 +1,120 @@ +# GWR kernel function specifications + +__author__ = "Taylor Oshan tayoshan@gmail.com" + +#from pysal.weights.Distance import Kernel +import scipy +from scipy.spatial.kdtree import KDTree +import numpy as np + +#adaptive specifications should be parameterized with nn-1 to match original gwr +#implementation. That is, pysal counts self neighbors with knn automatically. + +def fix_gauss(coords, bw, points=None): + w = _Kernel(coords, function='gwr_gaussian', bandwidth=bw, + truncate=False, points=points) + return w.kernel + +def adapt_gauss(coords, nn, points=None): + w = _Kernel(coords, fixed=False, k=nn-1, function='gwr_gaussian', + truncate=False, points=points) + return w.kernel + +def fix_bisquare(coords, bw, points=None): + w = _Kernel(coords, function='bisquare', bandwidth=bw, points=points) + return w.kernel + +def adapt_bisquare(coords, nn, points=None): + w = _Kernel(coords, fixed=False, k=nn-1, function='bisquare', points=points) + return w.kernel + +def fix_exp(coords, bw, points=None): + w = _Kernel(coords, function='exponential', bandwidth=bw, + truncate=False, points=points) + return w.kernel + +def adapt_exp(coords, nn, points=None): + w = _Kernel(coords, fixed=False, k=nn-1, function='exponential', + truncate=False, points=points) + return w.kernel + +from scipy.spatial.distance import cdist + +class _Kernel(object): + """ + + """ + def __init__(self, data, bandwidth=None, fixed=True, k=None, + function='triangular', eps=1.0000001, ids=None, truncate=True, + points=None): #Added truncate flag + if issubclass(type(data), scipy.spatial.KDTree): + self.data = 
data.data + data = self.data + else: + self.data = data + if k is not None: + self.k = int(k) + 1 + else: + self.k = k + if points is None: + self.dmat = cdist(self.data, self.data) + else: + self.points = points + self.dmat = cdist(self.points, self.data) + self.function = function.lower() + self.fixed = fixed + self.eps = eps + self.trunc = truncate + if bandwidth: + try: + bandwidth = np.array(bandwidth) + bandwidth.shape = (len(bandwidth), 1) + except: + bandwidth = np.ones((len(data), 1), 'float') * bandwidth + self.bandwidth = bandwidth + else: + self._set_bw() + self.kernel = self._kernel_funcs(self.dmat/self.bandwidth) + + if self.trunc: + mask = np.repeat(self.bandwidth, len(self.data), axis=1) + self.kernel[(self.dmat >= mask)] = 0 + + def _set_bw(self): + if self.k is not None: + dmat = np.sort(self.dmat)[:,:self.k] + else: + dmat = self.dmat + if self.fixed: + # use max knn distance as bandwidth + bandwidth = dmat.max() * self.eps + n = len(self.data) + self.bandwidth = np.ones((n, 1), 'float') * bandwidth + else: + # use local max knn distance + self.bandwidth = dmat.max(axis=1) * self.eps + self.bandwidth.shape = (self.bandwidth.size, 1) + + + def _kernel_funcs(self, zs): + # functions follow Anselin and Rey (2010) table 5.4 + if self.function == 'triangular': + return 1 - zs + elif self.function == 'uniform': + return np.ones(zi.shape) * 0.5 + elif self.function == 'quadratic': + return (3. / 4) * (1 - zs ** 2) + elif self.function == 'quartic': + return (15. / 16) * (1 - zs ** 2) ** 2 + elif self.function == 'gaussian': + c = np.pi * 2 + c = c ** (-0.5) + return c * np.exp(-(zs ** 2) / 2.) 
+ elif self.function == 'gwr_gaussian': + return np.exp(-0.5*(zs)**2) + elif self.function == 'bisquare': + return (1-(zs)**2)**2 + elif self.function =='exponential': + return np.exp(-zs) + else: + print('Unsupported kernel function', self.function) diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/search.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/search.py new file mode 100644 index 0000000..97de4be --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/search.py @@ -0,0 +1,208 @@ +#Bandwidth optimization methods + +__author__ = "Taylor Oshan" + +import numpy as np + +def golden_section(a, c, delta, function, tol, max_iter, int_score=False): + """ + Golden section search routine + Method: p212, 9.6.4 + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying relationships. + + Parameters + ---------- + a : float + initial max search section value + b : float + initial min search section value + delta : float + constant used to determine width of search sections + function : function + obejective function to be evaluated at different section + values + int_score : boolean + False for float score, True for integer score + tol : float + tolerance used to determine convergence + max_iter : integer + maximum iterations if no convergence to tolerance + + Returns + ------- + opt_val : float + optimal value + opt_score : kernel + optimal score + output : list of tuples + searching history + """ + b = a + delta * np.abs(c-a) + d = c - delta * np.abs(c-a) + score = 0.0 + diff = 1.0e9 + iters = 0 + output = [] + while np.abs(diff) > tol and iters < max_iter: + iters += 1 + if int_score: + b = np.round(b) + d = np.round(d) + + score_a = function(a) + score_b = function(b) + score_c = function(c) + score_d = function(d) + + if score_b <= score_d: + opt_val = b + opt_score = score_b + c = d + d = b + b = a + delta * np.abs(c-a) 
def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
    """
    Golden section search routine
    Method: p212, 9.6.4
    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
    Geographically weighted regression: the analysis of spatially varying relationships.

    Parameters
    ----------
    a               : float
                      initial lower bound of the search section
    c               : float
                      initial upper bound of the search section
    delta           : float
                      constant used to determine width of search sections
    function        : function
                      objective function to be minimized; assumed to be
                      deterministic because scores are cached per value
    tol             : float
                      tolerance used to determine convergence
    max_iter        : integer
                      maximum iterations if no convergence to tolerance
    int_score       : boolean
                      True rounds the interior points to integers before
                      they are scored; False keeps floats

    Returns
    -------
    opt_val         : float
                      optimal value, rounded to two decimals
    opt_score       : float
                      score of the optimal value
    output          : list of tuples
                      searching history as (value, score) pairs

    Raises
    ------
    ValueError      : if no iteration is performed (max_iter < 1)
    """
    b = a + delta * np.abs(c-a)
    d = c - delta * np.abs(c-a)
    diff = 1.0e9
    iters = 0
    output = []
    scored = {}

    def evaluate(val):
        # Every evaluation is a full model fit; one interior point is
        # carried over between iterations, so never score it twice.
        if val not in scored:
            scored[val] = function(val)
        return scored[val]

    while np.abs(diff) > tol and iters < max_iter:
        iters += 1
        if int_score:
            b = np.round(b)
            d = np.round(d)

        # Only the interior points drive the search; the bounds a and c
        # used to be scored on every pass and the results thrown away.
        score_b = evaluate(b)
        score_d = evaluate(d)

        if score_b <= score_d:
            opt_val = b
            opt_score = score_b
            c = d
            d = b
            b = a + delta * np.abs(c-a)
        else:
            opt_val = d
            opt_score = score_d
            a = b
            b = d
            d = c - delta * np.abs(c-a)

        output.append((opt_val, opt_score))
        diff = score_b - score_d

    if not output:
        raise ValueError('golden_section needs at least one iteration; '
                         'check max_iter and tol')
    return np.round(opt_val, 2), opt_score, output

def equal_interval(l_bound, u_bound, interval, function, int_score=False):
    """
    Interval search, using interval as stepsize

    Parameters
    ----------
    l_bound         : float
                      initial min search section value
    u_bound         : float
                      initial max search section value
    interval        : float
                      step between candidate values; must be positive
                      whenever there is room between the bounds
    function        : function
                      objective function to be evaluated at different section
                      values
    int_score       : boolean
                      True rounds both bounds and the first candidate to
                      integers; False keeps floats

    Returns
    -------
    opt_val         : float
                      optimal value
    opt_score       : float
                      optimal score
    output          : list of tuples
                      searching history as (value, score) pairs

    Raises
    ------
    ValueError      : if ``interval`` is not positive while candidates remain
                      between the bounds (the loop would never terminate)
    """
    a = l_bound
    c = u_bound
    b = a + interval
    if int_score:
        a = np.round(a,0)
        c = np.round(c,0)
        b = np.round(b,0)

    if b < c and interval <= 0:
        raise ValueError('interval must be positive for an interval search')

    output = []

    score_a = function(a)
    score_c = function(c)

    output.append((a,score_a))
    output.append((c,score_c))

    if score_a < score_c:
        opt_val = a
        opt_score = score_a
    else:
        opt_val = c
        opt_score = score_c

    while b < c:
        score_b = function(b)

        output.append((b,score_b))

        if score_b < opt_score:
            opt_val = b
            opt_score = score_b
        b = b + interval

    return opt_val, opt_score, output


def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
        gwr_func, bw_func, sel_func, verbose=False):
    """
    Backfitting routine that selects one bandwidth per column of X.

    Parameters
    ----------
    init            : boolean
                      True to start from a GWR calibrated with a single
                      bandwidth; False to start from a global GLM
    y               : array
                      n*1, dependent variable
    X               : array
                      n*k, design matrix
    n               : integer
                      number of observations
    k               : integer
                      number of columns of X
    family          : family object
                      passed to the global GLM when ``init`` is False
    tol             : float
                      convergence tolerance on the iteration score
    max_iter        : integer
                      maximum number of backfitting iterations
    rss_score       : boolean
                      True scores an iteration by the relative change in the
                      residual sum of squares, False by the relative change
                      in the additive terms
    gwr_func        : function
                      (y, X, bw) -> fitted model exposing ``params`` and
                      ``resid_response``
    bw_func         : function
                      (y, X) -> bandwidth selector
    sel_func        : function
                      selector -> bandwidth; afterwards ``selector.bw[1]``
                      is read as the criterion value
    verbose         : boolean
                      True prints the initial bandwidth and every iteration
                      score; default False keeps the library silent

    Returns
    -------
    opt_bws, BWs, VALs, scores, f_XB, f_err
                      final bandwidths, bandwidth history, criterion history,
                      score history, and the additive terms / residuals the
                      final iteration started from

    Raises
    ------
    ValueError      : if no iteration is performed
    """
    if init:
        bw = sel_func(bw_func(y, X))
        if verbose:
            print(bw)
        optim_model = gwr_func(y, X, bw)
        err = optim_model.resid_response.reshape((-1,1))
        est = optim_model.params
    else:
        # This branch used to reference the undefined names `GLM` and `self`.
        # NOTE(review): GLM is assumed to live next to the family classes the
        # package already imports -- confirm the module path.
        from crankshaft.regression.glm.glm import GLM
        model = GLM(y, X, family=family, constant=False).fit()
        err = model.resid_response.reshape((-1,1))
        est = np.repeat(model.params.T, n, axis=0)

    XB = np.multiply(est, X)
    if rss_score:
        rss = np.sum((err)**2)
    iters = 0
    scores = []
    delta = 1e6
    BWs = []
    VALs = []

    while delta > tol and iters < max_iter:
        iters += 1
        # Explicit float buffers: np.zeros_like(X) inherits an integer dtype
        # from an integer X and silently truncates the estimates.
        new_XB = np.zeros(np.shape(X), dtype=float)
        ests = np.zeros(np.shape(X), dtype=float)
        bws = []
        vals = []
        f_XB = XB.copy()
        f_err = err.copy()
        for i in range(k):
            temp_y = XB[:,i].reshape((-1,1))
            temp_y = temp_y + err
            temp_X = X[:,i].reshape((-1,1))
            bw_class = bw_func(temp_y, temp_X)
            bw = sel_func(bw_class)
            optim_model = gwr_func(temp_y, temp_X, bw)
            err = optim_model.resid_response.reshape((-1,1))
            est = optim_model.params.reshape((-1,))

            new_XB[:,i] = np.multiply(est, temp_X.reshape((-1,)))
            bws.append(bw)
            ests[:,i] = est
            vals.append(bw_class.bw[1])

        predy = np.sum(np.multiply(ests, X), axis=1).reshape((-1,1))
        num = np.sum((new_XB - XB)**2)/n
        den = np.sum(np.sum(new_XB, axis=1)**2)
        score = (num/den)**0.5
        XB = new_XB

        if rss_score:
            new_rss = np.sum((y - predy)**2)
            score = np.abs((new_rss - rss)/new_rss)
            rss = new_rss
        if verbose:
            print(score)
        scores.append(score)
        delta = score
        BWs.append(bws)
        VALs.append(vals)

    if not BWs:
        raise ValueError('flexible_bw needs at least one iteration; '
                         'check max_iter and tol')
    opt_bws = BWs[-1]
    return opt_bws, np.array(BWs), np.array(VALs), np.array(scores), f_XB, f_err
# Kernel builders keyed by type id: odd ids are fixed (distance) bandwidths,
# even ids adaptive (nearest neighbor) ones.
kernels = {1: fix_gauss, 2: adapt_gauss, 3: fix_bisquare, 4:
        adapt_bisquare, 5: fix_exp, 6:adapt_exp}
# Model-fit criteria selectable through Sel_BW.search(criterion=...).
getDiag = {'AICc': get_AICc,'AIC':get_AIC, 'BIC': get_BIC, 'CV': get_CV}

class Sel_BW(object):
    """
    Select bandwidth for kernel

    Methods: p211 - p213, bandwidth selection
    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
    Geographically weighted regression: the analysis of spatially varying relationships.

    Parameters
    ----------
    coords         : list of tuples
                     (x,y) of points used in bandwidth selection
    y              : array
                     n*1, dependent variable.
    x_loc          : array
                     n*k2, local independent variable.
    x_glob         : array
                     n*k1, fixed independent variable; only its column count
                     is used (to seed the golden section search).
    family         : family object
                     probability model handed to GWR; default Gaussian()
    offset         : array
                     n*1, offset variable for Poisson model; default None
                     is replaced by a column of ones
    kernel         : string
                     kernel function: 'gaussian', 'bisquare', 'exponential'
    fixed          : boolean
                     True for fixed bandwidth and False for adaptive (NN)
    fb             : boolean
                     True for flexible (multiple covariate-specific)
                     bandwidths, False for a traditional (same for all
                     covariates) bandwidth; default is False.
    constant       : boolean
                     True to include intercept (default) in model and False
                     to exclude intercept.

    Attributes
    ----------
    The constructor arguments are stored under the same names. ``search()``
    additionally stores each of its arguments under the argument's name, plus:

    int_score      : boolean
                     True when candidate bandwidths are integers (adaptive)
    bw             : tuple
                     raw result of the search routine; bw[0] is the selected
                     bandwidth(s)
    XB, err        : arrays
                     fb=True only; additive terms and residuals returned by
                     the backfitting routine

    NOTE(review): ``search()`` assigns the method name to ``self.search``,
    which shadows the method on the instance -- a second ``search()`` call on
    the same object fails. Left as is because the attribute name is part of
    the documented interface.
    """
    def __init__(self, coords, y, x_loc, x_glob=None, family=Gaussian(),
            offset=None, kernel='bisquare', fixed=False, fb=False, constant=True):
        self.coords = coords
        self.y = y
        self.x_loc = x_loc
        if x_glob is not None:
            self.x_glob = x_glob
        else:
            self.x_glob = []
        self.family=family
        self.fixed = fixed
        self.kernel = kernel
        if offset is None:
            self.offset = np.ones((len(y), 1))
        else:
            self.offset = offset * 1.0
        self.fb = fb
        self.constant = constant

    def search(self, search='golden_section', criterion='AICc', bw_min=0.0,
            bw_max=0.0, interval=0.0, tol=1.0e-6, max_iter=200, init_fb=True,
            tol_fb=1.0e-5, rss_score=False, max_iter_fb=200):
        """
        Run the bandwidth search.

        Parameters
        ----------
        search         : string
                         bw search method: 'golden_section', 'interval'
        criterion      : string
                         bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
        bw_min         : float
                         min value used in bandwidth search
        bw_max         : float
                         max value used in bandwidth search
        interval       : float
                         interval increment used in interval search
        tol            : float
                         tolerance used to determine convergence
        max_iter       : integer
                         max iterations if no convergence to tol
        init_fb        : boolean
                         True to initialize flexible bandwidth search with
                         estimates from a traditional GWR and False to
                         initialize flexible bandwidth search with global
                         regression estimates
        tol_fb         : float
                         convergence tolerance for the flexible bandwidth
                         backfitting algorithm; a larger tolerance may stop
                         the algorithm faster though it may result in a less
                         optimal model
        max_iter_fb    : integer
                         max iterations if no convergence to tol for flexible
                         bandwidth backfitting algorithm
        rss_score      : boolean
                         True to use the residual sum of squares to evaluate
                         each iteration of the flexible bandwidth backfitting
                         routine and False to use a smooth function; default
                         is False

        Returns
        -------
        bw             : scalar or array
                         optimal bandwidth value or values; returns scalar for
                         fb=False and array for fb=True; ordering of
                         bandwidths matches the ordering of the covariates
                         (columns) of the designs matrix, X

        Raises
        ------
        TypeError      : for an unsupported kernel or search method
        """
        self.search = search
        self.criterion = criterion
        self.bw_min = bw_min
        self.bw_max = bw_max
        self.interval = interval
        self.tol = tol
        self.max_iter = max_iter
        self.init_fb = init_fb
        self.tol_fb = tol_fb
        self.rss_score = rss_score
        self.max_iter_fb = max_iter_fb

        if self.kernel not in ('gaussian', 'bisquare', 'exponential'):
            raise TypeError('Unsupported kernel function ', self.kernel)

        # Adaptive kernels search over neighbor counts, i.e. integers.
        self.int_score = not self.fixed

        if self.fb:
            self._fbw()
            self.XB = self.bw[4]
            self.err = self.bw[5]
        else:
            self._bw()

        return self.bw[0]

    def _bw(self):
        """Select a single bandwidth and store the search result in self.bw."""
        # offset is forwarded so the criterion is computed on the same model
        # the flexible-bandwidth path (_fbw) calibrates.
        gwr_func = lambda bw: getDiag[self.criterion](
                GWR(self.coords, self.y, self.x_loc, bw, family=self.family,
                    kernel=self.kernel, fixed=self.fixed, offset=self.offset,
                    constant=self.constant).fit())
        if self.search == 'golden_section':
            a,c = self._init_section(self.x_glob, self.x_loc, self.coords,
                    self.constant)
            delta = 0.38197 #1 - (np.sqrt(5.0)-1.0)/2.0
            self.bw = golden_section(a, c, delta, gwr_func, self.tol,
                    self.max_iter, self.int_score)
        elif self.search == 'interval':
            self.bw = equal_interval(self.bw_min, self.bw_max, self.interval,
                    gwr_func, self.int_score)
        else:
            # was `search`: an undefined name, so this path raised NameError
            raise TypeError('Unsupported computational search method ',
                    self.search)

    def _fbw(self):
        """Select covariate-specific bandwidths and store the result in self.bw."""
        y = self.y
        if self.constant:
            X = USER.check_constant(self.x_loc)
        else:
            X = self.x_loc
        n, k = X.shape
        family = self.family
        offset = self.offset
        kernel = self.kernel
        fixed = self.fixed
        coords = self.coords
        search = self.search
        criterion = self.criterion
        bw_min = self.bw_min
        bw_max = self.bw_max
        interval = self.interval
        tol = self.tol
        max_iter = self.max_iter
        # The constant, if any, is already a column of X, hence constant=False.
        gwr_func = lambda y, X, bw: GWR(coords, y, X, bw, family=family,
                kernel=kernel, fixed=fixed, offset=offset, constant=False).fit()
        bw_func = lambda y, X: Sel_BW(coords, y, X, x_glob=[], family=family,
                kernel=kernel, fixed=fixed, offset=offset, constant=False)
        sel_func = lambda bw_func: bw_func.search(search=search,
                criterion=criterion, bw_min=bw_min, bw_max=bw_max,
                interval=interval, tol=tol, max_iter=max_iter)
        self.bw = flexible_bw(self.init_fb, y, X, n, k, family, self.tol_fb,
                self.max_iter_fb, self.rss_score, gwr_func, bw_func, sel_func)

    def _init_section(self, x_glob, x_loc, coords, constant):
        """
        Initial (a, c) bounds for the golden section search.

        Adaptive kernels search between 40 + 2*n_vars neighbors and n;
        fixed kernels between half the smallest distance to the
        (40 + 2*n_vars)-th nearest observation and half the largest pairwise
        distance. Positive bw_min / bw_max clamp the result.
        """
        if len(x_glob) > 0:
            n_glob = x_glob.shape[1]
        else:
            n_glob = 0
        if len(x_loc) > 0:
            n_loc = x_loc.shape[1]
        else:
            n_loc = 0
        if constant:
            n_vars = n_glob + n_loc + 1
        else:
            n_vars = n_glob + n_loc
        n = np.array(coords).shape[0]

        if self.int_score:
            a = 40 + 2 * n_vars
            c = n
        else:
            nn = 40 + 2 * n_vars
            sq_dists = squareform(pdist(coords))
            sort_dists = np.sort(sq_dists, axis=1)
            min_dists = sort_dists[:,nn-1]
            max_dists = sort_dists[:,-1]
            a = np.min(min_dists)/2.0
            c = np.max(max_dists)/2.0

        if a < self.bw_min:
            a = self.bw_min
        if c > self.bw_max and self.bw_max > 0:
            c = self.bw_max
        return a, c
'r')) + self.XB = pk.load(open(pysal.examples.get_path('XB.p'), 'r')) + self.err = pk.load(open(pysal.examples.get_path('err.p'), 'r')) + + def test_BS_F(self): + est_Int = self.BS_F.by_col(' est_Intercept') + se_Int = self.BS_F.by_col(' se_Intercept') + t_Int = self.BS_F.by_col(' t_Intercept') + est_rural = self.BS_F.by_col(' est_PctRural') + se_rural = self.BS_F.by_col(' se_PctRural') + t_rural = self.BS_F.by_col(' t_PctRural') + est_pov = self.BS_F.by_col(' est_PctPov') + se_pov = self.BS_F.by_col(' se_PctPov') + t_pov = self.BS_F.by_col(' t_PctPov') + est_black = self.BS_F.by_col(' est_PctBlack') + se_black = self.BS_F.by_col(' se_PctBlack') + t_black = self.BS_F.by_col(' t_PctBlack') + yhat = self.BS_F.by_col(' yhat') + res = np.array(self.BS_F.by_col(' residual')) + std_res = np.array(self.BS_F.by_col(' std_residual')).reshape((-1,1)) + localR2 = np.array(self.BS_F.by_col(' localR2')).reshape((-1,1)) + inf = np.array(self.BS_F.by_col(' influence')).reshape((-1,1)) + cooksD = np.array(self.BS_F.by_col(' CooksD')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=209267.689, fixed=True) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + CV = get_CV(rslt) + + self.assertAlmostEquals(np.floor(AICc), 894.0) + self.assertAlmostEquals(np.floor(AIC), 890.0) + self.assertAlmostEquals(np.floor(BIC), 944.0) + self.assertAlmostEquals(np.round(CV,2), 18.25) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) + np.testing.assert_allclose(est_rural, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_rural, rslt.bse[:,1], rtol=1e-04) + np.testing.assert_allclose(t_rural, rslt.tvalues[:,1], rtol=1e-04) + np.testing.assert_allclose(est_pov, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_pov, rslt.bse[:,2], rtol=1e-04) + 
np.testing.assert_allclose(t_pov, rslt.tvalues[:,2], rtol=1e-04) + np.testing.assert_allclose(est_black, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_black, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_black, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(res, rslt.resid_response, rtol=1e-04) + np.testing.assert_allclose(std_res, rslt.std_res, rtol=1e-04) + np.testing.assert_allclose(localR2, rslt.localR2, rtol=1e-05) + np.testing.assert_allclose(inf, rslt.influ, rtol=1e-04) + np.testing.assert_allclose(cooksD, rslt.cooksD, rtol=1e-00) + + def test_BS_NN(self): + est_Int = self.BS_NN.by_col(' est_Intercept') + se_Int = self.BS_NN.by_col(' se_Intercept') + t_Int = self.BS_NN.by_col(' t_Intercept') + est_rural = self.BS_NN.by_col(' est_PctRural') + se_rural = self.BS_NN.by_col(' se_PctRural') + t_rural = self.BS_NN.by_col(' t_PctRural') + est_pov = self.BS_NN.by_col(' est_PctPov') + se_pov = self.BS_NN.by_col(' se_PctPov') + t_pov = self.BS_NN.by_col(' t_PctPov') + est_black = self.BS_NN.by_col(' est_PctBlack') + se_black = self.BS_NN.by_col(' se_PctBlack') + t_black = self.BS_NN.by_col(' t_PctBlack') + yhat = self.BS_NN.by_col(' yhat') + res = np.array(self.BS_NN.by_col(' residual')) + std_res = np.array(self.BS_NN.by_col(' std_residual')).reshape((-1,1)) + localR2 = np.array(self.BS_NN.by_col(' localR2')).reshape((-1,1)) + inf = np.array(self.BS_NN.by_col(' influence')).reshape((-1,1)) + cooksD = np.array(self.BS_NN.by_col(' CooksD')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=90.000, fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + CV = get_CV(rslt) + + self.assertAlmostEquals(np.floor(AICc), 896.0) + self.assertAlmostEquals(np.floor(AIC), 892.0) + self.assertAlmostEquals(np.floor(BIC), 941.0) + self.assertAlmostEquals(np.around(CV, 2), 19.19) + np.testing.assert_allclose(est_Int, 
rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) + np.testing.assert_allclose(est_rural, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_rural, rslt.bse[:,1], rtol=1e-04) + np.testing.assert_allclose(t_rural, rslt.tvalues[:,1], rtol=1e-04) + np.testing.assert_allclose(est_pov, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_pov, rslt.bse[:,2], rtol=1e-04) + np.testing.assert_allclose(t_pov, rslt.tvalues[:,2], rtol=1e-04) + np.testing.assert_allclose(est_black, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_black, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_black, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(res, rslt.resid_response, rtol=1e-04) + np.testing.assert_allclose(std_res, rslt.std_res, rtol=1e-04) + np.testing.assert_allclose(localR2, rslt.localR2, rtol=1e-05) + np.testing.assert_allclose(inf, rslt.influ, rtol=1e-04) + np.testing.assert_allclose(cooksD, rslt.cooksD, rtol=1e-00) + + def test_GS_F(self): + est_Int = self.GS_F.by_col(' est_Intercept') + se_Int = self.GS_F.by_col(' se_Intercept') + t_Int = self.GS_F.by_col(' t_Intercept') + est_rural = self.GS_F.by_col(' est_PctRural') + se_rural = self.GS_F.by_col(' se_PctRural') + t_rural = self.GS_F.by_col(' t_PctRural') + est_pov = self.GS_F.by_col(' est_PctPov') + se_pov = self.GS_F.by_col(' se_PctPov') + t_pov = self.GS_F.by_col(' t_PctPov') + est_black = self.GS_F.by_col(' est_PctBlack') + se_black = self.GS_F.by_col(' se_PctBlack') + t_black = self.GS_F.by_col(' t_PctBlack') + yhat = self.GS_F.by_col(' yhat') + res = np.array(self.GS_F.by_col(' residual')) + std_res = np.array(self.GS_F.by_col(' std_residual')).reshape((-1,1)) + localR2 = np.array(self.GS_F.by_col(' localR2')).reshape((-1,1)) + inf = np.array(self.GS_F.by_col(' influence')).reshape((-1,1)) + cooksD 
= np.array(self.GS_F.by_col(' CooksD')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=87308.298, + kernel='gaussian', fixed=True) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + CV = get_CV(rslt) + + self.assertAlmostEquals(np.floor(AICc), 895.0) + self.assertAlmostEquals(np.floor(AIC), 890.0) + self.assertAlmostEquals(np.floor(BIC), 943.0) + self.assertAlmostEquals(np.around(CV, 2), 18.21) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) + np.testing.assert_allclose(est_rural, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_rural, rslt.bse[:,1], rtol=1e-04) + np.testing.assert_allclose(t_rural, rslt.tvalues[:,1], rtol=1e-04) + np.testing.assert_allclose(est_pov, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_pov, rslt.bse[:,2], rtol=1e-04) + np.testing.assert_allclose(t_pov, rslt.tvalues[:,2], rtol=1e-04) + np.testing.assert_allclose(est_black, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_black, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_black, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(res, rslt.resid_response, rtol=1e-04) + np.testing.assert_allclose(std_res, rslt.std_res, rtol=1e-04) + np.testing.assert_allclose(localR2, rslt.localR2, rtol=1e-05) + np.testing.assert_allclose(inf, rslt.influ, rtol=1e-04) + np.testing.assert_allclose(cooksD, rslt.cooksD, rtol=1e-00) + + def test_GS_NN(self): + est_Int = self.GS_NN.by_col(' est_Intercept') + se_Int = self.GS_NN.by_col(' se_Intercept') + t_Int = self.GS_NN.by_col(' t_Intercept') + est_rural = self.GS_NN.by_col(' est_PctRural') + se_rural = self.GS_NN.by_col(' se_PctRural') + t_rural = self.GS_NN.by_col(' t_PctRural') + est_pov = self.GS_NN.by_col(' est_PctPov') + se_pov = 
self.GS_NN.by_col(' se_PctPov') + t_pov = self.GS_NN.by_col(' t_PctPov') + est_black = self.GS_NN.by_col(' est_PctBlack') + se_black = self.GS_NN.by_col(' se_PctBlack') + t_black = self.GS_NN.by_col(' t_PctBlack') + yhat = self.GS_NN.by_col(' yhat') + res = np.array(self.GS_NN.by_col(' residual')) + std_res = np.array(self.GS_NN.by_col(' std_residual')).reshape((-1,1)) + localR2 = np.array(self.GS_NN.by_col(' localR2')).reshape((-1,1)) + inf = np.array(self.GS_NN.by_col(' influence')).reshape((-1,1)) + cooksD = np.array(self.GS_NN.by_col(' CooksD')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=49.000, + kernel='gaussian', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + CV = get_CV(rslt) + + self.assertAlmostEquals(np.floor(AICc), 896) + self.assertAlmostEquals(np.floor(AIC), 894.0) + self.assertAlmostEquals(np.floor(BIC), 922.0) + self.assertAlmostEquals(np.around(CV, 2), 17.91) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) + np.testing.assert_allclose(est_rural, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_rural, rslt.bse[:,1], rtol=1e-04) + np.testing.assert_allclose(t_rural, rslt.tvalues[:,1], rtol=1e-04) + np.testing.assert_allclose(est_pov, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_pov, rslt.bse[:,2], rtol=1e-04) + np.testing.assert_allclose(t_pov, rslt.tvalues[:,2], rtol=1e-04) + np.testing.assert_allclose(est_black, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_black, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_black, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(res, rslt.resid_response, rtol=1e-04) + np.testing.assert_allclose(std_res, rslt.std_res, rtol=1e-04) + 
np.testing.assert_allclose(localR2, rslt.localR2, rtol=1e-05) + np.testing.assert_allclose(inf, rslt.influ, rtol=1e-04) + np.testing.assert_allclose(cooksD, rslt.cooksD, rtol=1e-00) + + def test_FBGWR(self): + model = FBGWR(self.coords, self.y, self.X, [157.0, 65.0, 52.0], + XB=self.XB, err=self.err, constant=False) + rslt = model.fit() + + np.testing.assert_allclose(rslt.predy, self.FB['predy'], atol=1e-07) + np.testing.assert_allclose(rslt.params, self.FB['params'], atol=1e-07) + np.testing.assert_allclose(rslt.resid_response, self.FB['u'], atol=1e-05) + np.testing.assert_almost_equal(rslt.resid_ss, 6339.3497144025841) + + def test_Prediction(self): + coords =np.array(self.coords) + index = np.arange(len(self.y)) + #train = index[0:-10] + test = index[-10:] + + #y_train = self.y[train] + #X_train = self.X[train] + #coords_train = list(coords[train]) + + #y_test = self.y[test] + X_test = self.X[test] + coords_test = list(coords[test]) + + + model = GWR(self.coords, self.y, self.X, 93, family=Gaussian(), + fixed=False, kernel='bisquare') + results = model.predict(coords_test, X_test) + + params = np.array([22.77198, -0.10254, -0.215093, -0.01405, + 19.10531, -0.094177, -0.232529, 0.071913, + 19.743421, -0.080447, -0.30893, 0.083206, + 17.505759, -0.078919, -0.187955, 0.051719, + 27.747402, -0.165335, -0.208553, 0.004067, + 26.210627, -0.138398, -0.360514, 0.072199, + 18.034833, -0.077047, -0.260556, 0.084319, + 28.452802, -0.163408, -0.14097, -0.063076, + 22.353095, -0.103046, -0.226654, 0.002992, + 18.220508, -0.074034, -0.309812, 0.108636]).reshape((10,4)) + np.testing.assert_allclose(params, results.params, rtol=1e-03) + + bse = np.array([2.080166, 0.021462, 0.102954, 0.049627, + 2.536355, 0.022111, 0.123857, 0.051917, + 1.967813, 0.019716, 0.102562, 0.054918, + 2.463219, 0.021745, 0.110297, 0.044189, + 1.556056, 0.019513, 0.12764, 0.040315, + 1.664108, 0.020114, 0.131208, 0.041613, + 2.5835, 0.021481, 0.113158, 0.047243, + 1.709483, 0.019752, 0.116944, 
0.043636, + 1.958233, 0.020947, 0.09974, 0.049821, + 2.276849, 0.020122, 0.107867, 0.047842]).reshape((10,4)) + np.testing.assert_allclose(bse, results.bse, rtol=1e-03) + + tvalues = np.array([10.947193, -4.777659, -2.089223, -0.283103, + 7.532584, -4.259179, -1.877395, 1.385161, + 10.033179, -4.080362, -3.012133, 1.515096, + 7.106862, -3.629311, -1.704079, 1.17042, + 17.831878, -8.473156, -1.633924, 0.100891, + 15.750552, -6.880725, -2.74765, 1.734978, + 6.980774, -3.586757, -2.302575, 1.784818, + 16.644095, -8.273001, -1.205451, -1.445501, + 11.414933, -4.919384, -2.272458, 0.060064, + 8.00251, -3.679274, -2.872176, 2.270738]).reshape((10,4)) + np.testing.assert_allclose(tvalues, results.tvalues, rtol=1e-03) + + localR2 = np.array([[ 0.53068693], + [ 0.59582647], + [ 0.59700925], + [ 0.45769954], + [ 0.54634509], + [ 0.5494828 ], + [ 0.55159604], + [ 0.55634237], + [ 0.53903842], + [ 0.55884954]]) + np.testing.assert_allclose(localR2, results.localR2, rtol=1e-05) + +class TestGWRPoisson(unittest.TestCase): + def setUp(self): + data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'), mode='Ur') + self.coords = zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID')) + self.y = np.array(data.by_col('db2564')).reshape((-1,1)) + self.off = np.array(data.by_col('eb2564')).reshape((-1,1)) + OCC = np.array(data.by_col('OCC_TEC')).reshape((-1,1)) + OWN = np.array(data.by_col('OWNH')).reshape((-1,1)) + POP = np.array(data.by_col('POP65')).reshape((-1,1)) + UNEMP = np.array(data.by_col('UNEMP')).reshape((-1,1)) + self.X = np.hstack([OCC,OWN,POP,UNEMP]) + self.BS_F = pysal.open(pysal.examples.get_path('tokyo_BS_F_listwise.csv')) + self.BS_NN = pysal.open(pysal.examples.get_path('tokyo_BS_NN_listwise.csv')) + self.GS_F = pysal.open(pysal.examples.get_path('tokyo_GS_F_listwise.csv')) + self.GS_NN = pysal.open(pysal.examples.get_path('tokyo_GS_NN_listwise.csv')) + self.BS_NN_OFF = pysal.open(pysal.examples.get_path('tokyo_BS_NN_OFF_listwise.csv')) + + def 
test_BS_F(self): + est_Int = self.BS_F.by_col(' est_Intercept') + se_Int = self.BS_F.by_col(' se_Intercept') + t_Int = self.BS_F.by_col(' t_Intercept') + est_OCC = self.BS_F.by_col(' est_OCC_TEC') + se_OCC = self.BS_F.by_col(' se_OCC_TEC') + t_OCC = self.BS_F.by_col(' t_OCC_TEC') + est_OWN = self.BS_F.by_col(' est_OWNH') + se_OWN = self.BS_F.by_col(' se_OWNH') + t_OWN = self.BS_F.by_col(' t_OWNH') + est_POP = self.BS_F.by_col(' est_POP65') + se_POP = self.BS_F.by_col(' se_POP65') + t_POP = self.BS_F.by_col(' t_POP65') + est_UNEMP = self.BS_F.by_col(' est_UNEMP') + se_UNEMP = self.BS_F.by_col(' se_UNEMP') + t_UNEMP = self.BS_F.by_col(' t_UNEMP') + yhat = self.BS_F.by_col(' yhat') + pdev = np.array(self.BS_F.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=26029.625, family=Poisson(), + kernel='bisquare', fixed=True) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 13294.0) + self.assertAlmostEquals(np.floor(AIC), 13247.0) + self.assertAlmostEquals(np.floor(BIC), 13485.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-05) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-03) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-03) + np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-03) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-03) + np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-04) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-04) + 
np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02) + np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + + def test_BS_NN(self): + est_Int = self.BS_NN.by_col(' est_Intercept') + se_Int = self.BS_NN.by_col(' se_Intercept') + t_Int = self.BS_NN.by_col(' t_Intercept') + est_OCC = self.BS_NN.by_col(' est_OCC_TEC') + se_OCC = self.BS_NN.by_col(' se_OCC_TEC') + t_OCC = self.BS_NN.by_col(' t_OCC_TEC') + est_OWN = self.BS_NN.by_col(' est_OWNH') + se_OWN = self.BS_NN.by_col(' se_OWNH') + t_OWN = self.BS_NN.by_col(' t_OWNH') + est_POP = self.BS_NN.by_col(' est_POP65') + se_POP = self.BS_NN.by_col(' se_POP65') + t_POP = self.BS_NN.by_col(' t_POP65') + est_UNEMP = self.BS_NN.by_col(' est_UNEMP') + se_UNEMP = self.BS_NN.by_col(' se_UNEMP') + t_UNEMP = self.BS_NN.by_col(' t_UNEMP') + yhat = self.BS_NN.by_col(' yhat') + pdev = np.array(self.BS_NN.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=50, family=Poisson(), + kernel='bisquare', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 13285) + self.assertAlmostEquals(np.floor(AIC), 13259.0) + self.assertAlmostEquals(np.floor(BIC), 13442.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) + np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-02) + 
np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-03) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-04) + np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02) + np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04) + np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_BS_NN_Offset(self): + est_Int = self.BS_NN_OFF.by_col(' est_Intercept') + se_Int = self.BS_NN_OFF.by_col(' se_Intercept') + t_Int = self.BS_NN_OFF.by_col(' t_Intercept') + est_OCC = self.BS_NN_OFF.by_col(' est_OCC_TEC') + se_OCC = self.BS_NN_OFF.by_col(' se_OCC_TEC') + t_OCC = self.BS_NN_OFF.by_col(' t_OCC_TEC') + est_OWN = self.BS_NN_OFF.by_col(' est_OWNH') + se_OWN = self.BS_NN_OFF.by_col(' se_OWNH') + t_OWN = self.BS_NN_OFF.by_col(' t_OWNH') + est_POP = self.BS_NN_OFF.by_col(' est_POP65') + se_POP = self.BS_NN_OFF.by_col(' se_POP65') + t_POP = self.BS_NN_OFF.by_col(' t_POP65') + est_UNEMP = self.BS_NN_OFF.by_col(' est_UNEMP') + se_UNEMP = self.BS_NN_OFF.by_col(' se_UNEMP') + t_UNEMP = self.BS_NN_OFF.by_col(' t_UNEMP') + yhat = self.BS_NN_OFF.by_col(' yhat') + pdev = np.array(self.BS_NN_OFF.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=100, offset=self.off, family=Poisson(), + kernel='bisquare', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 367.0) + self.assertAlmostEquals(np.floor(AIC), 361.0) + self.assertAlmostEquals(np.floor(BIC), 451.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-02, + atol=1e-02) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-01, + atol=1e-02) + 
np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03, + atol=1e-02) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-01, + atol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04, + atol=1e-02) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-01, + atol=1e-02) + np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-03, + atol=1e-02) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-01, + atol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-04, + atol=1e-02) + np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02, + atol=1e-02) + np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-01, + atol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-03, atol=1e-02) + np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-04, atol=1e-02) + + def test_GS_F(self): + est_Int = self.GS_F.by_col(' est_Intercept') + se_Int = self.GS_F.by_col(' se_Intercept') + t_Int = self.GS_F.by_col(' t_Intercept') + est_OCC = self.GS_F.by_col(' est_OCC_TEC') + se_OCC = self.GS_F.by_col(' se_OCC_TEC') + t_OCC = self.GS_F.by_col(' t_OCC_TEC') + est_OWN = self.GS_F.by_col(' est_OWNH') + se_OWN = self.GS_F.by_col(' se_OWNH') + t_OWN = self.GS_F.by_col(' t_OWNH') + est_POP = self.GS_F.by_col(' est_POP65') + se_POP = self.GS_F.by_col(' se_POP65') + t_POP = self.GS_F.by_col(' t_POP65') + est_UNEMP = self.GS_F.by_col(' est_UNEMP') + se_UNEMP = self.GS_F.by_col(' se_UNEMP') + t_UNEMP = self.GS_F.by_col(' t_UNEMP') + yhat = self.GS_F.by_col(' yhat') + pdev = np.array(self.GS_F.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=8764.474, family=Poisson(), + kernel='gaussian', fixed=True) + rslt = model.fit() + + 
AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 11283.0) + self.assertAlmostEquals(np.floor(AIC), 11211.0) + self.assertAlmostEquals(np.floor(BIC), 11497.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-03) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) + np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-03) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-02) + np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-02) + np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02) + np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04) + np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_GS_NN(self): + est_Int = self.GS_NN.by_col(' est_Intercept') + se_Int = self.GS_NN.by_col(' se_Intercept') + t_Int = self.GS_NN.by_col(' t_Intercept') + est_OCC = self.GS_NN.by_col(' est_OCC_TEC') + se_OCC = self.GS_NN.by_col(' se_OCC_TEC') + t_OCC = self.GS_NN.by_col(' t_OCC_TEC') + est_OWN = self.GS_NN.by_col(' est_OWNH') + se_OWN = self.GS_NN.by_col(' se_OWNH') + t_OWN = self.GS_NN.by_col(' t_OWNH') + est_POP = self.GS_NN.by_col(' est_POP65') + se_POP = self.GS_NN.by_col(' se_POP65') + t_POP = self.GS_NN.by_col(' t_POP65') + est_UNEMP = self.GS_NN.by_col(' est_UNEMP') + se_UNEMP = self.GS_NN.by_col(' se_UNEMP') + t_UNEMP = 
self.GS_NN.by_col(' t_UNEMP') + yhat = self.GS_NN.by_col(' yhat') + pdev = np.array(self.GS_NN.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=50, family=Poisson(), + kernel='gaussian', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 21070.0) + self.assertAlmostEquals(np.floor(AIC), 21069.0) + self.assertAlmostEquals(np.floor(BIC), 21111.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) + np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-02) + np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-02) + np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02) + np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04) + np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + +class TestGWRBinomial(unittest.TestCase): + def setUp(self): + data = pysal.open(pysal.examples.get_path('landslides.csv')) + self.coords = zip(data.by_col('X'), data.by_col('Y')) + self.y = np.array(data.by_col('Landslid')).reshape((-1,1)) + ELEV = np.array(data.by_col('Elev')).reshape((-1,1)) + SLOPE = np.array(data.by_col('Slope')).reshape((-1,1)) + SIN = 
np.array(data.by_col('SinAspct')).reshape((-1,1)) + COS = np.array(data.by_col('CosAspct')).reshape((-1,1)) + SOUTH = np.array(data.by_col('AbsSouth')).reshape((-1,1)) + DIST = np.array(data.by_col('DistStrm')).reshape((-1,1)) + self.X = np.hstack([ELEV, SLOPE, SIN, COS, SOUTH, DIST]) + self.BS_F = pysal.open(pysal.examples.get_path('clearwater_BS_F_listwise.csv')) + self.BS_NN = pysal.open(pysal.examples.get_path('clearwater_BS_NN_listwise.csv')) + self.GS_F = pysal.open(pysal.examples.get_path('clearwater_GS_F_listwise.csv')) + self.GS_NN = pysal.open(pysal.examples.get_path('clearwater_GS_NN_listwise.csv')) + + def test_BS_F(self): + est_Int = self.BS_F.by_col(' est_Intercept') + se_Int = self.BS_F.by_col(' se_Intercept') + t_Int = self.BS_F.by_col(' t_Intercept') + est_elev = self.BS_F.by_col(' est_Elev') + se_elev = self.BS_F.by_col(' se_Elev') + t_elev = self.BS_F.by_col(' t_Elev') + est_slope = self.BS_F.by_col(' est_Slope') + se_slope = self.BS_F.by_col(' se_Slope') + t_slope = self.BS_F.by_col(' t_Slope') + est_sin = self.BS_F.by_col(' est_SinAspct') + se_sin = self.BS_F.by_col(' se_SinAspct') + t_sin = self.BS_F.by_col(' t_SinAspct') + est_cos = self.BS_F.by_col(' est_CosAspct') + se_cos = self.BS_F.by_col(' se_CosAspct') + t_cos = self.BS_F.by_col(' t_CosAspct') + est_south = self.BS_F.by_col(' est_AbsSouth') + se_south = self.BS_F.by_col(' se_AbsSouth') + t_south = self.BS_F.by_col(' t_AbsSouth') + est_strm = self.BS_F.by_col(' est_DistStrm') + se_strm = self.BS_F.by_col(' se_DistStrm') + t_strm = self.BS_F.by_col(' t_DistStrm') + yhat = self.BS_F.by_col(' yhat') + pdev = np.array(self.BS_F.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=19642.170, family=Binomial(), + kernel='bisquare', fixed=True) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 275.0) + self.assertAlmostEquals(np.floor(AIC), 271.0) + 
self.assertAlmostEquals(np.floor(BIC), 349.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_BS_NN(self): + est_Int = self.BS_NN.by_col(' est_Intercept') + se_Int = self.BS_NN.by_col(' se_Intercept') + t_Int = self.BS_NN.by_col(' t_Intercept') + est_elev = self.BS_NN.by_col(' est_Elev') + se_elev = self.BS_NN.by_col(' se_Elev') + 
t_elev = self.BS_NN.by_col(' t_Elev') + est_slope = self.BS_NN.by_col(' est_Slope') + se_slope = self.BS_NN.by_col(' se_Slope') + t_slope = self.BS_NN.by_col(' t_Slope') + est_sin = self.BS_NN.by_col(' est_SinAspct') + se_sin = self.BS_NN.by_col(' se_SinAspct') + t_sin = self.BS_NN.by_col(' t_SinAspct') + est_cos = self.BS_NN.by_col(' est_CosAspct') + se_cos = self.BS_NN.by_col(' se_CosAspct') + t_cos = self.BS_NN.by_col(' t_CosAspct') + est_south = self.BS_NN.by_col(' est_AbsSouth') + se_south = self.BS_NN.by_col(' se_AbsSouth') + t_south = self.BS_NN.by_col(' t_AbsSouth') + est_strm = self.BS_NN.by_col(' est_DistStrm') + se_strm = self.BS_NN.by_col(' se_DistStrm') + t_strm = self.BS_NN.by_col(' t_DistStrm') + yhat = self.BS_NN.by_col(' yhat') + pdev = self.BS_NN.by_col(' localpdev') + + model = GWR(self.coords, self.y, self.X, bw=158, family=Binomial(), + kernel='bisquare', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 277.0) + self.assertAlmostEquals(np.floor(AIC), 271.0) + self.assertAlmostEquals(np.floor(BIC), 358.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], 
rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e03) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e03) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_GS_F(self): + est_Int = self.GS_F.by_col(' est_Intercept') + se_Int = self.GS_F.by_col(' se_Intercept') + t_Int = self.GS_F.by_col(' t_Intercept') + est_elev = self.GS_F.by_col(' est_Elev') + se_elev = self.GS_F.by_col(' se_Elev') + t_elev = self.GS_F.by_col(' t_Elev') + est_slope = self.GS_F.by_col(' est_Slope') + se_slope = self.GS_F.by_col(' se_Slope') + t_slope = self.GS_F.by_col(' t_Slope') + est_sin = self.GS_F.by_col(' est_SinAspct') + se_sin = self.GS_F.by_col(' se_SinAspct') + t_sin = self.GS_F.by_col(' t_SinAspct') + est_cos = self.GS_F.by_col(' est_CosAspct') + se_cos = self.GS_F.by_col(' se_CosAspct') + t_cos = self.GS_F.by_col(' t_CosAspct') + est_south = self.GS_F.by_col(' est_AbsSouth') + se_south = self.GS_F.by_col(' se_AbsSouth') + t_south = self.GS_F.by_col(' t_AbsSouth') + est_strm = self.GS_F.by_col(' est_DistStrm') + se_strm = self.GS_F.by_col(' se_DistStrm') + t_strm = self.GS_F.by_col(' t_DistStrm') + yhat = self.GS_F.by_col(' yhat') + pdev = self.GS_F.by_col(' localpdev') + + model = GWR(self.coords, self.y, self.X, bw=8929.061, family=Binomial(), + kernel='gaussian', fixed=True) + rslt = model.fit() + + 
AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 276.0) + self.assertAlmostEquals(np.floor(AIC), 272.0) + self.assertAlmostEquals(np.floor(BIC), 341.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_GS_NN(self): + est_Int = self.GS_NN.by_col(' est_Intercept') + se_Int = 
self.GS_NN.by_col(' se_Intercept') + t_Int = self.GS_NN.by_col(' t_Intercept') + est_elev = self.GS_NN.by_col(' est_Elev') + se_elev = self.GS_NN.by_col(' se_Elev') + t_elev = self.GS_NN.by_col(' t_Elev') + est_slope = self.GS_NN.by_col(' est_Slope') + se_slope = self.GS_NN.by_col(' se_Slope') + t_slope = self.GS_NN.by_col(' t_Slope') + est_sin = self.GS_NN.by_col(' est_SinAspct') + se_sin = self.GS_NN.by_col(' se_SinAspct') + t_sin = self.GS_NN.by_col(' t_SinAspct') + est_cos = self.GS_NN.by_col(' est_CosAspct') + se_cos = self.GS_NN.by_col(' se_CosAspct') + t_cos = self.GS_NN.by_col(' t_CosAspct') + est_south = self.GS_NN.by_col(' est_AbsSouth') + se_south = self.GS_NN.by_col(' se_AbsSouth') + t_south = self.GS_NN.by_col(' t_AbsSouth') + est_strm = self.GS_NN.by_col(' est_DistStrm') + se_strm = self.GS_NN.by_col(' se_DistStrm') + t_strm = self.GS_NN.by_col(' t_DistStrm') + yhat = self.GS_NN.by_col(' yhat') + pdev = self.GS_NN.by_col(' localpdev') + + model = GWR(self.coords, self.y, self.X, bw=64, family=Binomial(), + kernel='gaussian', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 276.0) + self.assertAlmostEquals(np.floor(AIC), 273.0) + self.assertAlmostEquals(np.floor(BIC), 331.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + 
np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-00) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py new file mode 100644 index 0000000..ea044b9 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py @@ -0,0 +1,84 @@ +import unittest +import numpy as np +import pysal +from pysal.contrib.gwr.kernels import * + +PEGP = pysal.examples.get_path + +class TestKernels(unittest.TestCase): + def setUp(self): + np.random.seed(1234) + x = np.arange(1,6) + y = np.arange(5,0, -1) + np.random.shuffle(x) + np.random.shuffle(y) + self.coords = np.array(zip(x, y)) + self.fix_gauss_kern = np.array([ + [ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932], + [ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179], + [ 0.48567179, 0.89483932, 1. 
, 0.89483932, 0.48567179], + [ 0.48567179, 0.64118039, 0.89483932, 1. , 0.38889556], + [ 0.89483932, 0.48567179, 0.48567179, 0.38889556, 1. ]]) + self.adapt_gauss_kern = np.array([ + [ 1. , 0.52004183, 0.60653072, 0.60653072, 0.92596109], + [ 0.34559083, 1. , 0.88249692, 0.60653072, 0.44374738], + [ 0.03877423, 0.60653072, 1. , 0.60653072, 0.03877423], + [ 0.44374738, 0.60653072, 0.88249692, 1. , 0.34559083], + [ 0.92596109, 0.60653072, 0.60653072, 0.52004183, 1. ]]) + self.fix_bisquare_kern = np.array([ + [ 1. , 0. , 0. , 0. , 0.60493827], + [ 0. , 1. , 0.60493827, 0.01234568, 0. ], + [ 0. , 0.60493827, 1. , 0.60493827, 0. ], + [ 0. , 0.01234568, 0.60493827, 1. , 0. ], + [ 0.60493827, 0. , 0. , 0. , 1. ]]) + self.adapt_bisquare_kern = np.array([ + [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 3.99999881e-14, 7.15976383e-01], + [ 0.00000000e+00, 1.00000000e+00, 5.62500075e-01, + 3.99999881e-14, 0.00000000e+00], + [ 0.00000000e+00, 3.99999881e-14, 1.00000000e+00, + 3.99999881e-14, 0.00000000e+00], + [ 0.00000000e+00, 3.99999881e-14, 5.62500075e-01, + 1.00000000e+00, 0.00000000e+00], + [ 7.15976383e-01, 0.00000000e+00, 3.99999881e-14, + 0.00000000e+00, 1.00000000e+00]]) + self.fix_exp_kern = np.array([ + [ 1. , 0.2529993 , 0.30063739, 0.30063739, 0.62412506], + [ 0.2529993 , 1. , 0.62412506, 0.38953209, 0.30063739], + [ 0.30063739, 0.62412506, 1. , 0.62412506, 0.30063739], + [ 0.30063739, 0.38953209, 0.62412506, 1. , 0.2529993 ], + [ 0.62412506, 0.30063739, 0.30063739, 0.2529993 , 1. ]]) + self.adapt_exp_kern = np.array([ + [ 1. , 0.31868771, 0.36787948, 0.36787948, 0.67554721], + [ 0.23276223, 1. , 0.60653069, 0.36787948, 0.27949951], + [ 0.07811997, 0.36787948, 1. , 0.36787948, 0.07811997], + [ 0.27949951, 0.36787948, 0.60653069, 1. , 0.23276223], + [ 0.67554721, 0.36787948, 0.36787948, 0.31868771, 1. 
]]) + + def test_fix_gauss(self): + kern = fix_gauss(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_gauss_kern) + + def test_adapt_gauss(self): + kern = adapt_gauss(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_gauss_kern) + + def test_fix_biqsquare(self): + kern = fix_bisquare(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_bisquare_kern, + atol=1e-01) + + def test_adapt_bisqaure(self): + kern = adapt_bisquare(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_bisquare_kern, atol=1e-012) + + def test_fix_exp(self): + kern = fix_exp(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_exp_kern) + + def test_adapt_exp(self): + kern = adapt_exp(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_exp_kern) + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py new file mode 100644 index 0000000..47c6d9d --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py @@ -0,0 +1,139 @@ + +""" +GWR is tested against results from GWR4 +""" + +import unittest +import pickle as pk +from pysal.contrib.glm.family import Gaussian, Poisson, Binomial +from pysal.contrib.gwr.sel_bw import Sel_BW +import numpy as np +import pysal + +class TestSelBW(unittest.TestCase): + def setUp(self): + data = pysal.open(pysal.examples.get_path('GData_utm.csv')) + self.coords = zip(data.by_col('X'), data.by_col('Y')) + self.y = np.array(data.by_col('PctBach')).reshape((-1,1)) + rural = np.array(data.by_col('PctRural')).reshape((-1,1)) + pov = np.array(data.by_col('PctPov')).reshape((-1,1)) + black = np.array(data.by_col('PctBlack')).reshape((-1,1)) + self.X = np.hstack([rural, pov, black]) + self.XB = pk.load(open(pysal.examples.get_path('XB.p'), 'r')) + self.err = pk.load(open(pysal.examples.get_path('err.p'), 
'r')) + + def test_golden_fixed_AICc(self): + bw1 = 211027.34 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=True).search(criterion='AICc') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_AICc(self): + bw1 = 93.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=False).search(criterion='AICc') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_AIC(self): + bw1 = 76169.15 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='AIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_AIC(self): + bw1 = 50.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='AIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_BIC(self): + bw1 = 279451.43 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='BIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_BIC(self): + bw1 = 62.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='BIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_CV(self): + bw1 = 130406.67 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='CV') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_CV(self): + bw1 = 68.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='CV') + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_AICc(self): + bw1 = 211025.0#211027.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=True).search(criterion='AICc', search='interval', bw_min=211001., + bw_max=211035.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_AICc(self): + bw1 = 93.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=False).search(criterion='AICc', search='interval', + bw_min=90.0, bw_max=95.0, 
interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_AIC(self): + bw1 = 76175.0#76169.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='AIC', search='interval', + bw_min=76161.0, bw_max=76175.0, interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_AIC(self): + bw1 = 40.0#50.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='AIC', search='interval', bw_min=40.0, + bw_max=60.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_BIC(self): + bw1 = 279461.0#279451.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='BIC', search='interval', bw_min=279441.0, + bw_max=279461.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_BIC(self): + bw1 = 62.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='BIC', search='interval', + bw_min=52.0, bw_max=72.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_CV(self): + bw1 = 130400.0#130406.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='CV', search='interval', bw_min=130400.0, + bw_max=130410.0, interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_CV(self): + bw1 = 62.0#68.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='CV', search='interval', bw_min=60.0, + bw_max=76.0 , interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_FBGWR_AIC(self): + bw1 = [157.0, 65.0, 52.0] + sel = Sel_BW(self.coords, self.y, self.X, fb=True, kernel='bisquare', + constant=False) + bw2 = sel.search(tol_fb=1e-03) + np.testing.assert_allclose(bw1, bw2) + np.testing.assert_allclose(sel.XB, self.XB, atol=1e-05) + np.testing.assert_allclose(sel.err, self.err, atol=1e-05) + +if __name__ == '__main__': + unittest.main() diff --git 
a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py new file mode 100644 index 0000000..9ccaefb --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py @@ -0,0 +1,202 @@ +""" + Geographically weighted regression +""" +import numpy as np +from gwr.base.gwr import GWR as PySAL_GWR +from gwr.base.sel_bw import Sel_BW +import json +from crankshaft.analysis_data_provider import AnalysisDataProvider +import plpy + + +class GWR: + def __init__(self, data_provider=None): + if data_provider: + self.data_provider = data_provider + else: + self.data_provider = AnalysisDataProvider() + + def gwr(self, subquery, dep_var, ind_vars, + bw=None, fixed=False, kernel='bisquare', + geom_col='the_geom', id_col='cartodb_id'): + """ + subquery: 'select * from demographics' + dep_var: 'pctbachelor' + ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack'] + bw: value of bandwidth, if None then select optimal + fixed: False (kNN) or True ('distance') + kernel: 'bisquare' (default), or 'exponential', 'gaussian' + """ + + params = {'geom_col': geom_col, + 'id_col': id_col, + 'subquery': subquery, + 'dep_var': dep_var, + 'ind_vars': ind_vars} + + # get data from data provider + query_result = self.data_provider.get_gwr(params) + + # exit if data to analyze is empty + if len(query_result) == 0: + plpy.error('No data passed to analysis or independent variables ' + 'are all null-valued') + + # unique ids and variable names list + rowid = np.array(query_result[0]['rowid'], dtype=np.int) + + # x, y are centroids of input geometries + x = np.array(query_result[0]['x'], dtype=np.float) + y = np.array(query_result[0]['y'], dtype=np.float) + coords = zip(x, y) + + # extract dependent variable + Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1)) + + n = Y.shape[0] + k = len(ind_vars) + X = np.zeros((n, k)) + + # extract query result + for attr in range(0, k): + attr_name 
= 'attr' + str(attr + 1) + X[:, attr] = np.array( + query_result[0][attr_name], dtype=np.float).flatten() + + # add intercept variable name + ind_vars.insert(0, 'intercept') + + # calculate bandwidth if none is supplied + if bw is None: + bw = Sel_BW(coords, Y, X, + fixed=fixed, kernel=kernel).search() + model = PySAL_GWR(coords, Y, X, bw, + fixed=fixed, kernel=kernel).fit() + + # containers for outputs + coeffs = [] + stand_errs = [] + t_vals = [] + filtered_t_vals = [] + + # extracted model information + c_alpha = model.adj_alpha + filtered_t = model.filter_tvals(c_alpha[1]) + predicted = model.predy.flatten() + residuals = model.resid_response + r_squared = model.localR2.flatten() + bw = np.repeat(float(bw), n) + + # create lists of json objs for model outputs + for idx in xrange(n): + coeffs.append(json.dumps({var: model.params[idx, k] + for k, var in enumerate(ind_vars)})) + stand_errs.append(json.dumps({var: model.bse[idx, k] + for k, var in enumerate(ind_vars)})) + t_vals.append(json.dumps({var: model.tvalues[idx, k] + for k, var in enumerate(ind_vars)})) + filtered_t_vals.append( + json.dumps({var: filtered_t[idx, k] + for k, var in enumerate(ind_vars)})) + + return zip(coeffs, stand_errs, t_vals, filtered_t_vals, + predicted, residuals, r_squared, bw, rowid) + + def gwr_predict(self, subquery, dep_var, ind_vars, + bw=None, fixed=False, kernel='bisquare', + geom_col='the_geom', id_col='cartodb_id'): + """ + subquery: 'select * from demographics' + dep_var: 'pctbachelor' + ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack'] + bw: value of bandwidth, if None then select optimal + fixed: False (kNN) or True ('distance') + kernel: 'bisquare' (default), or 'exponential', 'gaussian' + """ + + params = {'geom_col': geom_col, + 'id_col': id_col, + 'subquery': subquery, + 'dep_var': dep_var, + 'ind_vars': ind_vars} + + # get data from data provider + query_result = self.data_provider.get_gwr_predict(params) + + # exit if data to analyze is empty + if 
len(query_result) == 0: + plpy.error('No data passed to analysis or independent variables ' + 'are all null-valued') + + # unique ids and variable names list + rowid = np.array(query_result[0]['rowid'], dtype=np.int) + + x = np.array(query_result[0]['x'], dtype=np.float) + y = np.array(query_result[0]['y'], dtype=np.float) + coords = np.array(zip(x, y), dtype=np.float) + + # extract dependent variable + Y = np.array(query_result[0]['dep_var']).reshape((-1, 1)) + + n = Y.shape[0] + k = len(ind_vars) + X = np.empty((n, k), dtype=np.float) + + for attr in range(0, k): + attr_name = 'attr' + str(attr + 1) + X[:, attr] = np.array( + query_result[0][attr_name], dtype=np.float).flatten() + + # add intercept variable name + ind_vars.insert(0, 'intercept') + + # split data into "training" and "test" for predictions + # create index to split based on null y values + train = np.where(Y != np.array(None))[0] + test = np.where(Y == np.array(None))[0] + + # report error if there is no data to predict + if len(test) < 1: + plpy.error('No rows flagged for prediction: verify that rows ' + 'denoting prediction locations have a dependent ' + 'variable value of `null`') + + # split dependent variable (only need training which is non-Null's) + Y_train = Y[train].reshape((-1, 1)) + Y_train = Y_train.astype(np.float) + + # split coords + coords_train = coords[train] + coords_test = coords[test] + + # split explanatory variables + X_train = X[train] + X_test = X[test] + + # calculate bandwidth if none is supplied + if bw is None: + bw = Sel_BW(coords_train, Y_train, X_train, + fixed=fixed, kernel=kernel).search() + + # estimate model and predict at new locations + model = PySAL_GWR(coords_train, Y_train, X_train, + bw, fixed=fixed, + kernel=kernel).predict(coords_test, X_test) + + coeffs = [] + stand_errs = [] + t_vals = [] + r_squared = model.localR2.flatten() + predicted = model.predy.flatten() + + m = len(model.predy) + for idx in xrange(m): + coeffs.append(json.dumps({var: 
def replace_nan_with_mean(array):
    """
    Fill NaN entries with the mean of the non-NaN values.

    Input:
        @param array: 1-D or 2-D NumPy array of floats which may have
          null-valued (NaN) entries. It is modified in place.
    Output:
        the same array object. For 2-D input each NaN is replaced by the
        mean of the non-NaN values in its own column; for 1-D input by the
        mean of the non-NaN values of the whole vector.
    """
    nan_mask = np.isnan(array)

    # nothing to fill: hand the input back untouched
    if not nan_mask.any():
        return array

    # 1-D input (e.g. a target vector) has no column axis to iterate over
    if nan_mask.ndim == 1:
        array[nan_mask] = np.mean(array[~nan_mask])
        return array

    # compute each affected column's mean once, then fill all of its gaps
    for col in np.where(nan_mask.any(axis=0))[0]:
        missing = nan_mask[:, col]
        array[missing, col] = np.mean(array[~missing, col])

    return array
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
    """
    Version of create_and_predict_segment that works on arrays that come
    straight from the SQL calling the function.

    Input:
        @param target: The 1D array of length NSamples containing the
          target variable we want the model to predict
        @param features: The 2D array of size NSamples * NFeatures that
          forms the input to the model
        @param target_features: The 2D array of features to predict on
        @param target_ids: A 1D array of target_ids that will be used to
          associate the results of the prediction with the rows which they
          come from
        @param model_parameters: A dictionary containing parameters for
          the model.
    Output:
        list of (target_id, prediction, accuracy) tuples, where accuracy is
        the single value returned by train_model repeated for every row
    """

    clean_target = replace_nan_with_mean(target)
    clean_features = replace_nan_with_mean(features)
    target_features = replace_nan_with_mean(target_features)

    # 0.2 is passed as train_model's test_split argument
    model, accuracy = train_model(clean_target, clean_features,
                                  model_parameters, 0.2)
    prediction = model.predict(target_features)

    # A plain list is enough to pair one accuracy value with each
    # prediction. This matches create_and_predict_segment and avoids
    # np.full, which is not available in the NumPy release pinned by this
    # package (1.6.1).
    accuracy_array = [accuracy] * prediction.shape[0]
    return list(zip(target_ids, prediction, accuracy_array))
def predict_segment(model, features, target_query):
    """
    Use the provided model to predict the values for the new feature set.

    Input:
        @param model: The pretrained model
        @param features: A list of features to use in the model prediction
          (list of column names)
        @param target_query: The query to run to obtain the data to predict
          on and the cartodb_ids associated with it.
    Output:
        (cartodb_ids, predictions): the ids sorted by cartodb_id and an
        array of predictions in the same order. When the target query
        yields no rows the predictions array is empty.
    """

    batch_size = 1000
    joined_features = ','.join('"{0}"::numeric'.format(a) for a in features)

    # Rows are ordered by cartodb_id so the predictions line up with the
    # id array fetched below, which is aggregated ORDER BY cartodb_id.
    # Without an explicit ordering the database may return rows in any
    # order, pairing predictions with the wrong ids.
    # NOTE(review): target_query and the column names are interpolated
    # into the SQL text; callers must only pass trusted values.
    try:
        cursor = plpy.cursor(
            'SELECT Array[{joined_features}] As features '
            'FROM ({target_query}) As a '
            'ORDER BY cartodb_id'.format(
                joined_features=joined_features,
                target_query=target_query))
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    results = []

    while True:
        rows = cursor.fetch(batch_size)
        if not rows:
            break
        batch = np.row_stack([np.array(row['features'], dtype=float)
                              for row in rows])

        # Need to fix this. Should be global mean. This will cause weird
        # effects: NaNs are filled with the mean of the current batch only.
        batch = replace_nan_with_mean(batch)
        results.append(model.predict(batch))

    try:
        cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    # np.concatenate raises on an empty list; return empty outputs instead
    if not results:
        return (cartodb_ids or []), np.array([], dtype=float)

    return cartodb_ids, np.concatenate(results)
average class of its neighbors + + Inputs: + @param subquery string: e.g., SELECT the_geom, cartodb_id, + interesting_time_column FROM table_name + @param time_cols list of strings: list of strings of column names + @param num_classes (optional): number of classes to break + distribution of values into. Currently uses quantile bins. + @param w_type string (optional): weight type ('knn' or 'queen') + @param num_ngbrs int (optional): number of neighbors (if knn type) + @param permutations int (optional): number of permutations for test + stats + @param geom_col string (optional): name of column which contains + the geometries + @param id_col string (optional): name of column which has the ids + of the table + + Outputs: + @param trend_up float: probablity that a geom will move to a higher + class + @param trend_down float: probablity that a geom will move to a + lower class + @param trend float: (trend_up - trend_down) / trend_static + @param volatility float: a measure of the volatility based on + probability stddev(prob array) + """ + + if len(time_cols) < 2: + plpy.error('More than one time column needs to be passed') + + params = {"id_col": id_col, + "time_cols": time_cols, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs} + + result = self.data_provider.get_markov(w_type, params) + + # build weight + weights = pu.get_weight(result, w_type) + weights.transform = 'r' + + # prep time data + t_data = get_time_data(result, time_cols) + + sp_markov_result = ps.Spatial_Markov(t_data, + weights, + k=num_classes, + fixed=False, + permutations=permutations) + + # get lag classes + lag_classes = ps.Quantiles( + ps.lag_spatial(weights, t_data[:, -1]), + k=num_classes).yb + + # look up probablity distribution for each unit according to class and + # lag class + prob_dist = get_prob_dist(sp_markov_result.P, + lag_classes, + sp_markov_result.classes[:, -1]) + + # find the ups and down and overall distribution of each cell + trend_up, trend_down, trend, 
def rebin_data(time_data, num_time_per_bin):
    """
    Convert an n x l matrix into an n x ceil(l / m) matrix by averaging
    every m consecutive columns (time steps) into one:
     1 2 3 4    1.5 3.5
     5 6 7 8 -> 5.5 7.5
     9 8 7 6    8.5 6.5
     5 4 3 2    4.5 2.5

    if m = 2, the 4 x 4 matrix is transformed to a 4 x 2 matrix.

    This process effectively resamples the data at a time span m units
    longer than the input data.
    When l is not a multiple of m the leftover columns form the final bin:
    with l = 5 and m = 3 the first three columns are averaged into the
    first period and the remaining two into the last.

    Input:
        @param time_data n x l ndarray: measurements of an attribute at
          different time intervals
        @param num_time_per_bin int: number of columns (m) to average into
          a new column
    Output:
        n x ceil(l / m) ndarray of resampled time series
    """

    # Integer arithmetic only: true division would make the bin count a
    # float, which range() rejects.
    num_bins, leftover = divmod(time_data.shape[1], num_time_per_bin)
    if leftover:
        # fit remainders into an additional column
        num_bins += 1

    binned = [
        time_data[:, num_time_per_bin * i:num_time_per_bin * (i + 1)].mean(axis=1)
        for i in range(num_bins)]

    return np.array(binned).T
None + + # calculate volatility of distribution + volatility = prob_dist.std(axis=1) + + return trend_up, trend_down, trend, volatility diff --git a/release/python/0.7.0/crankshaft/requirements.txt b/release/python/0.7.0/crankshaft/requirements.txt new file mode 100644 index 0000000..88c0a9e --- /dev/null +++ b/release/python/0.7.0/crankshaft/requirements.txt @@ -0,0 +1,5 @@ +joblib==0.8.3 +numpy==1.6.1 +scipy==0.14.0 +pysal==1.14.3 +scikit-learn==0.14.1 diff --git a/release/python/0.7.0/crankshaft/setup.py b/release/python/0.7.0/crankshaft/setup.py new file mode 100644 index 0000000..a1f9ab2 --- /dev/null +++ b/release/python/0.7.0/crankshaft/setup.py @@ -0,0 +1,49 @@ + +""" +CartoDB Spatial Analysis Python Library +See: +https://github.com/CartoDB/crankshaft +""" + +from setuptools import setup, find_packages + +setup( + name='crankshaft', + + version='0.7.0', + + description='CartoDB Spatial Analysis Python Library', + + url='https://github.com/CartoDB/crankshaft', + + author='Data Services Team - CartoDB', + author_email='dataservices@cartodb.com', + + license='MIT', + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Mapping comunity', + 'Topic :: Maps :: Mapping Tools', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + ], + + keywords='maps mapping tools spatial analysis geostatistics', + + packages=find_packages(exclude=['contrib', 'docs', 'tests']), + + extras_require={ + 'dev': ['unittest'], + 'test': ['unittest', 'nose', 'mock'], + }, + + # The choice of component versions is dictated by what's + # provisioned in the production servers. + # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation. 
+ install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1'], + + requires=['pysal', 'numpy', 'sklearn'], + + test_suite='test' +) diff --git a/release/python/0.7.0/crankshaft/test/fixtures/getis.json b/release/python/0.7.0/crankshaft/test/fixtures/getis.json new file mode 100644 index 0000000..02566fc --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/getis.json @@ -0,0 +1 @@ +[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 
0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 
0.29403064095211839], [1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json new file mode 100644 index 0000000..cbee3fb --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json @@ -0,0 +1 @@ 
+[{"x":[941396.6,895553,930946.4,745398.6,849431.3,819317.3,803747.1,699011.5,863020.8,859915.8,809736.9,844270.1,979288.9,827822,1023145,994903.4,971593.8,782448.2,724741.2,1008480,964264.9,678778.6,670055.9,962612.3,1059706,704959.2,653026.6,734240.9,832508.6,695793.9,745538.8,908046.1,724646.8,894463.9,808691.8,942527.9,839816.1,705457.9,783416.5,805648.4,635964.3,764386.1,732628.4,759231.9,860451.4,800031.3,764116.9,707288.7,703495.1,896654,1031899,879541.2,943066.2,981727.8,739255.8,731468.7,662257.4,765397.3,845701.3,733728.4,732702.3,908386.8,1023411,695325.1,765058.1,855577.3,772634.6,818917.1,794419.5,873518.8,665933.8,695500.6,870749.9,675280.4,763488.4,814118.9,855461.8,815753.1,807249.1,915741.9,924108.1,970465.7,908636.7,821367.1,766461.7,873804.3,884830.4,770455.5,1014742,919396.5,1004544,864781.1,772600,917730.9,1030500,777055.3,848638.8,732876.8,715359.8,716369.8,766238.6,790338.7,920887.4,825920.1,707834.3,700833.7,793263.9,830735.9,863291.8,695329.2,798061.4,733846.7,953533.8,744180.8,668031.4,833819.6,840169.1,686875.4,824645.5,712437.1,954272.3,777759,752973.1,1004028,704495.6,754916.2,842085.9,703256.8,763457.1,734217.9,884376.9,963427.8,759410.8,882069.4,743031.8,795506.2,831682.3,941734.4,797981.7,919077.6,682616.8,819399.6,832935,777040.1,752165.2,658870.4,800384.3,938349.6,902471.1,894704.3,986832.8,731576.3,898776.3,796905.6,686891.4,838551.5,891228.5,858796.9,801018.1],"y":[3521764,3471916,3502787,3474765,3665553,3807616,3769623,3793408,3520432,3466377,3636468,3595691,3463849,3421638,3554982,3600493,3671394,3684504,3492653,3437933,3598842,3713250,3862318,3432769,3556747,3577608,3813760,3794110,3762905,3495219,3711726,3428340,3757187,3492465,3455994,3722100,3449007,3694344,3623343,3537103,3854592,3812502,3421800,3735253,3569933,3564188,3494367,3731361,3467152,3401148,3596117,3785425,3616602,3571315,3866604,3700612,3789664,3789005,3813323,3733248,3844809,3685752,3471063,3822135,3421817,3722330,3764306,3839931,3803344,3689861,3740622,3624790,
3810303,3685569,3699716,3590553,3506293,3783949,3695092,3530869,3668080,3640263,3624562,3660143,3663959,3439981,3599291,3520161,3537225,3752562,3517834,3419313,3832429,3716368,3500535,3584821,3785405,3584393,3660275,3451034,3453930,3660608,3568473,3717990,3854188,3598228,3719734,3750903,3756777,3758093,3609091,3812828,3482044,3665561,3764766,3567447,3695254,3524124,3864805,3519627,3697862,3729605,3570222,3641918,3422002,3685029,3827075,3552857,3551752,3623162,3717493,3560039,3608179,3534470,3522636,3421725,3487715,3567586,3872640,3595170,3660254,3514927,3623868,3858779,3639192,3842167,3742691,3446675,3699878,3648583,3494323,3544716,3563384,3841086,3855274,3538547,3749769,3637891,3487328],"dep_var":[8.2,6.4,6.6,9.4,13.3,6.4,9.2,9,7.6,7.5,17,10.3,5.8,9.1,11.8,19.9,9.6,7.2,10.1,13.5,9.9,12,8.1,6.4,18.6,20.2,5.9,18.4,37.5,11.2,14.7,6.7,33,11.1,10,23.9,6.5,13.3,5.7,10,8,8.6,11.7,32.7,8,9.5,17,12,9.4,4.7,7.6,8,9.1,8.6,7.8,25.8,13.7,15.6,9.5,31.6,8.6,5.3,19.9,9.2,7.7,8.8,29.6,12,15.4,6.8,7.5,13.6,9.1,5.7,10.7,16,8.3,9,10.8,8.3,6.2,7.7,4.9,12,10,5.4,12,13.7,13.4,8.2,5.2,16.3,11.1,10.4,8.7,10.1,9.7,4.6,6.7,8.2,7.8,12.9,10.1,11,5.5,16.6,9.5,28.4,12.8,7.6,15.2,9,6.3,9.3,6.8,10.7,11.7,7.3,11.6,6,17.3,18.1,8,8.6,7.8,11.1,13.1,8,15.9,7.1,5.6,6.5,7.1,8.6,9.2,13.4,14,11.4,11.4,6.3,13.6,7.2,4.8,10.1,9,8.4,9.4,10.4,4.2,9.8,9.6,5.5,8.6,13.6,12,7.6,10.4,8.8,6.3],"attr1":[75.6,100,61.7,100,42.7,100,64.6,75.2,47,66.2,16.1,57.9,100,65.6,80.6,63.2,72.3,73.4,100,47.1,52.1,68.5,43.6,100,5.1,13.7,77.4,57.8,17.6,100,4.4,58.6,5.8,64.6,59.4,30.6,62,76.1,100,48.4,96.5,100,58,2.5,70.7,72.6,10,26.7,52.8,100,89.1,70,64.2,100,100,53.9,36.1,93.7,87.2,4.2,100,100,20.3,79.7,55.4,75.7,13.6,88.5,81.1,100,67.8,95.8,73.8,100,76,20.9,63.4,78,100,65.1,100,53.8,100,81.9,63.6,100,52.9,78.2,32.9,100,100,47.6,78.6,65.9,100,65.6,100,100,82.3,100,56.2,75.1,98.6,73,89,3.2,76,95.2,100,93.7,61.3,100,74.4,100,66.5,56.5,66.5,100,100,53.5,9.9,59.2,100,79.3,69.4,53.6,64.5,100,45.4,97.9,100,79.3,100,72.6,50.3,55.2,51.1,35.
7,100,53.3,44,44.5,100,100,65.3,44.8,61.2,54.2,100,67.1,59.9,100,100,100,70,100,59.6,100,71.1],"attr2":[19.9,26,24.1,24.8,17.5,15.1,14.7,10.7,22,19.3,19.2,18.3,18.2,25.9,13.2,27.5,30.3,15.6,31.8,11.5,24.1,14.4,12,18.3,17.2,10.4,14.6,6.1,27,35.7,8.6,26.4,5.6,22.5,22.8,6.6,22.4,11.4,14,29,14.6,12.8,23.3,9.9,21.8,32.9,24.4,6.6,31.4,14.6,12.7,19.7,25.7,25.4,17.2,2.6,13.6,6.8,16.5,18.4,16.6,16.8,14.3,11.1,22.3,25.1,4,11.6,10.6,30.1,14.4,13.7,14.2,19.1,6.1,10.6,27.2,14.1,17.4,18.8,31.3,27.8,22.2,10.8,16.3,25.9,20.5,12.6,17.2,17.8,23.7,19.9,15.3,21.6,22.3,29.2,15.7,28.2,22.4,22.1,28.7,13.8,24.5,15,11.3,18.6,14.4,7.9,16.2,8.8,24,12.8,21.3,13.4,16.3,24.3,16.4,33,13.6,35.9,18.2,6.2,19.9,22.9,29.1,15.6,17,31.4,24.8,24.9,31.9,21.9,29.5,27.3,29.1,22.6,22.9,24,14,27.1,16.3,31.3,26,18.3,14.7,12.8,13.2,21.1,32.6,21.6,21.2,22.5,30.3,12.5,11.1,28.6,22.6,15.3,26.2],"attr3":[20.76,26.86,15.42,51.67,42.39,3.49,11.44,9.21,31.33,11.62,41.68,22.36,4.58,41.47,14.85,25.95,52.19,35.48,58.89,20.19,30.94,15.46,0.91,27.05,38.02,30.94,8.61,1.77,26.23,60.76,23.82,27.29,9.84,25.46,24.16,10.93,29.94,22.59,30.66,40.66,0.35,0.29,39.47,42.23,27.64,48.98,50.15,7.63,44.09,11.48,14.03,29.99,32.58,33.88,0.03,5.13,13.56,0,9.89,49.92,0.26,12.69,25.57,3.78,31.5,49.89,5.11,5.42,8.48,79.64,6.47,25.49,20.41,13.38,10.24,21.8,30.5,9.58,34.8,15.36,55.92,41.51,33.89,25.6,34.03,26.58,33.32,19.22,39.15,38.19,21.75,31.88,1.41,36.38,43.34,58.72,8.32,41.32,44.62,27.48,47.91,31.78,28.27,34.74,0.26,37.95,22.35,7.37,24.74,3.94,47.53,1.48,11.69,20.04,14.3,32.46,32.79,49.93,0.35,58.17,41.96,8.03,34.09,44.69,32.74,29.08,11.81,63.46,46.53,62.34,61.36,29.19,43.21,34.45,59.9,37.93,26.68,23.38,0,33.1,30.03,40.66,45.93,0.1,27.78,3.73,18.37,25.88,60.23,51.86,19.45,50.2,30.06,2.59,4.06,31.76,45.94,41.99,30.71],"rowid":[13001,13003,13005,13007,13009,13011,13013,13015,13017,13019,13021,13023,13025,13027,13029,13031,13033,13035,13037,13039,13043,13045,13047,13049,13051,13053,13055,13057,13059,13061,13063,13065,13067,13069,13071,13073,13
075,13077,13079,13081,13083,13085,13087,13089,13091,13093,13095,13097,13099,13101,13103,13105,13107,13109,13111,13113,13115,13117,13119,13121,13123,13125,13127,13129,13131,13133,13135,13137,13139,13141,13143,13145,13147,13149,13151,13153,13155,13157,13159,13161,13163,13165,13167,13169,13171,13173,13175,13177,13179,13181,13183,13185,13187,13189,13191,13193,13195,13197,13199,13201,13205,13207,13209,13211,13213,13215,13217,13219,13221,13223,13225,13227,13229,13231,13233,13235,13237,13239,13241,13243,13245,13247,13249,13251,13253,13255,13257,13259,13261,13263,13265,13267,13269,13271,13273,13275,13277,13279,13281,13283,13285,13287,13289,13291,13293,13295,13297,13299,13301,13303,13305,13307,13309,13311,13313,13315,13317,13319,13321]}] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json new file mode 100644 index 0000000..4e73b79 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json @@ -0,0 +1 @@ +{"y_coord": [3521764, 3471916, 3502787, 3474765, 3665553, 3807616, 3769623, 3793408, 3520432, 3466377, 3636468, 3595691, 3463849, 3421638, 3554982, 3600493, 3671394, 3684504, 3492653, 3437933, 3598842, 3713250, 3862318, 3432769, 3556747, 3577608, 3813760, 3794110, 3762905, 3495219, 3711726, 3428340, 3757187, 3492465, 3455994, 3722100, 3449007, 3694344, 3623343, 3537103, 3854592, 3812502, 3421800, 3735253, 3569933, 3564188, 3494367, 3731361, 3467152, 3401148, 3596117, 3785425, 3616602, 3571315, 3866604, 3700612, 3789664, 3789005, 3813323, 3733248, 3844809, 3685752, 3471063, 3822135, 3421817, 3722330, 3764306, 3839931, 3803344, 3689861, 3740622, 3624790, 3810303, 3685569, 3699716, 3590553, 3506293, 3783949, 3695092, 3530869, 3668080, 3640263, 3624562, 3660143, 3663959, 3439981, 3599291, 3520161, 3537225, 3752562, 3517834, 3419313, 3832429, 3716368, 3500535, 3584821, 3785405, 3584393, 3660275, 3451034, 3453930, 3660608, 3568473, 3717990, 3854188, 
3598228, 3719734, 3750903, 3756777, 3758093, 3609091, 3812828, 3482044, 3665561, 3764766, 3567447, 3695254, 3524124, 3864805, 3519627, 3697862, 3729605, 3570222, 3641918, 3422002, 3685029, 3827075, 3552857, 3551752, 3623162, 3717493, 3560039, 3608179, 3534470, 3522636, 3421725, 3487715, 3567586, 3872640, 3595170, 3660254, 3514927, 3623868, 3858779, 3639192, 3842167, 3742691, 3446675, 3699878, 3648583, 3494323, 3544716, 3563384, 3841086, 3855274, 3538547, 3749769, 3637891, 3487328], "influence": [0.041718, 0.093454, 0.10983, 0.118198, 0.097548, 0.059443, 0.041031, 0.032462, 0.058498, 0.100714, 0.170747, 0.082058, 0.184081, 0.037431, 0.131419, 0.11251, 0.101114, 0.047942, 0.113107, 0.181309, 0.05591, 0.037814, 0.109586, 0.130853, 0.237285, 0.172302, 0.036228, 0.064756, 0.443808, 0.13641, 0.141066, 0.076699, 0.150241, 0.032761, 0.051816, 0.223754, 0.029353, 0.06002, 0.105169, 0.076247, 0.069329, 0.051965, 0.039944, 0.20308, 0.046086, 0.098109, 0.194576, 0.093258, 0.093739, 0.186424, 0.178352, 0.036825, 0.053419, 0.069314, 0.088385, 0.131412, 0.08002, 0.085975, 0.047994, 0.201466, 0.077724, 0.153041, 0.174215, 0.034634, 0.040242, 0.063712, 0.149308, 0.040139, 0.036003, 0.285484, 0.063911, 0.090216, 0.033014, 0.133651, 0.097922, 0.198633, 0.060055, 0.032494, 0.071734, 0.073892, 0.121394, 0.087385, 0.077991, 0.093922, 0.03193, 0.091298, 0.053937, 0.126429, 0.13414, 0.102608, 0.078631, 0.067484, 0.06222, 0.036613, 0.166179, 0.107871, 0.06172, 0.080811, 0.060716, 0.096678, 0.05549, 0.054726, 0.076288, 0.051877, 0.045589, 0.170792, 0.030476, 0.097062, 0.057117, 0.077883, 0.057165, 0.050752, 0.097715, 0.08115, 0.047653, 0.049173, 0.03286, 0.099923, 0.061466, 0.137244, 0.27255, 0.071571, 0.070927, 0.063635, 0.098482, 0.034063, 0.064541, 0.143175, 0.059673, 0.219599, 0.140317, 0.030782, 0.102154, 0.05155, 0.118173, 0.045636, 0.052966, 0.12225, 0.059551, 0.084581, 0.048937, 0.120127, 0.087659, 0.106444, 0.034626, 0.096799, 0.02894, 0.040209, 0.138238, 0.09661, 0.053615, 
0.116263, 0.159493, 0.050856, 0.035205, 0.123618, 0.061337, 0.156479, 0.044714], "x_coord": [941396.6, 895553, 930946.4, 745398.6, 849431.3, 819317.3, 803747.1, 699011.5, 863020.8, 859915.8, 809736.9, 844270.1, 979288.9, 827822, 1023145, 994903.4, 971593.8, 782448.2, 724741.2, 1008480, 964264.9, 678778.6, 670055.9, 962612.3, 1059706, 704959.2, 653026.6, 734240.9, 832508.6, 695793.9, 745538.8, 908046.1, 724646.8, 894463.9, 808691.8, 942527.9, 839816.1, 705457.9, 783416.5, 805648.4, 635964.3, 764386.1, 732628.4, 759231.9, 860451.4, 800031.3, 764116.9, 707288.7, 703495.1, 896654, 1031899, 879541.2, 943066.2, 981727.8, 739255.8, 731468.7, 662257.4, 765397.3, 845701.3, 733728.4, 732702.3, 908386.8, 1023411, 695325.1, 765058.1, 855577.3, 772634.6, 818917.1, 794419.5, 873518.8, 665933.8, 695500.6, 870749.9, 675280.4, 763488.4, 814118.9, 855461.8, 815753.1, 807249.1, 915741.9, 924108.1, 970465.7, 908636.7, 821367.1, 766461.7, 873804.3, 884830.4, 770455.5, 1014742, 919396.5, 1004544, 864781.1, 772600, 917730.9, 1030500, 777055.3, 848638.8, 732876.8, 715359.8, 716369.8, 766238.6, 790338.7, 920887.4, 825920.1, 707834.3, 700833.7, 793263.9, 830735.9, 863291.8, 695329.2, 798061.4, 733846.7, 953533.8, 744180.8, 668031.4, 833819.6, 840169.1, 686875.4, 824645.5, 712437.1, 954272.3, 777759, 752973.1, 1004028, 704495.6, 754916.2, 842085.9, 703256.8, 763457.1, 734217.9, 884376.9, 963427.8, 759410.8, 882069.4, 743031.8, 795506.2, 831682.3, 941734.4, 797981.7, 919077.6, 682616.8, 819399.6, 832935, 777040.1, 752165.2, 658870.4, 800384.3, 938349.6, 902471.1, 894704.3, 986832.8, 731576.3, 898776.3, 796905.6, 686891.4, 838551.5, 891228.5, 858796.9, 801018.1], "se_pctblack": [0.048422, 0.053382, 0.050307, 0.052233, 0.050361, 0.041694, 0.041354, 0.041423, 0.048378, 0.049213, 0.051088, 0.050376, 0.05714, 0.049579, 0.047733, 0.045694, 0.048206, 0.048881, 0.054756, 0.06, 0.044953, 0.046902, 0.043006, 0.057079, 0.048872, 0.056886, 0.043753, 0.039689, 0.044035, 0.057198, 0.046953, 0.054178, 
0.040812, 0.051914, 0.049325, 0.047751, 0.048778, 0.04959, 0.053515, 0.049339, 0.044162, 0.039811, 0.053106, 0.041446, 0.0497, 0.051743, 0.051662, 0.042636, 0.055382, 0.053312, 0.047102, 0.044382, 0.046127, 0.045928, 0.041238, 0.048361, 0.043512, 0.03973, 0.043382, 0.042789, 0.041214, 0.048842, 0.055459, 0.04163, 0.051566, 0.04629, 0.04035, 0.041668, 0.0405, 0.047923, 0.043671, 0.059377, 0.044651, 0.05022, 0.047643, 0.050203, 0.048129, 0.04155, 0.048144, 0.047513, 0.049365, 0.046377, 0.048207, 0.051755, 0.050068, 0.050653, 0.047949, 0.05133, 0.049083, 0.044645, 0.050489, 0.050633, 0.040368, 0.046813, 0.05234, 0.056291, 0.043751, 0.056776, 0.054586, 0.05369, 0.05193, 0.051624, 0.044863, 0.047448, 0.042099, 0.058191, 0.045691, 0.044437, 0.044577, 0.04214, 0.053326, 0.040489, 0.052861, 0.051687, 0.042796, 0.048223, 0.048837, 0.058596, 0.042367, 0.056893, 0.048682, 0.044234, 0.055933, 0.046173, 0.053802, 0.049959, 0.042715, 0.057894, 0.054198, 0.05754, 0.046272, 0.045435, 0.055455, 0.048073, 0.05567, 0.050858, 0.048796, 0.044856, 0.041348, 0.045268, 0.057257, 0.047627, 0.052411, 0.041008, 0.05606, 0.043488, 0.043092, 0.054819, 0.047066, 0.050849, 0.052717, 0.05641, 0.046776, 0.040768, 0.042667, 0.048373, 0.04495, 0.050613, 0.04877], "cooksd": [7.7e-05, 0.000315, 0.002225, 0.000205, 0.001606, 0.001427, 0.00652, 0.001942, 0.003764, 0.000481, 0.000267, 0.000157, 8.8e-05, 0.000123, 0.003019, 0.072736, 0.001193, 0.005313, 0.005343, 1e-06, 0.000363, 3.1e-05, 0.034878, 0.00106, 0.001396, 0.001463, 0.003457, 0.001192, 1.423109, 0.021445, 0.098779, 0.001687, 0.055873, 0.000531, 0.000136, 0.021004, 0.001615, 6.7e-05, 0.006305, 2e-05, 0.000228, 3.7e-05, 6.9e-05, 0.065765, 0.000349, 0.001329, 0.003647, 0.0419, 7e-06, 0.00795, 0.003791, 0.003857, 8.5e-05, 0.002004, 0.000484, 0.042878, 0.003828, 0.011277, 0.000442, 0.094025, 0.001259, 0.014043, 0.018378, 0.000812, 0.001675, 0.00019, 0.016271, 0.000218, 0.001723, 0.026378, 0.003864, 0.007504, 0.002657, 0.000982, 0.005258, 0.000183, 
9e-06, 0.002044, 0.004633, 0.000515, 0.00141, 0.003848, 0.002212, 8.9e-05, 0.001534, 5e-06, 0.000109, 0.006075, 0.001587, 0.000427, 0.000156, 0.006069, 4e-06, 0.000344, 0.001701, 1.1e-05, 0.000192, 0.000832, 0.002053, 0.000569, 0.001872, 0.000233, 0.005009, 0.000331, 0.004382, 0.003999, 0.00118, 0.170092, 0.006522, 0.002474, 0.005137, 0.00027, 0.001113, 0.000403, 0.004921, 5.5e-05, 0.000272, 0.001283, 0.002676, 0.008287, 0.003014, 0.000491, 7e-06, 6.4e-05, 3.7e-05, 0.002458, 0.000757, 0.000476, 0.004148, 0.002506, 0.002465, 0.00071, 0.00184, 1.5e-05, 0.004272, 0.000945, 0.003874, 0.000189, 0.002952, 0.006644, 0.000699, 0.004442, 0.0013, 0.005564, 0.002488, 0.024203, 0.00519, 1.8e-05, 3.8e-05, 0.000263, 2.9e-05, 0.007778, 0.008509, 0.006229, 0.000105, 0.002952, 0.001612, 8e-06, 0.000767], "est_pctrural": [-0.087919, -0.077996, -0.085464, -0.072676, -0.128431, -0.180965, -0.18567, -0.143921, -0.072048, -0.074505, -0.117008, -0.087278, -0.091904, -0.073817, -0.099557, -0.098698, -0.10867, -0.160167, -0.073158, -0.094154, -0.094601, -0.135079, -0.133974, -0.08942, -0.101838, -0.091672, -0.131022, -0.155214, -0.190065, -0.073282, -0.165429, -0.081867, -0.156664, -0.076699, -0.072464, -0.133921, -0.074071, -0.141117, -0.111564, -0.078856, -0.127798, -0.162809, -0.070745, -0.174179, -0.077704, -0.084767, -0.073785, -0.14867, -0.071717, -0.080666, -0.101715, -0.180059, -0.092867, -0.096093, -0.148798, -0.15388, -0.133349, -0.168574, -0.185284, -0.163127, -0.149285, -0.123165, -0.096677, -0.141254, -0.071164, -0.167284, -0.178183, -0.17299, -0.175146, -0.138416, -0.134046, -0.111022, -0.185981, -0.129588, -0.165865, -0.090566, -0.072647, -0.18459, -0.167686, -0.081112, -0.109965, -0.101055, -0.090609, -0.134897, -0.143216, -0.077016, -0.082642, -0.076247, -0.098465, -0.155026, -0.096848, -0.076702, -0.162008, -0.138472, -0.098386, -0.091171, -0.18779, -0.09308, -0.131121, -0.071531, -0.071437, -0.141435, -0.082891, -0.177965, -0.142118, -0.099834, -0.18257, -0.188049, 
-0.178213, -0.144304, -0.099787, -0.153281, -0.089241, -0.140148, -0.135427, -0.08363, -0.158342, -0.0768, -0.169776, -0.075752, -0.120551, -0.184296, -0.086448, -0.103888, -0.070842, -0.154438, -0.180631, -0.082675, -0.081094, -0.112567, -0.151714, -0.09336, -0.103733, -0.072526, -0.075826, -0.071792, -0.072662, -0.088969, -0.161521, -0.085392, -0.123629, -0.075361, -0.098113, -0.15897, -0.123605, -0.131963, -0.188348, -0.086523, -0.134341, -0.101212, -0.094024, -0.079862, -0.074241, -0.166676, -0.137364, -0.076497, -0.164552, -0.101861, -0.07352], "std_residual": [-0.162278, 0.213714, -0.518796, 0.151238, -0.470868, -0.580528, -1.508125, -0.929355, -0.95091, -0.253303, -0.139052, -0.162155, 0.076204, -0.217609, 0.545752, 2.926313, 0.397864, -1.254875, 0.790774, -0.006914, -0.30256, 0.108831, -2.056611, -0.32422, 0.258772, 0.32387, -1.171601, 0.506852, 5.159316, 1.423472, -2.996125, -0.550564, 2.171775, 0.483877, 0.193059, 1.042856, -0.892786, 0.125184, -0.894825, -0.060875, 0.213632, 0.099734, 0.157788, 1.962586, -0.328491, 0.426944, 0.474682, -2.46583, 0.031892, -0.719617, -0.510536, -1.227088, -0.150029, 0.633771, 0.272938, 2.056669, -0.810461, 1.337637, -0.361836, 2.358452, 0.472247, -1.07699, 1.140253, -0.581307, -0.772136, -0.20392, 1.176268, 0.278975, 0.829814, 0.99264, -0.919034, 1.062743, -1.077658, 0.308164, -0.850273, -0.104966, -0.046276, -0.953096, 0.945945, -0.310309, 0.390222, -0.774466, -0.62474, 0.113205, -0.833277, -0.027681, 0.168806, 0.791532, -0.390956, 0.236217, -0.165374, 1.118755, -0.031656, -0.36767, 0.356947, 0.036409, 0.208857, -0.375831, -0.688536, 0.281743, -0.689616, 0.244942, 0.95144, -0.300273, -1.170075, -0.538318, -0.748598, 4.85965, 1.267608, -0.661149, 1.124549, 0.274469, -0.391676, 0.261042, -1.211531, 0.126055, -0.345827, 0.415296, 0.780979, -0.881768, -0.346488, 0.308225, -0.037905, -0.118149, 0.071239, -1.020007, -0.404594, 0.206144, 0.987719, -0.364614, 0.474766, -0.577719, 0.491319, 0.06313, -0.68978, 0.543157, 1.016772, 
-0.142276, 0.834189, -1.035981, -0.450355, -0.696808, -0.449433, 0.834905, -1.01757, -1.835909, -1.612155, -0.080997, 0.059813, -0.191521, -0.086895, -0.939373, 0.818087, 1.31727, -0.206834, 0.558896, -0.606735, 0.025605, -0.494558], "localr2": [0.551117, 0.557455, 0.553851, 0.571077, 0.559486, 0.551175, 0.558752, 0.571809, 0.513439, 0.550571, 0.57839, 0.545373, 0.604611, 0.563673, 0.606627, 0.579241, 0.547193, 0.58401, 0.57804, 0.622744, 0.554506, 0.616314, 0.553322, 0.610492, 0.618849, 0.631907, 0.568832, 0.566261, 0.551402, 0.582397, 0.594597, 0.591202, 0.583349, 0.535237, 0.556551, 0.54599, 0.55833, 0.62123, 0.609202, 0.58125, 0.557535, 0.557277, 0.560922, 0.57934, 0.511668, 0.593458, 0.573619, 0.599358, 0.573775, 0.594004, 0.599749, 0.551664, 0.534209, 0.576381, 0.547637, 0.607718, 0.577489, 0.562801, 0.547044, 0.592063, 0.552589, 0.548079, 0.619311, 0.562128, 0.557048, 0.560955, 0.566841, 0.546595, 0.555068, 0.560605, 0.599587, 0.649273, 0.546138, 0.629902, 0.587461, 0.578893, 0.529977, 0.555579, 0.569569, 0.503741, 0.539692, 0.553734, 0.51914, 0.565167, 0.607287, 0.56818, 0.491248, 0.580526, 0.605081, 0.551278, 0.603162, 0.573075, 0.551941, 0.550795, 0.617381, 0.608093, 0.550716, 0.627532, 0.634592, 0.569713, 0.55971, 0.589995, 0.481273, 0.557708, 0.552413, 0.644127, 0.563542, 0.552622, 0.554574, 0.588412, 0.591719, 0.56091, 0.584391, 0.619238, 0.587011, 0.5735, 0.563046, 0.59661, 0.541754, 0.589691, 0.545224, 0.568126, 0.608463, 0.570086, 0.566278, 0.601112, 0.546778, 0.611534, 0.594733, 0.640115, 0.5595, 0.564123, 0.626729, 0.473791, 0.587694, 0.557015, 0.54805, 0.527237, 0.543414, 0.493954, 0.643286, 0.563722, 0.537914, 0.546488, 0.628564, 0.559385, 0.560098, 0.595942, 0.553755, 0.534857, 0.598634, 0.599113, 0.423182, 0.548624, 0.553793, 0.557453, 0.555125, 0.538754, 0.561673], "est_intercept": [18.375924, 18.039692, 18.173904, 18.612431, 25.027931, 28.868732, 29.126594, 26.73874, 17.332852, 18.009999, 23.331917, 18.575691, 18.853338, 18.212539, 
20.021869, 20.563701, 23.303807, 27.304692, 18.937685, 19.091389, 20.107213, 26.58407, 25.993181, 18.861404, 20.315761, 21.024737, 26.101723, 27.237578, 29.623695, 19.199889, 28.067985, 18.570921, 27.550537, 17.692965, 18.158333, 26.14995, 18.119124, 26.596568, 22.804616, 18.569581, 25.775193, 27.602714, 18.386703, 28.434523, 17.509755, 19.093217, 18.613133, 27.243553, 18.862486, 18.63039, 20.726368, 29.409853, 20.50841, 19.704834, 26.704982, 27.33758, 26.343322, 27.988326, 29.33117, 28.013628, 26.776764, 25.151079, 19.341762, 26.468527, 18.258714, 28.364577, 28.606646, 28.307209, 28.404865, 26.321627, 26.500457, 23.059413, 29.625733, 25.689204, 27.858068, 19.647124, 17.629684, 29.102336, 27.867154, 17.579579, 23.757825, 21.960048, 20.80086, 25.164234, 26.024579, 18.276315, 18.598483, 18.808846, 19.712953, 27.921286, 19.433808, 18.35095, 27.53054, 26.556594, 19.58565, 20.302644, 29.590949, 21.013811, 25.224396, 18.655811, 18.36884, 25.731843, 17.978451, 28.693695, 26.386878, 21.843074, 28.847315, 29.460662, 29.177378, 26.994593, 21.207599, 27.084437, 18.612524, 25.907755, 26.504604, 18.394926, 27.466413, 19.674459, 28.095647, 19.400471, 24.815945, 28.99667, 20.143981, 22.000899, 18.552729, 27.04672, 28.950415, 20.171, 19.436785, 23.078763, 27.520861, 19.256298, 22.034483, 16.895516, 19.123574, 18.208195, 18.006901, 18.743279, 27.490506, 18.866189, 24.730344, 18.225195, 20.965009, 27.320591, 24.200809, 25.991901, 29.279015, 18.656967, 26.15185, 22.653152, 19.058441, 19.682721, 16.844017, 27.838898, 26.164642, 18.0256, 28.516555, 22.169471, 18.263625], "yhat": [8.815245, 5.611921, 8.495724, 8.849965, 15.03242, 8.580509, 14.919817, 12.540446, 11.17349, 8.430319, 17.490415, 10.9017, 5.533408, 9.926865, 9.830106, 9.223105, 8.139072, 11.942142, 7.215765, 13.524229, 11.038573, 11.586549, 15.616062, 7.570655, 17.724733, 19.058832, 10.354608, 16.501607, 22.597924, 6.076758, 25.4543, 8.748904, 25.246368, 9.256918, 9.271919, 20.341504, 9.906595, 12.829942, 8.978318, 
10.226601, 7.201808, 8.223906, 11.101222, 25.914554, 9.242568, 7.929672, 15.350101, 21.093841, 9.282415, 7.213871, 9.392305, 12.664129, 9.665323, 6.232029, 6.790721, 18.376414, 16.710674, 10.647101, 10.867332, 23.437631, 6.843525, 9.138708, 15.886925, 11.412046, 10.629657, 9.564201, 25.398206, 10.941449, 12.244551, 3.550323, 10.943758, 9.674096, 13.204245, 4.58911, 13.827683, 16.363919, 8.473759, 12.630831, 7.270247, 9.456558, 4.783389, 10.565422, 7.223319, 11.582661, 13.175302, 5.502197, 11.364096, 10.834768, 14.808946, 7.333349, 5.814789, 12.115865, 11.218728, 11.797658, 7.437642, 9.966811, 8.916465, 5.995525, 9.284445, 7.162908, 10.395689, 11.977673, 6.558461, 12.132376, 9.927145, 18.498512, 12.354765, 10.515515, 8.032877, 10.058864, 10.970992, 7.964322, 7.740923, 8.330884, 11.379041, 10.22395, 13.017184, 5.774051, 8.669737, 9.172055, 18.444544, 16.949769, 8.141504, 9.042787, 7.538033, 14.982577, 14.615563, 7.260976, 12.190501, 8.347484, 3.895132, 8.702775, 5.296954, 8.361885, 11.708678, 11.344938, 10.167793, 11.916248, 8.266897, 10.138875, 15.30099, 9.731427, 6.462592, 7.04339, 12.872171, 15.157496, 15.552791, 10.707326, 3.984952, 10.505011, 9.927395, 8.920127, 5.695227, 8.629694, 12.786831, 5.573625, 12.676648, 8.708923, 8.172088], "est_pctpov": [-0.218522, -0.291285, -0.235007, -0.325567, -0.188146, -0.137907, -0.119547, -0.379195, -0.259626, -0.310736, -0.244202, -0.206076, -0.248679, -0.334842, -0.215786, -0.21054, -0.199793, -0.192097, -0.322443, -0.253757, -0.206276, -0.534973, -0.384352, -0.274742, -0.216591, -0.341741, -0.443096, -0.30198, -0.068634, -0.327487, -0.315412, -0.310284, -0.345864, -0.269059, -0.328491, -0.16491, -0.323102, -0.475082, -0.288485, -0.275601, -0.429848, -0.242496, -0.341276, -0.227016, -0.2048, -0.252217, -0.316204, -0.425002, -0.332753, -0.326117, -0.210497, -0.10558, -0.209458, -0.206017, -0.292783, -0.384946, -0.455325, -0.227329, -0.109925, -0.340684, -0.302287, -0.187858, -0.23682, -0.36714, -0.34067, -0.110955, -0.186993, 
-0.170485, -0.176104, -0.172523, -0.500233, -0.412376, -0.097125, -0.519982, -0.243051, -0.236613, -0.282387, -0.119102, -0.113713, -0.208408, -0.202582, -0.214505, -0.217432, -0.180505, -0.284279, -0.319038, -0.196772, -0.299851, -0.216323, -0.148542, -0.221665, -0.329237, -0.235629, -0.167632, -0.226774, -0.265047, -0.086238, -0.318679, -0.423909, -0.334852, -0.335002, -0.222813, -0.183316, -0.067768, -0.338818, -0.366487, -0.102857, -0.064893, -0.096511, -0.426293, -0.262799, -0.301715, -0.245085, -0.354426, -0.464528, -0.230167, -0.116111, -0.327569, -0.179779, -0.31768, -0.180967, -0.141807, -0.290041, -0.211583, -0.340959, -0.30384, -0.131568, -0.32466, -0.28365, -0.360965, -0.151989, -0.206422, -0.31246, -0.218596, -0.305719, -0.341804, -0.311581, -0.194575, -0.219397, -0.189617, -0.491018, -0.292708, -0.229657, -0.239476, -0.344539, -0.411469, -0.091658, -0.278733, -0.177206, -0.216008, -0.23146, -0.304909, -0.172899, -0.203986, -0.367377, -0.261962, -0.131495, -0.224866, -0.31454], "area_key": [13001, 13003, 13005, 13007, 13009, 13011, 13013, 13015, 13017, 13019, 13021, 13023, 13025, 13027, 13029, 13031, 13033, 13035, 13037, 13039, 13043, 13045, 13047, 13049, 13051, 13053, 13055, 13057, 13059, 13061, 13063, 13065, 13067, 13069, 13071, 13073, 13075, 13077, 13079, 13081, 13083, 13085, 13087, 13089, 13091, 13093, 13095, 13097, 13099, 13101, 13103, 13105, 13107, 13109, 13111, 13113, 13115, 13117, 13119, 13121, 13123, 13125, 13127, 13129, 13131, 13133, 13135, 13137, 13139, 13141, 13143, 13145, 13147, 13149, 13151, 13153, 13155, 13157, 13159, 13161, 13163, 13165, 13167, 13169, 13171, 13173, 13175, 13177, 13179, 13181, 13183, 13185, 13187, 13189, 13191, 13193, 13195, 13197, 13199, 13201, 13205, 13207, 13209, 13211, 13213, 13215, 13217, 13219, 13221, 13223, 13225, 13227, 13229, 13231, 13233, 13235, 13237, 13239, 13241, 13243, 13245, 13247, 13249, 13251, 13253, 13255, 13257, 13259, 13261, 13263, 13265, 13267, 13269, 13271, 13273, 13275, 13277, 13279, 13281, 13283, 
13285, 13287, 13289, 13291, 13293, 13295, 13297, 13299, 13301, 13303, 13305, 13307, 13309, 13311, 13313, 13315, 13317, 13319, 13321], "residual": [-0.615245, 0.788079, -1.895724, 0.550035, -1.73242, -2.180509, -5.719817, -3.540446, -3.57349, -0.930319, -0.490415, -0.6017, 0.266592, -0.826865, 1.969894, 10.676895, 1.460928, -4.742142, 2.884235, -0.024229, -1.138573, 0.413451, -7.516062, -1.170655, 0.875267, 1.141168, -4.454608, 1.898393, 14.902076, 5.123242, -10.7543, -2.048904, 7.753632, 1.843082, 0.728081, 3.558496, -3.406595, 0.470058, -3.278318, -0.226601, 0.798192, 0.376094, 0.598778, 6.785446, -1.242568, 1.570328, 1.649899, -9.093841, 0.117585, -2.513871, -1.792305, -4.664129, -0.565323, 2.367971, 1.009279, 7.423586, -3.010674, 4.952899, -1.367332, 8.162369, 1.756475, -3.838708, 4.013075, -2.212046, -2.929657, -0.764201, 4.201794, 1.058551, 3.155449, 3.249677, -3.443758, 3.925904, -4.104245, 1.11089, -3.127683, -0.363919, -0.173759, -3.630831, 3.529753, -1.156558, 1.416611, -2.865422, -2.323319, 0.417339, -3.175302, -0.102197, 0.635904, 2.865232, -1.408946, 0.866651, -0.614789, 4.184135, -0.118728, -1.397658, 1.262358, 0.133189, 0.783535, -1.395525, -2.584445, 1.037092, -2.595689, 0.922327, 3.541539, -1.132376, -4.427145, -1.898512, -2.854765, 17.884485, 4.767123, -2.458864, 4.229008, 1.035678, -1.440923, 0.969116, -4.579041, 0.47605, -1.317184, 1.525949, 2.930263, -3.172055, -1.144544, 1.150231, -0.141504, -0.442787, 0.261967, -3.882577, -1.515563, 0.739024, 3.709499, -1.247484, 1.704868, -2.202775, 1.803046, 0.238115, -2.508678, 2.055062, 3.832207, -0.516248, 3.133103, -3.838875, -1.70099, -2.531427, -1.662592, 3.05661, -3.872171, -6.757496, -6.152791, -0.307326, 0.215048, -0.705011, -0.327395, -3.420127, 2.904773, 4.970306, -0.786831, 2.026375, -2.276648, 0.091077, -1.872088], "se_pctpov": [0.115485, 0.126975, 0.118227, 0.10991, 0.106755, 0.130001, 0.128153, 0.129687, 0.122903, 0.120344, 0.105854, 0.108188, 0.131652, 0.119039, 0.120983, 0.118868, 0.116824, 
0.124573, 0.109529, 0.137167, 0.114816, 0.135058, 0.134571, 0.132733, 0.125196, 0.107398, 0.135228, 0.126003, 0.132212, 0.111889, 0.132315, 0.129999, 0.126709, 0.122888, 0.11624, 0.118168, 0.117511, 0.133364, 0.107538, 0.105176, 0.13661, 0.127189, 0.118856, 0.125138, 0.117719, 0.102601, 0.108322, 0.128055, 0.11285, 0.129822, 0.124218, 0.128278, 0.113438, 0.116881, 0.131403, 0.132619, 0.134311, 0.126518, 0.132408, 0.128477, 0.131258, 0.109249, 0.130866, 0.13096, 0.119307, 0.122192, 0.126621, 0.130642, 0.128241, 0.10831, 0.131083, 0.122289, 0.132841, 0.129917, 0.129134, 0.10135, 0.121095, 0.12867, 0.124665, 0.113515, 0.109758, 0.114253, 0.106067, 0.112681, 0.119755, 0.12324, 0.109178, 0.103299, 0.122218, 0.117998, 0.123027, 0.123298, 0.128914, 0.113454, 0.126838, 0.103437, 0.131109, 0.105346, 0.12633, 0.113005, 0.117147, 0.119013, 0.111667, 0.130562, 0.133131, 0.112803, 0.130715, 0.131731, 0.126917, 0.129851, 0.10397, 0.128916, 0.12393, 0.12372, 0.131301, 0.108238, 0.119382, 0.111591, 0.132555, 0.108586, 0.116635, 0.130817, 0.102402, 0.118168, 0.116027, 0.129284, 0.131295, 0.108254, 0.101675, 0.115852, 0.114644, 0.113088, 0.106211, 0.11992, 0.107638, 0.120813, 0.118432, 0.11174, 0.130834, 0.110938, 0.132968, 0.10945, 0.105169, 0.13064, 0.119565, 0.135161, 0.129666, 0.128515, 0.108749, 0.105576, 0.125065, 0.104974, 0.117933, 0.129239, 0.134044, 0.115963, 0.121461, 0.101262, 0.110659], "area_num": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 
138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158], "t_pctblack": [1.427054, 2.054089, 1.62247, 2.067223, -0.571001, -0.751165, -0.962027, 1.787476, 1.938852, 2.360389, 0.343293, 1.021487, 1.514925, 2.543305, 0.960576, 0.579333, -0.498492, -0.351502, 1.81343, 1.474091, 0.597696, 2.701643, 1.951858, 1.735149, 0.890002, 1.615912, 2.291372, 1.102801, -1.582224, 1.696627, 0.739203, 2.141897, 1.347938, 1.946313, 2.439857, -1.191861, 2.476966, 2.131924, 0.834293, 1.727777, 2.256296, 0.527207, 2.275422, 0.093091, 1.231076, 1.297547, 2.003273, 1.920576, 1.906397, 2.306169, 0.608413, -1.548925, 0.334071, 0.879985, 1.142487, 1.345685, 2.328186, 0.326093, -1.149781, 1.113164, 1.234299, -0.871034, 1.335771, 1.758505, 2.407358, -1.451449, -0.180277, -0.347764, -0.26002, -0.97909, 2.60129, 1.916199, -1.445025, 2.664429, 0.118778, 1.021292, 2.132797, -0.990281, -1.104017, 1.473619, -0.593063, 0.002792, 0.189985, -0.440808, 0.524102, 2.369783, 0.73308, 1.790638, 1.07006, -1.431878, 1.201306, 2.458042, 0.476659, -1.18185, 1.211263, 1.023896, -1.487726, 1.396634, 1.784685, 2.074325, 2.273394, -0.035106, 0.981109, -1.549049, 1.58959, 1.716912, -1.112541, -1.619114, -1.59564, 2.029086, 0.863052, 1.167835, 1.599251, 1.146231, 2.372841, 1.371699, -1.258477, 1.568643, -0.223734, 1.580033, -0.922473, -0.724635, 1.266086, 0.060804, 2.170711, 0.65643, -0.887132, 1.511118, 1.375946, 1.471151, -1.270003, 1.033424, 1.190796, 1.62997, 1.588052, 2.50093, 2.311368, 0.971615, 0.272884, 0.642506, 2.334434, 2.076718, 0.531731, 0.537312, 1.160625, 2.122713, -1.249899, 1.847558, -1.042728, -0.262048, 1.373168, 1.442209, 1.076834, 0.077966, 1.821677, 1.750865, -1.482242, 0.078119, 2.254575], "t_pctpov": [-1.892203, -2.294038, -1.987757, -2.962119, -1.762416, -1.060813, -0.932843, -2.923921, -2.112442, -2.582057, -2.306955, -1.904795, -1.888905, -2.812877, -1.783596, -1.771213, -1.710202, -1.542034, -2.943902, -1.84999, -1.796571, -3.96105, 
-2.856124, -2.069892, -1.730021, -3.181992, -3.276647, -2.396616, -0.519117, -2.926894, -2.383793, -2.386815, -2.729594, -2.189468, -2.825986, -1.395554, -2.749553, -3.562294, -2.682624, -2.620376, -3.146541, -1.906591, -2.871335, -1.814127, -1.739742, -2.458238, -2.919104, -3.318914, -2.948627, -2.512025, -1.694582, -0.823055, -1.846445, -1.762619, -2.228132, -2.902644, -3.390087, -1.796814, -0.830198, -2.651717, -2.303005, -1.719545, -1.809644, -2.80344, -2.855407, -0.908037, -1.476799, -1.304983, -1.373228, -1.592868, -3.816142, -3.372144, -0.731138, -4.002403, -1.88216, -2.334626, -2.331943, -0.925637, -0.912148, -1.835951, -1.845709, -1.877457, -2.049956, -1.601903, -2.373838, -2.588749, -1.802296, -2.902752, -1.769972, -1.258854, -1.801754, -2.670262, -1.827804, -1.477533, -1.787908, -2.56241, -0.657756, -3.025058, -3.355567, -2.963173, -2.859663, -1.872183, -1.641636, -0.519046, -2.544991, -3.248908, -0.786878, -0.492614, -0.760425, -3.282933, -2.527649, -2.340399, -1.977606, -2.864736, -3.537897, -2.126485, -0.972603, -2.935453, -1.356259, -2.925619, -1.551575, -1.084007, -2.832382, -1.790533, -2.938625, -2.350179, -1.002078, -2.999061, -2.789759, -3.115739, -1.325746, -1.825317, -2.941886, -1.822841, -2.840247, -2.829192, -2.630888, -1.741319, -1.676917, -1.709216, -3.692752, -2.674345, -2.1837, -1.833093, -2.881611, -3.044283, -0.706881, -2.168882, -1.6295, -2.045982, -1.850712, -2.904624, -1.466082, -1.57836, -2.740722, -2.259014, -1.082611, -2.220636, -2.842413], "t_intercept": [7.609379, 6.697503, 7.195672, 8.255795, 14.156497, 18.407904, 18.455024, 16.077957, 6.054346, 7.204967, 13.296838, 7.329656, 6.866246, 7.675677, 8.4897, 9.071485, 9.999358, 17.841186, 8.604366, 6.69702, 9.080628, 16.810189, 15.17167, 6.835291, 8.384399, 11.955242, 15.299161, 16.868109, 18.579766, 8.871642, 17.194655, 7.052968, 16.742201, 6.53967, 7.638512, 11.604368, 7.593943, 17.183962, 13.302195, 8.155776, 14.929177, 17.265368, 7.668111, 17.658053, 6.083667, 8.634759, 
8.295939, 16.901099, 8.408389, 7.259141, 8.810223, 17.697046, 9.122639, 8.7043, 16.183129, 17.280266, 15.592031, 17.414749, 18.485488, 16.905, 16.099393, 12.162038, 7.239849, 15.818031, 7.578958, 17.668373, 17.682992, 18.050671, 17.981138, 14.869228, 16.588831, 14.879699, 18.090369, 17.196475, 17.748172, 9.091638, 6.535975, 18.607913, 18.083927, 7.076811, 10.768372, 9.726515, 9.322075, 14.772789, 17.375394, 7.339344, 7.649891, 8.714896, 8.184484, 14.374787, 7.88758, 7.504212, 17.187921, 12.960213, 7.696175, 10.289304, 18.564107, 11.821484, 16.94767, 8.180987, 7.707789, 16.470342, 7.740734, 18.284333, 15.557935, 13.164332, 18.149077, 18.57868, 18.007492, 16.32635, 10.980793, 16.440536, 7.153743, 17.304261, 16.205781, 7.203743, 17.151505, 9.602895, 17.765752, 9.255965, 10.787743, 17.733848, 10.47511, 9.635323, 7.958089, 17.585678, 18.452111, 10.484603, 9.497019, 14.547858, 15.52837, 8.620986, 12.9729, 5.923095, 8.817958, 7.528327, 7.241212, 8.412973, 17.323629, 8.263278, 16.566306, 7.755803, 9.926525, 16.978309, 15.470937, 15.129371, 18.3484, 6.97004, 13.472636, 10.58946, 7.433559, 9.7771, 6.279439, 17.679091, 15.298295, 6.775579, 16.180906, 11.101535, 7.890733], "t_pctrural": [-4.164093, -3.455642, -3.984466, -3.502894, -6.433839, -9.00415, -9.330583, -7.273569, -3.216792, -3.500257, -5.878384, -3.709493, -3.960341, -3.505663, -4.520597, -4.513625, -4.899843, -8.51144, -3.51973, -3.918325, -4.387163, -6.760826, -6.474081, -3.862817, -4.494979, -4.783707, -6.350931, -8.039987, -9.289223, -3.501551, -8.415808, -3.635542, -8.063056, -3.433894, -3.469055, -5.926957, -3.549694, -7.252536, -5.86143, -3.892085, -6.085906, -8.344896, -3.287008, -8.992089, -3.202256, -4.180517, -3.586052, -7.6905, -3.396553, -3.617773, -4.518828, -8.679796, -4.198436, -4.462907, -7.393499, -7.965298, -6.544475, -8.638036, -8.9432, -8.304627, -7.421519, -5.740929, -4.173442, -7.05162, -3.322371, -8.713754, -9.045413, -8.630204, -8.865623, -7.075921, -6.790647, -5.812039, -8.758158, -6.627627, 
-8.67066, -4.359978, -3.354427, -9.25172, -8.815745, -3.810158, -4.97107, -4.671502, -4.036906, -6.657152, -7.832129, -3.570442, -3.507163, -3.796826, -4.452569, -7.373823, -4.376163, -3.569278, -8.212728, -6.485034, -4.339861, -4.626838, -9.107394, -4.892205, -6.95731, -3.39262, -3.353496, -7.438071, -3.829113, -9.139108, -6.952847, -5.251575, -9.25942, -9.319456, -8.890033, -7.332245, -4.988105, -7.76106, -4.047026, -7.595919, -6.829612, -3.826044, -8.286358, -3.730161, -8.37047, -3.692142, -5.351946, -9.213078, -4.448672, -4.779433, -3.320568, -8.197081, -8.853278, -4.155026, -4.096627, -6.011687, -7.675126, -4.450236, -5.547404, -3.188337, -3.686576, -3.351909, -3.445811, -4.216177, -8.09466, -3.813858, -6.277743, -3.697309, -4.365304, -7.969116, -6.606297, -6.362486, -9.425633, -3.841971, -6.526246, -4.606206, -4.214534, -3.997518, -3.167862, -8.434538, -6.669445, -3.479159, -8.106824, -4.771271, -3.595321], "se_pctrural": [0.021113, 0.022571, 0.021449, 0.020748, 0.019962, 0.020098, 0.019899, 0.019787, 0.022398, 0.021286, 0.019905, 0.023528, 0.023206, 0.021056, 0.022023, 0.021867, 0.022178, 0.018818, 0.020785, 0.024029, 0.021563, 0.01998, 0.020694, 0.023149, 0.022656, 0.019163, 0.02063, 0.019305, 0.020461, 0.020929, 0.019657, 0.022518, 0.01943, 0.022336, 0.020889, 0.022595, 0.020867, 0.019458, 0.019034, 0.020261, 0.020999, 0.01951, 0.021523, 0.01937, 0.024265, 0.020277, 0.020575, 0.019332, 0.021115, 0.022297, 0.022509, 0.020745, 0.022119, 0.021531, 0.020126, 0.019319, 0.020376, 0.019515, 0.020718, 0.019643, 0.020115, 0.021454, 0.023165, 0.020031, 0.02142, 0.019198, 0.019699, 0.020045, 0.019756, 0.019562, 0.01974, 0.019102, 0.021235, 0.019553, 0.019129, 0.020772, 0.021657, 0.019952, 0.019021, 0.021288, 0.022121, 0.021632, 0.022445, 0.020264, 0.018286, 0.02157, 0.023564, 0.020082, 0.022114, 0.021024, 0.022131, 0.021489, 0.019726, 0.021353, 0.02267, 0.019705, 0.02062, 0.019026, 0.018846, 0.021084, 0.021302, 0.019015, 0.021648, 0.019473, 0.02044, 0.01901, 
0.019717, 0.020178, 0.020046, 0.019681, 0.020005, 0.01975, 0.022051, 0.01845, 0.019829, 0.021858, 0.019109, 0.020589, 0.020283, 0.020517, 0.022525, 0.020004, 0.019432, 0.021736, 0.021334, 0.018841, 0.020403, 0.019898, 0.019795, 0.018725, 0.019767, 0.020979, 0.018699, 0.022747, 0.020568, 0.021418, 0.021087, 0.021102, 0.019954, 0.02239, 0.019693, 0.020383, 0.022476, 0.019948, 0.01871, 0.020741, 0.019983, 0.02252, 0.020585, 0.021973, 0.02231, 0.019978, 0.023436, 0.019761, 0.020596, 0.021987, 0.020298, 0.021349, 0.020449], "est_pctblack": [0.069101, 0.109652, 0.081621, 0.107978, -0.028756, -0.031319, -0.039784, 0.074043, 0.093798, 0.116161, 0.017538, 0.051458, 0.086563, 0.126093, 0.045851, 0.026472, -0.024031, -0.017182, 0.099296, 0.088445, 0.026868, 0.126713, 0.083942, 0.099041, 0.043496, 0.091923, 0.100254, 0.043769, -0.069673, 0.097044, 0.034708, 0.116045, 0.055012, 0.101042, 0.120346, -0.056913, 0.120821, 0.105723, 0.044647, 0.085246, 0.099642, 0.020989, 0.120838, 0.003858, 0.061184, 0.067138, 0.103493, 0.081887, 0.105581, 0.122948, 0.028657, -0.068744, 0.01541, 0.040416, 0.047114, 0.065078, 0.101303, 0.012956, -0.04988, 0.047631, 0.05087, -0.042543, 0.07408, 0.073207, 0.124138, -0.067188, -0.007274, -0.014491, -0.010531, -0.046921, 0.113601, 0.113778, -0.064522, 0.133808, 0.005659, 0.051272, 0.102649, -0.041146, -0.053152, 0.070016, -0.029277, 0.00013, 0.009159, -0.022814, 0.026241, 0.120036, 0.035151, 0.091913, 0.052522, -0.063926, 0.060652, 0.124459, 0.019242, -0.055326, 0.063397, 0.057636, -0.06509, 0.079295, 0.09742, 0.111371, 0.118058, -0.001812, 0.044016, -0.073499, 0.066919, 0.09991, -0.050833, -0.071948, -0.07113, 0.085506, 0.046023, 0.047284, 0.084538, 0.059245, 0.101547, 0.066148, -0.061461, 0.091916, -0.009479, 0.089892, -0.044908, -0.032054, 0.070816, 0.002808, 0.116789, 0.032794, -0.037893, 0.087485, 0.074573, 0.08465, -0.058766, 0.046954, 0.066035, 0.078357, 0.088406, 0.127193, 0.112785, 0.043583, 0.011283, 0.029085, 0.133664, 0.098907, 0.027869, 
0.022034, 0.065065, 0.092313, -0.053861, 0.101281, -0.049077, -0.013325, 0.072389, 0.081356, 0.05037, 0.003179, 0.077726, 0.084694, -0.066626, 0.003954, 0.109955], "se_intercept": [2.414905, 2.693495, 2.525672, 2.254469, 1.767947, 1.568279, 1.578247, 1.663068, 2.862878, 2.499664, 1.754697, 2.53432, 2.7458, 2.37276, 2.358372, 2.266851, 2.33053, 1.53043, 2.200939, 2.850729, 2.214298, 1.581426, 1.713271, 2.759415, 2.423043, 1.758621, 1.706088, 1.614738, 1.594406, 2.164187, 1.632367, 2.633065, 1.645574, 2.705483, 2.377208, 2.253458, 2.385997, 1.547755, 1.71435, 2.276863, 1.726498, 1.598733, 2.397814, 1.610286, 2.878158, 2.211205, 2.243644, 1.61194, 2.243294, 2.566473, 2.352536, 1.661851, 2.248079, 2.263805, 1.650174, 1.582012, 1.689538, 1.607162, 1.586713, 1.657121, 1.663216, 2.067999, 2.67157, 1.673314, 2.409132, 1.605387, 1.617749, 1.568208, 1.579703, 1.770208, 1.597488, 1.549723, 1.637652, 1.493865, 1.56963, 2.16101, 2.69733, 1.563976, 1.54099, 2.48411, 2.20626, 2.257751, 2.231355, 1.703418, 1.497784, 2.490184, 2.431209, 2.158241, 2.408576, 1.942379, 2.463849, 2.44542, 1.601738, 2.049086, 2.544855, 1.97318, 1.593987, 1.777595, 1.48837, 2.280386, 2.383153, 1.562314, 2.322577, 1.569305, 1.69604, 1.659262, 1.589465, 1.585724, 1.620291, 1.653437, 1.931336, 1.647418, 2.601788, 1.497189, 1.635503, 2.553523, 1.6014, 2.048805, 1.58145, 2.095997, 2.300383, 1.635103, 1.923033, 2.283359, 2.331305, 1.537997, 1.568949, 1.923869, 2.046619, 1.586403, 1.772296, 2.233654, 1.698501, 2.852481, 2.168708, 2.418624, 2.486725, 2.227902, 1.586879, 2.283136, 1.49281, 2.349879, 2.112019, 1.609147, 1.564276, 1.717976, 1.595726, 2.676737, 1.941109, 2.139217, 2.563838, 2.013145, 2.682408, 1.574679, 1.710298, 2.660378, 1.762358, 1.996974, 2.314566], "y": [8.2, 6.4, 6.6, 9.4, 13.3, 6.4, 9.2, 9.0, 7.6, 7.5, 17.0, 10.3, 5.8, 9.1, 11.8, 19.9, 9.6, 7.2, 10.1, 13.5, 9.9, 12.0, 8.1, 6.4, 18.6, 20.2, 5.9, 18.4, 37.5, 11.2, 14.7, 6.7, 33.0, 11.1, 10.0, 23.9, 6.5, 13.3, 5.7, 10.0, 8.0, 8.6, 11.7, 32.7, 
8.0, 9.5, 17.0, 12.0, 9.4, 4.7, 7.6, 8.0, 9.1, 8.6, 7.8, 25.8, 13.7, 15.6, 9.5, 31.6, 8.6, 5.3, 19.9, 9.2, 7.7, 8.8, 29.6, 12.0, 15.4, 6.8, 7.5, 13.6, 9.1, 5.7, 10.7, 16.0, 8.3, 9.0, 10.8, 8.3, 6.2, 7.7, 4.9, 12.0, 10.0, 5.4, 12.0, 13.7, 13.4, 8.2, 5.2, 16.3, 11.1, 10.4, 8.7, 10.1, 9.7, 4.6, 6.7, 8.2, 7.8, 12.9, 10.1, 11.0, 5.5, 16.6, 9.5, 28.4, 12.8, 7.6, 15.2, 9.0, 6.3, 9.3, 6.8, 10.7, 11.7, 7.3, 11.6, 6.0, 17.3, 18.1, 8.0, 8.6, 7.8, 11.1, 13.1, 8.0, 15.9, 7.1, 5.6, 6.5, 7.1, 8.6, 9.2, 13.4, 14.0, 11.4, 11.4, 6.3, 13.6, 7.2, 4.8, 10.1, 9.0, 8.4, 9.4, 10.4, 4.2, 9.8, 9.6, 5.5, 8.6, 13.6, 12.0, 7.6, 10.4, 8.8, 6.3]} \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json b/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json new file mode 100644 index 0000000..8f31c79 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json @@ -0,0 +1 @@ +[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 
10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}] \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/markov.json b/release/python/0.7.0/crankshaft/test/fixtures/markov.json new file mode 100644 index 0000000..d60e4e0 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/markov.json @@ -0,0 +1 @@ +[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 
0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 
0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/moran.json b/release/python/0.7.0/crankshaft/test/fixtures/moran.json new file mode 100644 index 0000000..2f75cf1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/moran.json @@ -0,0 +1,52 @@ +[[0.9319096128346788, "HH"], +[-1.135787401862846, "HL"], +[0.11732030672508517, "LL"], +[0.6152779669180425, "LL"], +[-0.14657336660125297, "LH"], +[0.6967858120189607, "LL"], +[0.07949310115714454, "HH"], +[0.4703198759258987, "HH"], +[0.4421125200498064, "HH"], +[0.5724288737143592, "LL"], +[0.8970743435692062, "LL"], +[0.18327334401918674, "LL"], +[-0.01466729201304962, "HL"], +[0.3481559372544409, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329988, "HH"], +[0.4373841193538136, "HH"], +[0.15971286468915544, "LL"], +[1.0543588860308968, "HH"], +[1.7372866900020818, "HH"], +[1.091998586053999, "LL"], +[0.1171572584252222, "HH"], +[0.08438455015300014, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329985, "HH"], +[1.1627044812890683, "HH"], +[0.06547094736902978, "LL"], +[0.795275137550483, "HH"], +[0.18562939195219, "LL"], +[0.3010757406693439, "LL"], +[2.8205795942839376, "HH"], +[0.11259190602909264, "LL"], +[-0.07116352791516614, "HL"], +[-0.09945240794119009, "LH"], 
+[0.18562939195219, "LL"], +[0.1832733440191868, "LL"], +[-0.39054253768447705, "HL"], +[-0.1672071289487642, "HL"], +[0.3337669247916343, "HH"], +[0.2584386102554792, "HH"], +[-0.19733845476322634, "HL"], +[-0.9379282899805409, "LH"], +[-0.028770969951095866, "LH"], +[0.051367269430983485, "LL"], +[-0.2172548045913472, "LH"], +[0.05136726943098351, "LL"], +[0.04191046803899837, "LL"], +[0.7482357030403517, "HH"], +[-0.014585767863118111, "LH"], +[0.5410013139159929, "HH"], +[1.0223932668429925, "LL"], +[1.4179402898927476, "LL"]] \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json new file mode 100644 index 0000000..055b359 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json @@ -0,0 +1,54 @@ +[ + {"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5}, + {"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7}, + {"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2}, + {"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1}, + {"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3}, + {"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05}, + {"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4}, + {"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7}, + {"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5}, + {"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04}, + {"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08}, + {"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2}, + {"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4}, + {"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2}, + {"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3}, + {"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4}, + {"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6}, + {"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3}, + {"neighbors": [42, 16, 28, 26, 40], 
"id": 19, "value": 0.7}, + {"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8}, + {"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1}, + {"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4}, + {"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1}, + {"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3}, + {"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4}, + {"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6}, + {"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3}, + {"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8}, + {"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3}, + {"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1}, + {"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9}, + {"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3}, + {"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4}, + {"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3}, + {"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3}, + {"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2}, + {"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5}, + {"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4}, + {"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6}, + {"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5}, + {"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4}, + {"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2}, + {"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3}, + {"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2}, + {"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3}, + {"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2}, + {"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3}, + {"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5}, + {"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2}, + {"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6}, + {"neighbors": [11, 29, 41, 14, 21], "id": 51, 
"value": 0.01}, + {"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01} + ] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json new file mode 100644 index 0000000..be367ff --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json @@ -0,0 +1 @@ +[{"neighbors": [3, 6, 7], "id": 1, "value": 1.624458}, {"neighbors": [10, 5, 8], "id": 2, "value": 2.2554919999999998}, {"neighbors": [1, 4, 7], "id": 3, "value": 1.4678899999999999}, {"neighbors": [9, 3, 5, 7], "id": 4, "value": 2.4842559999999998}, {"neighbors": [9, 2, 4, 10], "id": 5, "value": 0.0}, {"neighbors": [1, 11, 12, 7, 16], "id": 6, "value": 9.0486730000000009}, {"neighbors": [1, 3, 4, 6, 9, 11, 18, 19], "id": 7, "value": 6.0294889999999999}, {"neighbors": [2, 15, 10], "id": 8, "value": 1.8003849999999999}, {"neighbors": [4, 5, 7, 10, 13, 19, 20], "id": 9, "value": 4.581251}, {"neighbors": [2, 5, 8, 9, 13, 15, 17, 20, 21], "id": 10, "value": 3.7906070000000001}, {"neighbors": [18, 6, 7, 16], "id": 11, "value": 1.4474359999999999}, {"neighbors": [16, 6, 14], "id": 12, "value": 1.1919660000000001}, {"neighbors": [9, 10, 20], "id": 13, "value": 0.0}, {"neighbors": [12, 22, 16], "id": 14, "value": 1.608017}, {"neighbors": [17, 10, 23, 8], "id": 15, "value": 1.9498120000000001}, {"neighbors": [6, 11, 12, 14, 18, 22, 27, 28], "id": 16, "value": 0.74509000000000003}, {"neighbors": [10, 15, 21, 23, 26, 30], "id": 17, "value": 4.1733180000000001}, {"neighbors": [33, 7, 11, 16, 19, 27, 32], "id": 18, "value": 3.7832520000000001}, {"neighbors": [33, 7, 9, 18, 20, 24], "id": 19, "value": 2.0851359999999999}, {"neighbors": [9, 10, 13, 19, 21, 24], "id": 20, "value": 2.1763020000000002}, {"neighbors": [35, 10, 17, 20, 24, 26], "id": 21, "value": 6.3093469999999998}, {"neighbors": [28, 29, 14, 16], "id": 22, "value": 10.855743}, {"neighbors": [17, 25, 31, 30, 15], "id": 23, "value": 4.211354}, 
{"neighbors": [33, 19, 20, 21, 35], "id": 24, "value": 0.80481000000000003}, {"neighbors": [42, 31, 23], "id": 25, "value": 3.2153309999999999}, {"neighbors": [17, 34, 35, 21, 30], "id": 26, "value": 2.8336640000000002}, {"neighbors": [36, 39, 41, 16, 18, 28, 32], "id": 27, "value": 1.5920399999999999}, {"neighbors": [27, 36, 29, 22, 16], "id": 28, "value": 1.5711580000000001}, {"neighbors": [36, 28, 22, 38], "id": 29, "value": 3.1275900000000001}, {"neighbors": [34, 43, 17, 23, 26, 31], "id": 30, "value": 4.4168960000000004}, {"neighbors": [42, 43, 44, 23, 25, 30], "id": 31, "value": 3.0174859999999999}, {"neighbors": [33, 18, 27, 41], "id": 32, "value": 9.9242450000000009}, {"neighbors": [35, 37, 40, 41, 46, 18, 19, 24, 32], "id": 33, "value": 7.9739570000000004}, {"neighbors": [26, 35, 43, 45, 30], "id": 34, "value": 5.0054639999999999}, {"neighbors": [33, 34, 37, 40, 45, 21, 24, 26], "id": 35, "value": 2.4638909999999998}, {"neighbors": [38, 39, 47, 27, 28, 29], "id": 36, "value": 0.0}, {"neighbors": [33, 35, 40, 45, 46, 49, 51], "id": 37, "value": 7.377974}, {"neighbors": [36, 29, 47, 48], "id": 38, "value": 1.0038750000000001}, {"neighbors": [36, 41, 47, 50, 52, 27], "id": 39, "value": 3.1900469999999999}, {"neighbors": [33, 35, 37, 46], "id": 40, "value": 45.905405999999999}, {"neighbors": [33, 39, 46, 50, 27, 32], "id": 41, "value": 2.447597}, {"neighbors": [25, 44, 53, 31], "id": 42, "value": 1.2949580000000001}, {"neighbors": [34, 44, 45, 54, 59, 61, 30, 31], "id": 43, "value": 5.9330980000000002}, {"neighbors": [42, 43, 53, 54, 31], "id": 44, "value": 4.1339969999999999}, {"neighbors": [34, 35, 37, 43, 51, 59, 60], "id": 45, "value": 4.298311}, {"neighbors": [33, 37, 40, 41, 49, 50, 57], "id": 46, "value": 27.483827000000002}, {"neighbors": [36, 38, 39, 48, 52, 55, 56], "id": 47, "value": 0.96979099999999996}, {"neighbors": [55, 38, 47], "id": 48, "value": 0.0}, {"neighbors": [57, 51, 37, 46, 63], "id": 49, "value": 2.934466}, {"neighbors": [39, 41, 46, 
52, 57, 58], "id": 50, "value": 4.4564269999999997}, {"neighbors": [37, 45, 49, 60, 63, 64], "id": 51, "value": 4.629264}, {"neighbors": [39, 47, 50, 56, 58, 62], "id": 52, "value": 4.9415329999999997}, {"neighbors": [65, 42, 44, 54], "id": 53, "value": 3.9900410000000002}, {"neighbors": [65, 61, 43, 44, 53], "id": 54, "value": 2.064324}, {"neighbors": [56, 47, 48], "id": 55, "value": 3.0402529999999999}, {"neighbors": [52, 55, 47, 62], "id": 56, "value": 3.905411}, {"neighbors": [66, 67, 46, 49, 50, 58, 63], "id": 57, "value": 4.3328389999999999}, {"neighbors": [57, 50, 52, 62, 66], "id": 58, "value": 3.8941110000000001}, {"neighbors": [69, 70, 43, 45, 60, 61], "id": 59, "value": 6.8287940000000003}, {"neighbors": [51, 64, 45, 59, 70], "id": 60, "value": 3.2639469999999999}, {"neighbors": [65, 69, 72, 43, 54, 59], "id": 61, "value": 3.2821630000000002}, {"neighbors": [58, 68, 52, 66, 56], "id": 62, "value": 3.2957619999999999}, {"neighbors": [49, 57, 51, 67, 64], "id": 63, "value": 7.2496790000000004}, {"neighbors": [67, 70, 71, 51, 60, 63], "id": 64, "value": 3.041846}, {"neighbors": [61, 53, 54, 72], "id": 65, "value": 1.618018}, {"neighbors": [67, 68, 73, 76, 57, 58, 62], "id": 66, "value": 4.9108010000000002}, {"neighbors": [66, 71, 73, 75, 76, 57, 63, 64], "id": 67, "value": 1.991457}, {"neighbors": [73, 66, 62], "id": 68, "value": 3.1461920000000001}, {"neighbors": [70, 72, 74, 77, 59, 61], "id": 69, "value": 7.2666500000000003}, {"neighbors": [69, 71, 74, 78, 59, 60, 64], "id": 70, "value": 3.1109040000000001}, {"neighbors": [67, 75, 70, 78, 64], "id": 71, "value": 2.9802710000000001}, {"neighbors": [65, 69, 61, 77], "id": 72, "value": 3.8667669999999998}, {"neighbors": [76, 66, 67, 68], "id": 73, "value": 1.8684080000000001}, {"neighbors": [77, 69, 70, 78], "id": 74, "value": 12.577033999999999}, {"neighbors": [67, 76, 78, 71], "id": 75, "value": 7.8035990000000002}, {"neighbors": [73, 66, 67, 75], "id": 76, "value": 3.4714900000000002}, {"neighbors": [74, 
69, 72], "id": 77, "value": 4.334822}, {"neighbors": [74, 75, 70, 71], "id": 78, "value": 8.4515370000000001}] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json new file mode 100644 index 0000000..45a20e7 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json @@ -0,0 +1 @@ +[{"neighbors": [10, 7, 21, 23, 1], "y1995": 0.87654416055651474, "y1997": 0.85637566664752718, "y1996": 0.8631470006766887, "y1999": 0.84461540228037335, "y1998": 0.84811668329242784, "y2006": 0.86302631339545688, "y2007": 0.86148266513456728, "y2004": 0.86416611731111015, "y2005": 0.87119374831581786, "y2002": 0.85012592862683589, "y2003": 0.8550965633336135, "y2000": 0.83271652434603094, "y2001": 0.83786313566577242, "id": 0, "y2008": 0.86252252380501315, "y2009": 0.86746356478544273}, {"neighbors": [5, 7, 22, 29, 3], "y1995": 0.91889509774542122, "y1997": 0.92333257900976462, "y1996": 0.91757931190043385, "y1999": 0.92552387732371888, "y1998": 0.92517289327379471, "y2006": 0.91706053906277052, "y2007": 0.90139504820726424, "y2004": 0.89815175749309051, "y2005": 0.91832090781161113, "y2002": 0.89431990798552208, "y2003": 0.88924793576523797, "y2000": 0.90746978227271013, "y2001": 0.89830489127332913, "id": 1, "y2008": 0.87897455159080617, "y2009": 0.86216858051752643}, {"neighbors": [11, 8, 13, 18, 17], "y1995": 0.82591007476914713, "y1997": 0.81989792988843901, "y1996": 0.82548595539161707, "y1999": 0.81731522200916285, "y1998": 0.81503235035017918, "y2006": 0.81814804358939286, "y2007": 0.83675961003285626, "y2004": 0.82668195534569056, "y2005": 0.82373723764184559, "y2002": 0.80849979516360859, "y2003": 0.82258550658074148, "y2000": 0.78964559168205917, "y2001": 0.8058444152731008, "id": 2, "y2008": 0.8357419865626442, "y2009": 0.84647177436289112}, {"neighbors": [4, 14, 9, 5, 12], "y1995": 1.0908817638059434, "y1997": 1.0845641754849344, "y1996": 
1.0853768890893893, "y1999": 1.098988414417104, "y1998": 1.0841540389418189, "y2006": 1.1316479722785828, "y2007": 1.1295850763954971, "y2004": 1.1139980568106316, "y2005": 1.1216802898290368, "y2002": 1.1116069731657288, "y2003": 1.1088862051501811, "y2000": 1.1450694824791507, "y2001": 1.1215113292620285, "id": 3, "y2008": 1.1137181812756343, "y2009": 1.0993677488645406}, {"neighbors": [14, 3, 9, 31, 12], "y1995": 1.1073144618319228, "y1997": 1.1328363804627946, "y1996": 1.1137394350312471, "y1999": 1.1591002514611153, "y1998": 1.144725587086376, "y2006": 1.1173646811350333, "y2007": 1.1086324218539598, "y2004": 1.1102496406140896, "y2005": 1.11943471361418, "y2002": 1.1475230282561595, "y2003": 1.1184328424005199, "y2000": 1.1689820101690329, "y2001": 1.1721248787169682, "id": 4, "y2008": 1.0964251552643696, "y2009": 1.0776233718455337}, {"neighbors": [29, 1, 22, 7, 4], "y1995": 1.422697571371182, "y1997": 1.4427350196405593, "y1996": 1.4211843379728528, "y1999": 1.4440068434166562, "y1998": 1.4357757095632602, "y2006": 1.4405276647793266, "y2007": 1.4524121586440921, "y2004": 1.4059372049179741, "y2005": 1.4078864636665769, "y2002": 1.4197822680667809, "y2003": 1.3909220829548647, "y2000": 1.4418473669388905, "y2001": 1.4478283203013527, "id": 5, "y2008": 1.4330609762040207, "y2009": 1.4174430982377491}, {"neighbors": [12, 47, 9, 25, 20], "y1995": 1.1307388498039153, "y1997": 1.1107470843142355, "y1996": 1.1311051255854685, "y1999": 1.130881491772973, "y1998": 1.1336463608751246, "y2006": 1.1088003408832796, "y2007": 1.0840170924825394, "y2004": 1.1244623853593112, "y2005": 1.1167100811401538, "y2002": 1.1306293052597198, "y2003": 1.1194498381213465, "y2000": 1.1088813841947593, "y2001": 1.1185662918783175, "id": 6, "y2008": 1.0695920556329086, "y2009": 1.0787522517402164}, {"neighbors": [21, 1, 22, 10, 0], "y1995": 1.0470612357366649, "y1997": 1.0425337165747406, "y1996": 1.0451683097376836, "y1999": 1.0207254480945218, "y1998": 1.0323998680588111, "y2006": 
1.0405109962442973, "y2007": 1.0174964540280445, "y2004": 1.0140090547678748, "y2005": 1.0317674181861733, "y2002": 0.99669586934394627, "y2003": 0.99327675611171373, "y2000": 0.99854316295509526, "y2001": 0.98802579761429143, "id": 7, "y2008": 0.9936394033949828, "y2009": 0.98279746069218921}, {"neighbors": [11, 13, 17, 18, 15], "y1995": 0.98996985668705595, "y1997": 0.99491000469481983, "y1996": 1.0014356415938011, "y1999": 1.0045584503565237, "y1998": 1.0018840754492748, "y2006": 0.92232873520447411, "y2007": 0.91284090705064902, "y2004": 0.93694786512729977, "y2005": 0.94308212820743131, "y2002": 0.96834820215592055, "y2003": 0.95335147249088092, "y2000": 0.99127006477048718, "y2001": 0.97925917470464008, "id": 8, "y2008": 0.89689832627117483, "y2009": 0.88928857608264111}, {"neighbors": [12, 6, 4, 3, 14], "y1995": 0.87418390853652306, "y1997": 0.84425695187978567, "y1996": 0.86416601430334228, "y1999": 0.83903043942542854, "y1998": 0.8404493987171674, "y2006": 0.87204140839730271, "y2007": 0.86633032299764789, "y2004": 0.86981997840756087, "y2005": 0.86837929279319737, "y2002": 0.86107306112852877, "y2003": 0.85007719735663123, "y2000": 0.85787080050645603, "y2001": 0.86036185149249467, "id": 9, "y2008": 0.84946077011565357, "y2009": 0.83287145944123797}, {"neighbors": [0, 7, 21, 23, 22], "y1995": 1.1419611801631209, "y1997": 1.1489271154554144, "y1996": 1.146602624490825, "y1999": 1.1443662376135306, "y1998": 1.1490959392942743, "y2006": 1.1049125811637337, "y2007": 1.1105984164317646, "y2004": 1.1119989015058092, "y2005": 1.1025779214946556, "y2002": 1.1259666377127024, "y2003": 1.1221399558345004, "y2000": 1.144501826035474, "y2001": 1.1234975172649961, "id": 10, "y2008": 1.1050979494645479, "y2009": 1.1002009697391872}, {"neighbors": [8, 13, 18, 17, 2], "y1995": 0.97282462974938089, "y1997": 0.96252588061647382, "y1996": 0.96700147279313231, "y1999": 0.96057686787383312, "y1998": 0.96538780087103548, "y2006": 0.91010201260822066, "y2007": 
0.89280392121658247, "y2004": 0.94103988614185807, "y2005": 0.9212251863828258, "y2002": 0.94804194711420009, "y2003": 0.9543028555845573, "y2000": 0.95831051250950716, "y2001": 0.94480908623936988, "id": 11, "y2008": 0.89298242828382146, "y2009": 0.89165384824292859}, {"neighbors": [33, 9, 6, 25, 31], "y1995": 0.94325467991401402, "y1997": 0.96455242154753429, "y1996": 0.96436902092427723, "y1999": 0.94117647058823528, "y1998": 0.95243008993884537, "y2006": 0.9346681464882507, "y2007": 0.94281559150403071, "y2004": 0.96918424441756057, "y2005": 0.94781280876672958, "y2002": 0.95388717527096822, "y2003": 0.94597005193649519, "y2000": 0.94809269652332606, "y2001": 0.93539181553564288, "id": 12, "y2008": 0.965203150896216, "y2009": 0.967154410723015}, {"neighbors": [18, 17, 11, 8, 19], "y1995": 0.97478408425654373, "y1997": 0.98712808751954773, "y1996": 0.98169225257738801, "y1999": 0.985598971191053, "y1998": 0.98474769442356791, "y2006": 0.98416665248276058, "y2007": 0.98423613480079708, "y2004": 0.97399471186978948, "y2005": 0.96910087128357136, "y2002": 0.9820996926750224, "y2003": 0.98776529543110569, "y2000": 0.98687072733199255, "y2001": 0.99237486444837619, "id": 13, "y2008": 0.99823861244053191, "y2009": 0.99545704236827348}, {"neighbors": [4, 31, 3, 29, 12], "y1995": 0.85570268988941878, "y1997": 0.85986131704895119, "y1996": 0.85575915188345031, "y1999": 0.85380119644969055, "y1998": 0.85693406055397725, "y2006": 0.82803647591954255, "y2007": 0.81987360180979219, "y2004": 0.83998883284341452, "y2005": 0.83478547261894065, "y2002": 0.85472102128186755, "y2003": 0.84564834502399988, "y2000": 0.86191535266765262, "y2001": 0.84981450830432048, "id": 14, "y2008": 0.82265395167873867, "y2009": 0.83994039782937002}, {"neighbors": [19, 8, 17, 16, 13], "y1995": 0.87022046646521634, "y1997": 0.85961813213722393, "y1996": 0.85996258309339635, "y1999": 0.8394713575455558, "y1998": 0.85689572413110093, "y2006": 0.94202108334913126, "y2007": 0.94222309998743192, 
"y2004": 0.86763340229291142, "y2005": 0.89179316746010362, "y2002": 0.86776297543511893, "y2003": 0.86720209304280604, "y2000": 0.82785596604704892, "y2001": 0.86008789452656809, "id": 15, "y2008": 0.93902708112840494, "y2009": 0.94479183757120588}, {"neighbors": [28, 26, 15, 19, 32], "y1995": 0.90134907329491731, "y1997": 0.90403990934606904, "y1996": 0.904077381347274, "y1999": 0.90399237579083946, "y1998": 0.90201769385650832, "y2006": 0.91108803862404764, "y2007": 0.90543476309316473, "y2004": 0.94338264626469681, "y2005": 0.91981795862151561, "y2002": 0.93695966482853577, "y2003": 0.94242697007039, "y2000": 0.90906631602055099, "y2001": 0.92693339421265908, "id": 16, "y2008": 0.91737137682250491, "y2009": 0.94793657442067902}, {"neighbors": [13, 18, 11, 19, 8], "y1995": 1.1977611005602815, "y1997": 1.1843915817489725, "y1996": 1.1822256425225894, "y1999": 1.1928672308275252, "y1998": 1.1826786457339149, "y2006": 1.2392938410349985, "y2007": 1.2341867605077472, "y2004": 1.2385704217423759, "y2005": 1.2441989281116201, "y2002": 1.2262477774195681, "y2003": 1.2239707531714479, "y2000": 1.2017286912636342, "y2001": 1.2132869128474402, "id": 17, "y2008": 1.2362673914436095, "y2009": 1.2675439750795283}, {"neighbors": [13, 17, 11, 8, 19], "y1995": 1.2491967813733067, "y1997": 1.2699116090397236, "y1996": 1.2575477330927329, "y1999": 1.3062566740535762, "y1998": 1.2802065055312271, "y2006": 1.3210776560048689, "y2007": 1.329362443219563, "y2004": 1.3054484140490119, "y2005": 1.3030330249408666, "y2002": 1.3257518058685978, "y2003": 1.3079549159235695, "y2000": 1.3479002255103918, "y2001": 1.3439986302151703, "id": 18, "y2008": 1.3300124123891741, "y2009": 1.3328846185074705}, {"neighbors": [26, 17, 28, 15, 16], "y1995": 1.0676800411188558, "y1997": 1.0363730321443168, "y1996": 1.0379927554499979, "y1999": 1.0329609259280523, "y1998": 1.027684488045026, "y2006": 0.94241549375546196, "y2007": 0.92754546923532677, "y2004": 0.99614160423102482, "y2005": 
0.97356208269708677, "y2002": 1.0274762326434594, "y2003": 1.0316273366809443, "y2000": 1.0505901631347052, "y2001": 1.0340505678899605, "id": 19, "y2008": 0.92549226593721745, "y2009": 0.92138101880290568}, {"neighbors": [30, 25, 24, 37, 47], "y1995": 1.0947561397632881, "y1997": 1.1165429913770684, "y1996": 1.1152679554712275, "y1999": 1.1314326394231322, "y1998": 1.1310394841195361, "y2006": 1.1090538904302065, "y2007": 1.1057776900012568, "y2004": 1.1402994437897009, "y2005": 1.1197940058085571, "y2002": 1.133670175399079, "y2003": 1.139822558851451, "y2000": 1.1388962186541665, "y2001": 1.1244221220249986, "id": 20, "y2008": 1.1116682481010467, "y2009": 1.0998515545336902}, {"neighbors": [23, 22, 7, 10, 34], "y1995": 0.76530058421804126, "y1997": 0.76542450966153397, "y1996": 0.76612841163904621, "y1999": 0.76014283909933289, "y1998": 0.7672268310234307, "y2006": 0.76842416021983684, "y2007": 0.77487117798086069, "y2004": 0.76533287692895391, "y2005": 0.78205934309410463, "y2002": 0.76156903267949927, "y2003": 0.76651951668098528, "y2000": 0.74480073263159763, "y2001": 0.76098396210261965, "id": 21, "y2008": 0.77768682781054099, "y2009": 0.78801192267396702}, {"neighbors": [21, 34, 5, 7, 29], "y1995": 0.98391336093764348, "y1997": 0.98295341320156315, "y1996": 0.98075815675295552, "y1999": 0.96913802803963667, "y1998": 0.97386015032669815, "y2006": 0.93965462091114671, "y2007": 0.93069644684632924, "y2004": 0.9635616201227476, "y2005": 0.94745351657235244, "y2002": 0.97209860866113018, "y2003": 0.97441312580606143, "y2000": 0.97370819354423843, "y2001": 0.96419154157867693, "id": 22, "y2008": 0.94020973488297466, "y2009": 0.94358232339833159}, {"neighbors": [21, 10, 22, 34, 7], "y1995": 0.83561828119099946, "y1997": 0.81738501913392403, "y1996": 0.82298088022609361, "y1999": 0.80904800725677739, "y1998": 0.81748588141426259, "y2006": 0.87170334233473346, "y2007": 0.8786379876833581, "y2004": 0.85954307066870839, "y2005": 0.86790023653402792, "y2002": 
0.83451612857812574, "y2003": 0.85175031934895873, "y2000": 0.80071489233375537, "y2001": 0.83358255807316928, "id": 23, "y2008": 0.87497981001981484, "y2009": 0.87888675419592222}, {"neighbors": [27, 20, 30, 32, 47], "y1995": 0.98845573274970278, "y1997": 0.99665282989553183, "y1996": 1.0209242772035507, "y1999": 0.99386618594343845, "y1998": 0.99141823200404444, "y2006": 0.97906748937234156, "y2007": 0.9932312332800689, "y2004": 1.0111665058188304, "y2005": 0.9998802359352077, "y2002": 0.99669586934394627, "y2003": 1.0255909749831356, "y2000": 0.98733194819247994, "y2001": 0.99644997431653437, "id": 24, "y2008": 1.0020493856497013, "y2009": 0.99602148231561483}, {"neighbors": [20, 33, 6, 30, 12], "y1995": 1.1493091345649815, "y1997": 1.143009615936718, "y1996": 1.1524194939429724, "y1999": 1.1398468268822266, "y1998": 1.1426554202510555, "y2006": 1.0889107875354573, "y2007": 1.0860369499254896, "y2004": 1.0856975145267398, "y2005": 1.1244348633192611, "y2002": 1.0423089214343333, "y2003": 1.0557727834721793, "y2000": 1.0831239730629278, "y2001": 1.0519262599166714, "id": 25, "y2008": 1.0599731384290745, "y2009": 1.0216094265950888}, {"neighbors": [28, 19, 16, 32, 17], "y1995": 1.1136826889802023, "y1997": 1.1189343096757198, "y1996": 1.1057147027213501, "y1999": 1.1432271991365353, "y1998": 1.1377866945457653, "y2006": 1.1268023587150906, "y2007": 1.1235793669317915, "y2004": 1.1482023546040769, "y2005": 1.1238659840114973, "y2002": 1.1600919581655105, "y2003": 1.1446778932605579, "y2000": 1.1825702862895446, "y2001": 1.1622624279436105, "id": 26, "y2008": 1.115925801617498, "y2009": 1.1257082797404696}, {"neighbors": [32, 24, 36, 16, 28], "y1995": 1.303794309231981, "y1997": 1.3120636604057812, "y1996": 1.3075218596998686, "y1999": 1.3062566740535762, "y1998": 1.3153226688859194, "y2006": 1.2865667454509278, "y2007": 1.2973409698906584, "y2004": 1.2683078569016086, "y2005": 1.2617743046198988, "y2002": 1.2920319347677043, "y2003": 1.2718351646774422, "y2000": 
1.3121023910310281, "y2001": 1.2998915587009874, "id": 27, "y2008": 1.2939020510829768, "y2009": 1.2934544564717687}, {"neighbors": [26, 16, 19, 32, 27], "y1995": 0.83953719020532513, "y1997": 0.82006005316292385, "y1996": 0.82701447583159737, "y1999": 0.80294863992835086, "y1998": 0.8118887636743225, "y2006": 0.8389109342655191, "y2007": 0.84349246817602375, "y2004": 0.83108634437662732, "y2005": 0.84373783646216949, "y2002": 0.82596790474192727, "y2003": 0.82435704751379402, "y2000": 0.78772975118465016, "y2001": 0.82848010958278628, "id": 28, "y2008": 0.85637272428125033, "y2009": 0.86539395164519117}, {"neighbors": [5, 39, 22, 14, 31], "y1995": 1.2345008725695852, "y1997": 1.2353793515744536, "y1996": 1.2426021999018138, "y1999": 1.2452262575926329, "y1998": 1.2358129278404693, "y2006": 1.2365329681906834, "y2007": 1.2796200872578414, "y2004": 1.1967443443492951, "y2005": 1.2153657295128597, "y2002": 1.1937780418204111, "y2003": 1.1835533748469893, "y2000": 1.2256766974812463, "y2001": 1.2112664802237314, "id": 29, "y2008": 1.2796839248335934, "y2009": 1.2590773758694083}, {"neighbors": [37, 20, 24, 25, 27], "y1995": 0.97696620404861145, "y1997": 0.98035944080980575, "y1996": 0.9740071914763756, "y1999": 0.95543282313901556, "y1998": 0.97581530789338955, "y2006": 0.92100464312607799, "y2007": 0.9147530387633086, "y2004": 0.9298883479571457, "y2005": 0.93442917452618346, "y2002": 0.93679072759857129, "y2003": 0.92540049332494034, "y2000": 0.96480308308405971, "y2001": 0.9468637634838194, "id": 30, "y2008": 0.90249622070947177, "y2009": 0.90213630440783921}, {"neighbors": [35, 14, 33, 12, 4], "y1995": 0.84986885942491119, "y1997": 0.84295996568390696, "y1996": 0.89868510090623221, "y1999": 0.85659367787716301, "y1998": 0.87280533962476625, "y2006": 0.92562487931452408, "y2007": 0.96635366357254426, "y2004": 0.92698332540482575, "y2005": 0.94745351657235244, "y2002": 0.90448992922937876, "y2003": 0.95495898185605821, "y2000": 0.88937573313051443, "y2001": 
0.89440100450887505, "id": 31, "y2008": 1.025203118044723, "y2009": 1.0394296020754366}, {"neighbors": [36, 27, 28, 16, 26], "y1995": 1.0192280751235561, "y1997": 1.0097442843101825, "y1996": 1.0025820319237864, "y1999": 0.99765073314119712, "y1998": 1.0030341681355639, "y2006": 0.94779637858468868, "y2007": 0.93759089358493275, "y2004": 0.97583768316642261, "y2005": 0.96101679691008712, "y2002": 0.99747298060178258, "y2003": 0.99550758543481688, "y2000": 1.0075901875261932, "y2001": 0.99192968437874551, "id": 32, "y2008": 0.93353431146829191, "y2009": 0.94121705123804411}, {"neighbors": [44, 25, 12, 35, 31], "y1995": 0.86367410708901315, "y1997": 0.85544345781923936, "y1996": 0.85558931627900803, "y1999": 0.84336613427334628, "y1998": 0.85103025143102673, "y2006": 0.89455097373003656, "y2007": 0.88283929116469462, "y2004": 0.85951183386707053, "y2005": 0.87194227372077004, "y2002": 0.84667960913556228, "y2003": 0.84374557883664714, "y2000": 0.83434853662160158, "y2001": 0.85813595114434105, "id": 33, "y2008": 0.90349490610221961, "y2009": 0.9060067497610369}, {"neighbors": [22, 39, 21, 29, 23], "y1995": 1.0094753356447226, "y1997": 1.0069881886439402, "y1996": 1.0041105523637666, "y1999": 0.99291086334982948, "y1998": 0.99513686502304577, "y2006": 0.96382634438484593, "y2007": 0.95011400973122428, "y2004": 0.975119236728752, "y2005": 0.96134614808826613, "y2002": 0.99291167539274383, "y2003": 0.98983209318633369, "y2000": 1.0058162611397035, "y2001": 0.98850522230466298, "id": 34, "y2008": 0.94346860300667812, "y2009": 0.9463776450423077}, {"neighbors": [31, 38, 44, 33, 14], "y1995": 1.0571257066143651, "y1997": 1.0575301194645879, "y1996": 1.0545941857842291, "y1999": 1.0510385688532684, "y1998": 1.0488078570498685, "y2006": 1.0247627521629479, "y2007": 1.0234752320591773, "y2004": 1.0329697933620496, "y2005": 1.0219168238570018, "y2002": 1.0420048344203974, "y2003": 1.0402553971511816, "y2000": 1.0480002306104303, "y2001": 1.030249414987729, "id": 35, "y2008": 
1.0251768368501768, "y2009": 1.0435957064486703}, {"neighbors": [32, 43, 27, 28, 42], "y1995": 1.070841888164505, "y1997": 1.0793762307014196, "y1996": 1.0666949726007404, "y1999": 1.0794043012481198, "y1998": 1.0738798776109699, "y2006": 1.087727556316465, "y2007": 1.0885954360198933, "y2004": 1.1032213602455734, "y2005": 1.0916793915985508, "y2002": 1.0938347765734742, "y2003": 1.1052447043433509, "y2000": 1.0531800956589803, "y2001": 1.0745277096056161, "id": 36, "y2008": 1.0917733838297285, "y2009": 1.1096083021948762}, {"neighbors": [30, 40, 20, 42, 41], "y1995": 0.8671922185905101, "y1997": 0.86675155621455668, "y1996": 0.86628895935887062, "y1999": 0.86511809486628932, "y1998": 0.86425631732335095, "y2006": 0.84488343470424199, "y2007": 0.83374328958471722, "y2004": 0.84517414191529749, "y2005": 0.84843857600526962, "y2002": 0.85411284725399572, "y2003": 0.84886336375435456, "y2000": 0.86287327291635718, "y2001": 0.8516979624450659, "id": 37, "y2008": 0.82812044014430564, "y2009": 0.82878598934619596}, {"neighbors": [35, 31, 45, 39, 44], "y1995": 0.8838921149583755, "y1997": 0.90282398478743275, "y1996": 0.92288667453925455, "y1999": 0.92023285988219217, "y1998": 0.91229185518735723, "y2006": 0.93869676706720051, "y2007": 0.96947770975097391, "y2004": 0.99223700402629367, "y2005": 0.97984969609868555, "y2002": 0.93682451504456421, "y2003": 0.98655146182882891, "y2000": 0.92652175166361039, "y2001": 0.94278865361566122, "id": 38, "y2008": 1.0036262573224608, "y2009": 0.98102350657197357}, {"neighbors": [29, 34, 38, 22, 35], "y1995": 0.970820642185237, "y1997": 0.94534081352108112, "y1996": 0.95320232993219844, "y1999": 0.93967000034446724, "y1998": 0.94215592860799646, "y2006": 0.91035556215514757, "y2007": 0.90430364292511256, "y2004": 0.92879505989982103, "y2005": 0.9211054223180335, "y2002": 0.93412151936513388, "y2003": 0.93501274320242933, "y2000": 0.93092108910210503, "y2001": 0.92662519262599163, "id": 39, "y2008": 0.89994694483851023, "y2009": 
0.9007386435858511}, {"neighbors": [41, 37, 42, 30, 45], "y1995": 0.95861858457245008, "y1997": 0.98254810501535106, "y1996": 0.95774543235102894, "y1999": 0.98684823919808018, "y1998": 0.98919471947721893, "y2006": 0.97163003599581876, "y2007": 0.97007020126757271, "y2004": 0.9493488753775261, "y2005": 0.97152609359561659, "y2002": 0.95601578436851964, "y2003": 0.94905384541254967, "y2000": 0.98882204635713133, "y2001": 0.97662233890759653, "id": 40, "y2008": 0.97158948117089283, "y2009": 0.95884908006927827}, {"neighbors": [40, 45, 44, 37, 42], "y1995": 0.83980438854721107, "y1997": 0.85746999875029983, "y1996": 0.84726737166133714, "y1999": 0.85567509846023126, "y1998": 0.85467221160427542, "y2006": 0.8333891885768886, "y2007": 0.83511679264592342, "y2004": 0.81743586206088703, "y2005": 0.83550405700769481, "y2002": 0.84502402428191115, "y2003": 0.82645665158259707, "y2000": 0.84818516243622177, "y2001": 0.85265681182580899, "id": 41, "y2008": 0.82136617314598481, "y2009": 0.80921873783836296}, {"neighbors": [43, 40, 46, 37, 36], "y1995": 0.95118156405662746, "y1997": 0.94688098462868708, "y1996": 0.9466212002600608, "y1999": 0.95124410099780687, "y1998": 0.95085829660091703, "y2006": 0.96895367966714574, "y2007": 0.9700163384024274, "y2004": 0.97583768316642261, "y2005": 0.95571723704302525, "y2002": 0.96804411514198463, "y2003": 0.97136213864358201, "y2000": 0.95440787445922959, "y2001": 0.96364362764682376, "id": 42, "y2008": 0.97082732652905901, "y2009": 0.9878236640328002}, {"neighbors": [36, 42, 32, 27, 46], "y1995": 1.0891004415267045, "y1997": 1.0849289528525252, "y1996": 1.0824896838138709, "y1999": 1.0945424900391545, "y1998": 1.0865692335830259, "y2006": 1.1450297539219478, "y2007": 1.1447474729339102, "y2004": 1.1334273474293739, "y2005": 1.1468606844516303, "y2002": 1.1229257675733433, "y2003": 1.1302103089739621, "y2000": 1.1055818811158884, "y2001": 1.1214085953998059, "id": 43, "y2008": 1.1408403740471014, "y2009": 1.1614292649793569}, 
{"neighbors": [33, 41, 45, 35, 40], "y1995": 1.0633603345917013, "y1997": 1.0869149629649646, "y1996": 1.0736582323828732, "y1999": 1.1166986255755473, "y1998": 1.0976484597942771, "y2006": 1.0839806574563229, "y2007": 1.0983176831786272, "y2004": 1.0927882684985315, "y2005": 1.0700320368873319, "y2002": 1.0881584856466706, "y2003": 1.0804431312806149, "y2000": 1.1185670222649935, "y2001": 1.0976428286056732, "id": 44, "y2008": 1.0929823187788443, "y2009": 1.0917612486217978}, {"neighbors": [41, 44, 40, 35, 33], "y1995": 0.79772064970019041, "y1997": 0.7858115114280021, "y1996": 0.78829195801876151, "y1999": 0.77035744221561353, "y1998": 0.77615921755360906, "y2006": 0.79949806580432425, "y2007": 0.80172181625581262, "y2004": 0.79603865293896003, "y2005": 0.78966436120841943, "y2002": 0.81437881076636964, "y2003": 0.80788827809912023, "y2000": 0.77751193519846906, "y2001": 0.79902973574567659, "id": 45, "y2008": 0.82168154748053679, "y2009": 0.85587910681858015}, {"neighbors": [42, 43, 40, 36, 37], "y1995": 1.0052446952315301, "y1997": 1.0047589936197736, "y1996": 1.0000769567582628, "y1999": 1.0063956091903872, "y1998": 1.0061394183885444, "y2006": 0.97292595590233411, "y2007": 0.96519561197191939, "y2004": 0.99030032232474696, "y2005": 0.97682565346267858, "y2002": 1.0081498135355325, "y2003": 1.0057431552702318, "y2000": 1.0016297948675874, "y2001": 0.99860738542320637, "id": 46, "y2008": 0.9617340332161447, "y2009": 0.95890283625473927}, {"neighbors": [20, 6, 24, 25, 30], "y1995": 0.95808418788867844, "y1997": 0.9654440995572009, "y1996": 0.93825679674127938, "y1999": 0.96987289157318213, "y1998": 0.95561201303757848, "y2006": 1.1704973973021624, "y2007": 1.1702515395802287, "y2004": 1.0533361880299275, "y2005": 1.0983262971945267, "y2002": 1.0078119390756035, "y2003": 1.0348423554112989, "y2000": 0.96608031008233231, "y2001": 0.99727184521431422, "id": 47, "y2008": 1.1873055260044207, "y2009": 1.1424264534188653}] diff --git 
a/release/python/0.7.0/crankshaft/test/helper.py b/release/python/0.7.0/crankshaft/test/helper.py new file mode 100644 index 0000000..7d28b94 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/helper.py @@ -0,0 +1,13 @@ +import unittest + +from mock_plpy import MockPlPy +plpy = MockPlPy() + +import sys +sys.modules['plpy'] = plpy + +import os + +def fixture_file(name): + dir = os.path.dirname(os.path.realpath(__file__)) + return os.path.join(dir, 'fixtures', name) diff --git a/release/python/0.7.0/crankshaft/test/mock_plpy.py b/release/python/0.7.0/crankshaft/test/mock_plpy.py new file mode 100644 index 0000000..9c3340c --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/mock_plpy.py @@ -0,0 +1,57 @@ +import re + + +class MockCursor: + def __init__(self, data): + self.cursor_pos = 0 + self.data = data + + def fetch(self, batch_size): + batch = self.data[self.cursor_pos:self.cursor_pos + batch_size] + self.cursor_pos += batch_size + return batch + + +class MockPlPy: + def __init__(self): + self._reset() + + def _reset(self): + self.infos = [] + self.notices = [] + self.debugs = [] + self.logs = [] + self.warnings = [] + self.errors = [] + self.fatals = [] + self.executes = [] + self.results = [] + self.prepares = [] + self.results = [] + + def _define_result(self, query, result): + pattern = re.compile(query, re.IGNORECASE | re.MULTILINE) + self.results.append([pattern, result]) + + def notice(self, msg): + self.notices.append(msg) + + def debug(self, msg): + self.notices.append(msg) + + def info(self, msg): + self.infos.append(msg) + + def error(self, msg): + self.notices.append(msg) + + def cursor(self, query): + data = self.execute(query) + return MockCursor(data) + + # TODO: additional arguments + def execute(self, query): + for result in self.results: + if result[0].match(query): + return result[1] + return [] diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_getis.py b/release/python/0.7.0/crankshaft/test/test_clustering_getis.py new 
file mode 100644 index 0000000..61add11 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_getis.py @@ -0,0 +1,78 @@ +import unittest +import numpy as np + +from helper import fixture_file + +from crankshaft.clustering import Getis +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json +from crankshaft.analysis_data_provider import AnalysisDataProvider + +# Fixture files produced as follows +# +# import pysal as ps +# import numpy as np +# import random +# +# # setup variables +# f = ps.open(ps.examples.get_path("stl_hom.dbf")) +# y = np.array(f.by_col['HR8893']) +# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp")) +# +# out_queen = [{"id": index + 1, +# "neighbors": [x+1 for x in w_queen.neighbors[index]], +# "value": val} for index, val in enumerate(y)] +# +# with open('neighbors_queen_getis.json', 'w') as f: +# f.write(str(out_queen)) +# +# random.seed(1234) +# np.random.seed(1234) +# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True, +# permutations=999) +# +# with open('getis_queen.json', 'w') as f: +# f.write(str(zip(lgstar_queen.z_sim, +# lgstar_queen.p_sim, lgstar_queen.p_z_sim))) + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mock_data): + self.mock_result = mock_data + + def get_getis(self, w_type, param): + return self.mock_result + + +class GetisTest(unittest.TestCase): + """Testing class for Getis-Ord's G* funtion + This test replicates the work done in PySAL documentation: + https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g + """ + + def setUp(self): + # load raw data for analysis + self.neighbors_data = json.loads( + open(fixture_file('neighbors_getis.json')).read()) + + # load pre-computed/known values + self.getis_data = json.loads( + open(fixture_file('getis.json')).read()) + + def test_getis_ord(self): + """Test Getis-Ord's G*""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'neighbors': 
d['neighbors']} for d in self.neighbors_data] + + random_seeds.set_random_seeds(1234) + getis = Getis(FakeDataProvider(data)) + + result = getis.getis_ord('subquery', 'value', + 'queen', None, 999, 'the_geom', + 'cartodb_id') + result = [(row[0], row[1]) for row in result] + expected = np.array(self.getis_data)[:, 0:2] + for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected): + self.assertAlmostEqual(res_z, exp_z, delta=1e-2) diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py b/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py new file mode 100644 index 0000000..c118d34 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py @@ -0,0 +1,87 @@ +import unittest +import numpy as np + + +from helper import fixture_file +from crankshaft.clustering import Kmeans +from crankshaft.analysis_data_provider import AnalysisDataProvider +import crankshaft.clustering as cc +from crankshaft import random_seeds + +import json +from collections import OrderedDict + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mocked_result): + self.mocked_result = mocked_result + + def get_spatial_kmeans(self, query): + return self.mocked_result + + def get_nonspatial_kmeans(self, query): + return self.mocked_result + + +class KMeansTest(unittest.TestCase): + """Testing class for k-means spatial""" + + def setUp(self): + self.cluster_data = json.loads( + open(fixture_file('kmeans.json')).read()) + self.params = {"subquery": "select * from table", + "no_clusters": "10"} + + def test_kmeans(self): + """ + """ + data = [{'xs': d['xs'], + 'ys': d['ys'], + 'ids': d['ids']} for d in self.cluster_data] + + random_seeds.set_random_seeds(1234) + kmeans = Kmeans(FakeDataProvider(data)) + clusters = kmeans.spatial('subquery', 2) + labels = [a[1] for a in clusters] + c1 = [a for a in clusters if a[1] == 0] + c2 = [a for a in clusters if a[1] == 1] + + self.assertEqual(len(np.unique(labels)), 2) + 
self.assertEqual(len(c1), 20) + self.assertEqual(len(c2), 20) + + +class KMeansNonspatialTest(unittest.TestCase): + """Testing class for k-means non-spatial""" + + def setUp(self): + self.params = {"subquery": "SELECT * FROM TABLE", + "n_clusters": 5} + + def test_kmeans_nonspatial(self): + """ + test for k-means non-spatial + """ + # data from: + # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans + data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]), + ("arr_col2", [2, 4, 0, 2, 4, 0]), + ("rowid", [1, 2, 3, 4, 5, 6])])] + + random_seeds.set_random_seeds(1234) + kmeans = Kmeans(FakeDataProvider(data_raw)) + clusters = kmeans.nonspatial('subquery', ['col1', 'col2'], 2) + + cl1 = clusters[0][0] + cl2 = clusters[3][0] + + for idx, val in enumerate(clusters): + if idx < 3: + self.assertEqual(val[0], cl1) + else: + self.assertEqual(val[0], cl2) + + # raises exception for no data + with self.assertRaises(Exception): + kmeans = Kmeans(FakeDataProvider([])) + kmeans.nonspatial('subquery', ['col1', 'col2'], 2) diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_moran.py b/release/python/0.7.0/crankshaft/test/test_clustering_moran.py new file mode 100644 index 0000000..cc1930e --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_moran.py @@ -0,0 +1,112 @@ +import unittest +import numpy as np + +from helper import fixture_file +from crankshaft.clustering import Moran +from crankshaft.analysis_data_provider import AnalysisDataProvider +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json +from collections import OrderedDict + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mock_data): + self.mock_result = mock_data + + def get_moran(self, w_type, params): + return self.mock_result + + +class MoranTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + 
"attr1": "andy", + "attr2": "jay_z", + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.params_markov = {"id_col": "cartodb_id", + "time_cols": ["_2013_dec", "_2014_jan", + "_2014_feb"], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads( + open(fixture_file('neighbors.json')).read()) + self.moran_data = json.loads( + open(fixture_file('moran.json')).read()) + + def test_map_quads(self): + """Test map_quads""" + from crankshaft.clustering import map_quads + self.assertEqual(map_quads(1), 'HH') + self.assertEqual(map_quads(2), 'LH') + self.assertEqual(map_quads(3), 'LL') + self.assertEqual(map_quads(4), 'HL') + self.assertEqual(map_quads(33), None) + self.assertEqual(map_quads('andy'), None) + + def test_quad_position(self): + """Test lisa_sig_vals""" + from crankshaft.clustering import quad_position + + quads = np.array([1, 2, 3, 4], np.int) + + ans = np.array(['HH', 'LH', 'LL', 'HL']) + test_ans = quad_position(quads) + + self.assertTrue((test_ans == ans).all()) + + def test_local_stat(self): + """Test Moran's I local""" + data = [OrderedDict([('id', d['id']), + ('attr1', d['value']), + ('neighbors', d['neighbors'])]) + for d in self.neighbors_data] + + moran = Moran(FakeDataProvider(data)) + random_seeds.set_random_seeds(1234) + result = moran.local_stat('subquery', 'value', + 'knn', 5, 99, 'the_geom', 'cartodb_id') + result = [(row[0], row[1]) for row in result] + zipped_values = zip(result, self.moran_data) + + for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + self.assertAlmostEqual(res_val, exp_val) + self.assertEqual(res_quad, exp_quad) + + def test_moran_local_rate(self): + """Test Moran's I rate""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'attr2': 1, + 'neighbors': d['neighbors']} for d in self.neighbors_data] + + random_seeds.set_random_seeds(1234) + moran = Moran(FakeDataProvider(data)) + result = 
moran.local_rate_stat('subquery', 'numerator', 'denominator', + 'knn', 5, 99, 'the_geom', 'cartodb_id') + result = [(row[0], row[1]) for row in result] + + zipped_values = zip(result, self.moran_data) + + for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + self.assertAlmostEqual(res_val, exp_val) + + def test_moran(self): + """Test Moran's I global""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'neighbors': d['neighbors']} for d in self.neighbors_data] + random_seeds.set_random_seeds(1235) + moran = Moran(FakeDataProvider(data)) + result = moran.global_stat('table', 'value', + 'knn', 5, 99, 'the_geom', + 'cartodb_id') + + result_moran = result[0][0] + expected_moran = np.array([row[0] for row in self.moran_data]).mean() + self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2) diff --git a/release/python/0.7.0/crankshaft/test/test_pysal_utils.py b/release/python/0.7.0/crankshaft/test/test_pysal_utils.py new file mode 100644 index 0000000..be45164 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_pysal_utils.py @@ -0,0 +1,83 @@ +import unittest + +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +from collections import OrderedDict + + +class PysalUtilsTest(unittest.TestCase): + """Testing class for utility functions related to PySAL integrations""" + + def setUp(self): + self.params1 = OrderedDict([("id_col", "cartodb_id"), + ("attr1", "andy"), + ("attr2", "jay_z"), + ("subquery", "SELECT * FROM a_list"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params2 = OrderedDict([("id_col", "cartodb_id"), + ("numerator", "price"), + ("denominator", "sq_meters"), + ("subquery", "SELECT * FROM pecan"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params3 = OrderedDict([("id_col", "cartodb_id"), + ("numerator", "sq_meters"), + ("denominator", "price"), + ("subquery", "SELECT * FROM pecan"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params_array = {"id_col": 
"cartodb_id", + "time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + + def test_query_attr_select(self): + """Test query_attr_select""" + + ans1 = ("i.\"andy\"::numeric As attr1, " + "i.\"jay_z\"::numeric As attr2, ") + + ans2 = ("i.\"price\"::numeric As attr1, " + "i.\"sq_meters\"::numeric As attr2, ") + + ans3 = ("i.\"sq_meters\"::numeric As attr1, " + "i.\"price\"::numeric As attr2, ") + + ans_array = ("i.\"_2013_dec\"::numeric As attr1, " + "i.\"_2014_jan\"::numeric As attr2, " + "i.\"_2014_feb\"::numeric As attr3, ") + + self.assertEqual(pu.query_attr_select(self.params1), ans1) + self.assertEqual(pu.query_attr_select(self.params2), ans2) + self.assertEqual(pu.query_attr_select(self.params3), ans3) + self.assertEqual(pu.query_attr_select(self.params_array), ans_array) + + def test_query_attr_where(self): + """Test pu.query_attr_where""" + + ans1 = ("idx_replace.\"andy\" IS NOT NULL AND " + "idx_replace.\"jay_z\" IS NOT NULL") + + ans_array = ("idx_replace.\"_2013_dec\" IS NOT NULL AND " + "idx_replace.\"_2014_jan\" IS NOT NULL AND " + "idx_replace.\"_2014_feb\" IS NOT NULL") + + self.assertEqual(pu.query_attr_where(self.params1), ans1) + self.assertEqual(pu.query_attr_where(self.params_array), ans_array) + + def test_get_attributes(self): + """Test get_attributes""" + + # need to add tests + + self.assertEqual(True, True) + + def test_get_weight(self): + """Test get_weight""" + + self.assertEqual(True, True) diff --git a/release/python/0.7.0/crankshaft/test/test_regression_gwr.py b/release/python/0.7.0/crankshaft/test/test_regression_gwr.py new file mode 100644 index 0000000..57cd952 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_regression_gwr.py @@ -0,0 +1,130 @@ +import unittest +import json +import numpy as np + + +from crankshaft import random_seeds +from helper import fixture_file +from crankshaft.regression import GWR +from 
crankshaft.analysis_data_provider import AnalysisDataProvider + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mocked_result): + self.mocked_result = mocked_result + + def get_gwr(self, params): + return self.mocked_result + + def get_gwr_predict(self, params): + return self.mocked_result + + +class GWRTest(unittest.TestCase): + """Testing class for geographically weighted regression (gwr)""" + + def setUp(self): + """ + fixture packed from canonical GWR georgia dataset using the + following query: + SELECT array_agg(x) As x, + array_agg(y) As y, + array_agg(pctbach) As dep_var, + array_agg(pctrural) As attr1, + array_agg(pctpov) As attr2, + array_agg(pctblack) As attr3, + array_agg(areakey) As rowid + FROM g_utm + WHERE pctbach is not NULL AND + pctrural IS NOT NULL AND + pctpov IS NOT NULL AND + pctblack IS NOT NULL + """ + import copy + # data packed from https://github.com/TaylorOshan/pysal/blob/1d6af33bda46b1d623f70912c56155064463383f/pysal/examples/georgia/GData_utm.csv + self.data = json.loads( + open(fixture_file('gwr_packed_data.json')).read()) + + # data packed from https://github.com/TaylorOshan/pysal/blob/a44c5541e2e0d10a99ff05edc1b7f81b70f5a82f/pysal/examples/georgia/georgia_BS_NN_listwise.csv + self.knowns = json.loads( + open(fixture_file('gwr_packed_knowns.json')).read()) + + # data for GWR prediction + self.data_predict = copy.deepcopy(self.data) + self.ids_of_unknowns = [13083, 13009, 13281, 13115, 13247, 13169] + self.idx_ids_of_unknowns = [self.data_predict[0]['rowid'].index(idx) + for idx in self.ids_of_unknowns] + + for idx in self.idx_ids_of_unknowns: + self.data_predict[0]['dep_var'][idx] = None + + self.predicted_knowns = {13009: 10.879, + 13083: 4.5259, + 13115: 9.4022, + 13169: 6.0793, + 13247: 8.1608, + 13281: 13.886} + + # params, with ind_vars in same ordering as query above + self.params = {'subquery': 'select * from table', + 'dep_var': 'pctbach', + 'ind_vars': ['pctrural', 'pctpov', 'pctblack'], + 'bw': 
90.000, + 'fixed': False, + 'geom_col': 'the_geom', + 'id_col': 'areakey'} + + def test_gwr(self): + """ + """ + gwr = GWR(FakeDataProvider(self.data)) + gwr_resp = gwr.gwr(self.params['subquery'], + self.params['dep_var'], + self.params['ind_vars'], + bw=self.params['bw'], + fixed=self.params['fixed']) + + # unpack response + coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \ + residuals, r_squareds, bws, rowids = zip(*gwr_resp) + + # prepare for comparision + coeff_known_pctpov = self.knowns['est_pctpov'] + tval_known_pctblack = self.knowns['t_pctrural'] + pctpov_se = self.knowns['se_pctpov'] + ids = self.knowns['area_key'] + resp_idx = None + + # test pctpov coefficient estimates + for idx, val in enumerate(coeff_known_pctpov): + resp_idx = rowids.index(ids[idx]) + self.assertAlmostEquals(val, + json.loads(coeffs[resp_idx])['pctpov'], + places=4) + # test pctrural tvals + for idx, val in enumerate(tval_known_pctblack): + resp_idx = rowids.index(ids[idx]) + self.assertAlmostEquals(val, + json.loads(t_vals[resp_idx])['pctrural'], + places=4) + + def test_gwr_predict(self): + """Testing for GWR_Predict""" + gwr = GWR(FakeDataProvider(self.data_predict)) + gwr_resp = gwr.gwr_predict(self.params['subquery'], + self.params['dep_var'], + self.params['ind_vars'], + bw=self.params['bw'], + fixed=self.params['fixed']) + + # unpack response + coeffs, stand_errs, t_vals, \ + r_squareds, predicteds, rowid = zip(*gwr_resp) + threshold = 0.01 + + for i, idx in enumerate(self.idx_ids_of_unknowns): + + known_val = self.predicted_knowns[rowid[i]] + predicted_val = predicteds[i] + test_val = abs(known_val - predicted_val) / known_val + self.assertTrue(test_val < threshold) diff --git a/release/python/0.7.0/crankshaft/test/test_segmentation.py b/release/python/0.7.0/crankshaft/test/test_segmentation.py new file mode 100644 index 0000000..d02e8b1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_segmentation.py @@ -0,0 +1,64 @@ +import unittest +import numpy as 
np +from helper import plpy, fixture_file +import crankshaft.segmentation as segmentation +import json + +class SegmentationTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + plpy._reset() + + def generate_random_data(self,n_samples,random_state, row_type=False): + x1 = random_state.uniform(size=n_samples) + x2 = random_state.uniform(size=n_samples) + x3 = random_state.randint(0, 4, size=n_samples) + + y = x1+x2*x2+x3 + cartodb_id = range(len(x1)) + + if row_type: + return [ {'features': vals} for vals in zip(x1,x2,x3)], y + else: + return [dict( zip(['x1','x2','x3','target', 'cartodb_id'],[x1,x2,x3,y,cartodb_id]))] + + def test_replace_nan_with_mean(self): + test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) + + def test_create_and_predict_segment(self): + n_samples = 1000 + + random_state_train = np.random.RandomState(13) + random_state_test = np.random.RandomState(134) + training_data = self.generate_random_data(n_samples, random_state_train) + test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True) + + + ids = [{'cartodb_ids': range(len(test_data))}] + rows = [{'x1': 0,'x2':0,'x3':0,'y':0,'cartodb_id':0}] + + plpy._define_result('select \* from \(select \* from training\) a limit 1',rows) + plpy._define_result('.*from \(select \* from training\) as a' ,training_data) + plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a',ids) + plpy._define_result('.*select \* from test.*' ,test_data) + + model_parameters = {'n_estimators': 1200, + 'max_depth': 3, + 'subsample' : 0.5, + 'learning_rate': 0.01, + 'min_samples_leaf': 1} + + result = segmentation.create_and_predict_segment( + 'select * from training', + 'target', + 'select * from test', + model_parameters) + + prediction = [r[1] for r in result] + + accuracy =np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) + + self.assertEqual(len(result),len(test_data)) + 
self.assertTrue( result[0][2] < 0.01) + self.assertTrue( accuracy < 0.5*np.mean(test_y) ) diff --git a/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py b/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py new file mode 100644 index 0000000..d14563e --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py @@ -0,0 +1,349 @@ +import unittest +import numpy as np + +import unittest + + +from helper import fixture_file + +from crankshaft.space_time_dynamics import Markov +import crankshaft.space_time_dynamics as std +from crankshaft import random_seeds +from crankshaft.analysis_data_provider import AnalysisDataProvider +import json + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, data): + self.mock_result = data + + def get_markov(self, w_type, params): + return self.mock_result + + +class SpaceTimeTests(unittest.TestCase): + """Testing class for Markov Functions.""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + "time_cols": ['dec_2013', 'jan_2014', 'feb_2014'], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads( + open(fixture_file('neighbors_markov.json')).read()) + self.markov_data = json.loads(open(fixture_file('markov.json')).read()) + + self.time_data = np.array([i * np.ones(10, dtype=float) + for i in range(10)]).T + + self.transition_matrix = np.array([ + [[0.96341463, 0.0304878, 0.00609756, 0., 0.], + [0.06040268, 0.83221477, 0.10738255, 0., 0.], + [0., 0.14, 0.74, 0.12, 0.], + [0., 0.03571429, 0.32142857, 0.57142857, 0.07142857], + [0., 0., 0., 0.16666667, 0.83333333]], + [[0.79831933, 0.16806723, 0.03361345, 0., 0.], + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.], + [0., 0., 0.06372549, 0.90196078, 0.03431373], + [0., 0., 0., 0.19444444, 0.80555556]], + [[0.84693878, 0.15306122, 0., 0., 0.], + [0.08133971, 0.78947368, 0.1291866, 0., 0.], + 
[0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0., 0., 0., 0.10204082, 0.89795918]], + [[0.8852459, 0.09836066, 0., 0.01639344, 0.], + [0.03875969, 0.81395349, 0.13953488, 0., 0.00775194], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0.02339181, 0.12865497, 0.75438596, 0.09356725], + [0., 0., 0., 0.09661836, 0.90338164]], + [[0.33333333, 0.66666667, 0., 0., 0.], + [0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.], + [0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.], + [0., 0.01036269, 0.06217617, 0.89637306, 0.03108808], + [0., 0., 0., 0.02352941, 0.97647059]]] + ) + + def test_spatial_markov(self): + """Test Spatial Markov.""" + data = [{'id': d['id'], + 'attr1': d['y1995'], + 'attr2': d['y1996'], + 'attr3': d['y1997'], + 'attr4': d['y1998'], + 'attr5': d['y1999'], + 'attr6': d['y2000'], + 'attr7': d['y2001'], + 'attr8': d['y2002'], + 'attr9': d['y2003'], + 'attr10': d['y2004'], + 'attr11': d['y2005'], + 'attr12': d['y2006'], + 'attr13': d['y2007'], + 'attr14': d['y2008'], + 'attr15': d['y2009'], + 'neighbors': d['neighbors']} for d in self.neighbors_data] + # print(str(data[0])) + markov = Markov(FakeDataProvider(data)) + random_seeds.set_random_seeds(1234) + + result = markov.spatial_trend('subquery', + ['y1995', 'y1996', 'y1997', 'y1998', + 'y1999', 'y2000', 'y2001', 'y2002', + 'y2003', 'y2004', 'y2005', 'y2006', + 'y2007', 'y2008', 'y2009'], + 5, 'knn', 5, 0, 'the_geom', + 'cartodb_id') + + self.assertTrue(result is not None) + result = [(row[0], row[1], row[2], row[3], row[4]) for row in result] + print result[0] + expected = self.markov_data + for ([res_trend, res_up, res_down, res_vol, res_id], + [exp_trend, exp_up, exp_down, exp_vol, exp_id] + ) in zip(result, expected): + self.assertAlmostEqual(res_trend, exp_trend) + + def test_get_time_data(self): + """Test get_time_data""" + data = [{'attr1': d['y1995'], + 'attr2': d['y1996'], + 'attr3': d['y1997'], + 'attr4': 
d['y1998'], + 'attr5': d['y1999'], + 'attr6': d['y2000'], + 'attr7': d['y2001'], + 'attr8': d['y2002'], + 'attr9': d['y2003'], + 'attr10': d['y2004'], + 'attr11': d['y2005'], + 'attr12': d['y2006'], + 'attr13': d['y2007'], + 'attr14': d['y2008'], + 'attr15': d['y2009']} for d in self.neighbors_data] + + result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998', + 'y1999', 'y2000', 'y2001', 'y2002', + 'y2003', 'y2004', 'y2005', 'y2006', + 'y2007', 'y2008', 'y2009']) + + # expected was prepared from PySAL example: + # f = ps.open(ps.examples.get_path("usjoin.csv")) + # pci = np.array([f.by_col[str(y)] + # for y in range(1995, 2010)]).transpose() + # rpci = pci / (pci.mean(axis = 0)) + + expected = np.array( + [[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154, + 0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612, + 0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356], + [0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388, + 0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176, + 0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858], + [0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522, + 0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196, + 0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177], + [1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841, + 1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806, + 1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775], + [1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025, + 1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964, + 1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337], + [1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684, + 1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372, + 1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431], + [1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149, + 1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239, + 1.11671008, 1.10880034, 
1.08401709, 1.06959206, 1.07875225], + [1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545, + 0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905, + 1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746], + [0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845, + 0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787, + 0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858], + [0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044, + 0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998, + 0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146], + [1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624, + 1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989, + 1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097], + [0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687, + 0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989, + 0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385], + [0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647, + 0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424, + 0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441], + [0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897, + 0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471, + 0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704], + [0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012, + 0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883, + 0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404], + [0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136, + 0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334, + 0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184], + [0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238, + 0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265, + 0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657], + [1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723, + 1.20172869, 1.21328691, 1.22624778, 
1.22397075, 1.23857042, + 1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398], + [1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667, + 1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841, + 1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462], + [1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093, + 1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416, + 0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102], + [1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264, + 1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944, + 1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155], + [0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284, + 0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288, + 0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192], + [0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803, + 0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162, + 0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232], + [0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801, + 0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307, + 0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675], + [0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619, + 0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651, + 0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148], + [1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683, + 1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751, + 1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943], + [1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272, + 1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235, + 1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828], + [1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667, + 1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786, + 1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446], + [0.83953719, 0.82701448, 
0.82006005, 0.81188876, 0.80294864, + 0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634, + 0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395], + [1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626, + 1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434, + 1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738], + [0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282, + 0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835, + 0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363], + [0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368, + 0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333, + 0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296], + [1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073, + 1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768, + 0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705], + [0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613, + 0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183, + 0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675], + [1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086, + 1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924, + 0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765], + [1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857, + 1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979, + 1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571], + [1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043, + 1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136, + 1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083], + [0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809, + 0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414, + 0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599], + [0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286, + 0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237, + 0.9798497, 0.93869677, 0.96947771, 
1.00362626, 0.98102351], + [0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967, + 0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506, + 0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864], + [0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824, + 0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888, + 0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908], + [0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751, + 0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586, + 0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874], + [0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441, + 0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768, + 0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366], + [1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249, + 1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735, + 1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926], + [1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863, + 1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827, + 1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125], + [0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744, + 0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865, + 0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911], + [1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561, + 1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032, + 0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284], + [0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289, + 0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619, + 1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]]) + + self.assertTrue(np.allclose(result, expected)) + self.assertTrue(type(result) == type(expected)) + self.assertTrue(result.shape == expected.shape) + + def test_rebin_data(self): + """Test rebin_data""" + # sample in double the time (even case since 10 % 2 = 0): + # (0+1)/2, (2+3)/2, (4+5)/2, 
(6+7)/2, (8+9)/2 + # = 0.5, 2.5, 4.5, 6.5, 8.5 + ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float) + for i in range(0, 10, 2)]).T + + self.assertTrue( + np.array_equal(std.rebin_data(self.time_data, 2), ans_even)) + + # sample in triple the time (uneven since 10 % 3 = 1): + # (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1 + # = 1, 4, 7, 9 + ans_odd = np.array([i * np.ones(10, dtype=float) + for i in (1, 4, 7, 9)]).T + self.assertTrue( + np.array_equal(std.rebin_data(self.time_data, 3), ans_odd)) + + def test_get_prob_dist(self): + """Test get_prob_dist""" + lag_indices = np.array([1, 2, 3, 4]) + unit_indices = np.array([1, 3, 2, 4]) + answer = np.array([ + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0., 0., 0.02352941, 0.97647059] + ]) + result = std.get_prob_dist(self.transition_matrix, + lag_indices, unit_indices) + + self.assertTrue(np.array_equal(result, answer)) + + def test_get_prob_stats(self): + """Test get_prob_stats""" + + probs = np.array([ + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0., 0., 0.02352941, 0.97647059] + ]) + unit_indices = np.array([1, 3, 2, 4]) + answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.]) + answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941]) + answer_trend = np.array([-0.03301887 / 0.88207547, + -0.05882353 / 0.87058824, + 0.02475248 / 0.77722772, + -0.02352941 / 0.97647059]) + answer_volatility = np.array([0.34221495, 0.33705421, + 0.29226542, 0.38834223]) + + result = std.get_prob_stats(probs, unit_indices) + result_up = result[0] + result_down = result[1] + result_trend = result[2] + result_volatility = result[3] + + self.assertTrue(np.allclose(result_up, answer_up)) + self.assertTrue(np.allclose(result_down, answer_down)) + 
self.assertTrue(np.allclose(result_trend, answer_trend)) + self.assertTrue(np.allclose(result_volatility, answer_volatility)) diff --git a/src/pg/crankshaft.control b/src/pg/crankshaft.control index 216a89f..7d5a93a 100644 --- a/src/pg/crankshaft.control +++ b/src/pg/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.6.1' +default_version = '0.7.0' requires = 'plpythonu, postgis' superuser = true schema = cdb_crankshaft diff --git a/src/pg/sql/11_kmeans.sql b/src/pg/sql/11_kmeans.sql index 1a01e03..b0362b8 100644 --- a/src/pg/sql/11_kmeans.sql +++ b/src/pg/sql/11_kmeans.sql @@ -1,18 +1,58 @@ -- Spatial k-means clustering -CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer, no_init integer default 20) -RETURNS table (cartodb_id integer, cluster_no integer) as $$ +CREATE OR REPLACE FUNCTION CDB_KMeans( + query TEXT, + no_clusters INTEGER, + no_init INTEGER DEFAULT 20 +) +RETURNS TABLE( + cartodb_id INTEGER, + cluster_no INTEGER +) AS $$ - from crankshaft.clustering import Kmeans - kmeans = Kmeans() - return kmeans.spatial(query, no_clusters, no_init) +from crankshaft.clustering import Kmeans +kmeans = Kmeans() +return kmeans.spatial(query, no_clusters, no_init) $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +-- Non-spatial k-means clustering +-- query: sql query to retrieve all the needed data +-- colnames: text array of column names for doing the clustering analysis +-- no_clusters: number of requested clusters +-- standardize: whether to scale variables to a mean of zero and a standard +-- deviation of 1 +-- id_colname: name of the id column -CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC) -RETURNS Numeric[] AS -$$ +CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial( + query TEXT, + colnames TEXT[], + no_clusters INTEGER, + standardize BOOLEAN DEFAULT true, + id_col TEXT DEFAULT 'cartodb_id' +) +RETURNS TABLE( + cluster_label text, + 
cluster_center json, + silhouettes numeric, + inertia numeric, + rowid bigint +) AS $$ + +from crankshaft.clustering import Kmeans +kmeans = Kmeans() +return kmeans.nonspatial(query, colnames, no_clusters, + standardize=standardize, + id_col=id_col) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; + + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanS( + state NUMERIC[], + the_geom GEOMETRY(Point, 4326), + weight NUMERIC +) +RETURNS Numeric[] AS $$ DECLARE newX NUMERIC; newY NUMERIC; @@ -32,7 +72,8 @@ BEGIN END $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; -CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[]) + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[]) RETURNS GEOMETRY AS $$ BEGIN diff --git a/src/pg/sql/13_PIA.sql b/src/pg/sql/13_PIA.sql index 71b95b2..236713d 100644 --- a/src/pg/sql/13_PIA.sql +++ b/src/pg/sql/13_PIA.sql @@ -31,7 +31,7 @@ DECLARE sqr numeric; p geometry; BEGIN - sqr := |/2; + sqr := 0.5*(|/2.0); polygon := ST_Transform(polygon, 3857); -- grid #0 cell size @@ -46,6 +46,7 @@ BEGIN SELECT array_agg(c) INTO cells FROM c1; -- 1st guess: centroid + best_c := polygon; best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon)); -- looping the loop @@ -56,6 +57,7 @@ BEGIN EXIT WHEN i > n; cell := cells[i]; + i := i+1; -- cell side size, it's square @@ -63,13 +65,14 @@ BEGIN -- check distance test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell)); + IF test_d > best_d THEN best_d := test_d; - best_c := cells[i]; + best_c := cell; END IF; -- longest distance within the cell - test_mx := test_d + (test_h/2 * sqr); + test_mx := test_d + (test_h * sqr); -- if the cell has no chance to contains the desired point, continue CONTINUE WHEN test_mx - best_d <= tolerance; @@ -94,29 +97,46 @@ END; $$ language plpgsql IMMUTABLE PARALLEL SAFE; + -- signed distance point to polygon with holes -- negative is the point is out the polygon +-- rev 1. 
adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm CREATE OR REPLACE FUNCTION _Signed_Dist( IN polygon geometry, IN point geometry ) RETURNS numeric AS $$ DECLARE + pols geometry[]; + pol geometry; i integer; + j integer; within integer; + w integer; holes integer; dist numeric; + d numeric; BEGIN dist := 1e999; - SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(polygon))::numeric) INTO dist; - SELECT CASE WHEN ST_Within(point,polygon) THEN 1 ELSE -1 END INTO within; - SELECT ST_NumInteriorRings(polygon) INTO holes; - IF holes > 0 THEN - FOR i IN 1..holes - LOOP - SELECT LEAST(dist, ST_distance(point, ST_InteriorRingN(polygon, i))::numeric) INTO dist; - END LOOP; - END IF; + WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection; + FOR j in 1..array_length(pols, 1) + LOOP + pol := pols[j]; + d := dist; + SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d; + SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w; + SELECT ST_NumInteriorRings(pol) INTO holes; + IF holes > 0 THEN + FOR i IN 1..holes + LOOP + SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d; + END LOOP; + END IF; + IF d < dist THEN + dist:= d; + within := w; + END IF; + END LOOP; dist := dist * within::numeric; RETURN dist; END; diff --git a/src/pg/test/expected/11_kmeans_test.out b/src/pg/test/expected/11_kmeans_test.out index 8c6ffa1..85b8b13 100644 --- a/src/pg/test/expected/11_kmeans_test.out +++ b/src/pg/test/expected/11_kmeans_test.out @@ -1,10 +1,43 @@ \pset format unaligned \set ECHO all -SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); +-- spatial kmeans +SELECT + count(DISTINCT cluster_no) as clusters +FROM + cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); clusters 2 (1 row) -SELECT count(*) clusters from (select cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by 
code) p; +-- weighted mean +SELECT + count(*) clusters +FROM ( + SELECT + cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), + code + FROM ppoints + GROUP BY code +) p; clusters 52 (1 row) +-- nonspatial kmeans +SELECT + cluster_label::int in (0, 1) As cluster_label, + cluster_center::json->>'col1' As cc_col1, + cluster_center::json->>'col2' As cc_col2, + silhouettes, + inertia, + rowid +FROM cdb_crankshaft.CDB_KMeansNonspatial( + 'SELECT unnest(Array[1, 1, 10, 10]) As col1, ' || + 'unnest(Array[100, 100, 2, 2]) As col2, ' || + 'unnest(Array[1, 2, 3, 4]) As cartodb_id ', + Array['col1', 'col2']::text[], + 2); +cluster_label|cc_col1|cc_col2|silhouettes|inertia|rowid +t|-1.0|1.0|1.0|0.0|1 +t|-1.0|1.0|1.0|0.0|2 +t|1.0|-1.0|1.0|0.0|3 +t|1.0|-1.0|1.0|0.0|4 +(4 rows) diff --git a/src/pg/test/expected/13_pia_test.out b/src/pg/test/expected/13_pia_test.out index 2367e20..2ccd544 100644 --- a/src/pg/test/expected/13_pia_test.out +++ b/src/pg/test/expected/13_pia_test.out @@ -2,6 +2,16 @@ SET client_min_messages TO WARNING; \set ECHO none st_astext ------------------------------------------- - POINT(-3.67484492582767 40.4395084885993) + POINT(-3.67484492582767 40.4394914243877) +(1 row) + + st_astext +------------ + POINT(0 0) +(1 row) + + st_astext +------------ + POINT(0 0) (1 row) diff --git a/src/pg/test/sql/11_kmeans_test.sql b/src/pg/test/sql/11_kmeans_test.sql index 2298b85..a44e315 100644 --- a/src/pg/test/sql/11_kmeans_test.sql +++ b/src/pg/test/sql/11_kmeans_test.sql @@ -1,6 +1,34 @@ \pset format unaligned \set ECHO all -SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); +-- spatial kmeans +SELECT + count(DISTINCT cluster_no) as clusters +FROM + cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); -SELECT count(*) clusters from (select cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by code) p; +-- weighted mean +SELECT + count(*) clusters +FROM ( + SELECT + 
cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), + code + FROM ppoints + GROUP BY code +) p; + +-- nonspatial kmeans +SELECT + cluster_label::int in (0, 1) As cluster_label, + cluster_center::json->>'col1' As cc_col1, + cluster_center::json->>'col2' As cc_col2, + silhouettes, + inertia, + rowid +FROM cdb_crankshaft.CDB_KMeansNonspatial( + 'SELECT unnest(Array[1, 1, 10, 10]) As col1, ' || + 'unnest(Array[100, 100, 2, 2]) As col2, ' || + 'unnest(Array[1, 2, 3, 4]) As cartodb_id ', + Array['col1', 'col2']::text[], + 2); diff --git a/src/pg/test/sql/13_pia_test.sql b/src/pg/test/sql/13_pia_test.sql index 8b37082..a1c11b9 100644 --- a/src/pg/test/sql/13_pia_test.sql +++ b/src/pg/test/sql/13_pia_test.sql @@ -5,3 +5,22 @@ with a as( select st_geomfromtext('POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 
4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', 3857) as g ) SELECT st_astext(cdb_crankshaft.CDB_PIA(g)) from a; + +-- square centered on 0,0 with sides of length 2 +-- expectation: point(0, 0) +WITH square AS ( + SELECT 'SRID=4326;POLYGON((-1 1, 1 1, 1 -1, -1 -1, -1 1))'::geometry as g +) +SELECT ST_AsText(cdb_crankshaft.CDB_PIA(g)) + FROM square; + +-- MultiPolygon test +-- square centered on 0,0 with sides of length 2 +-- expectation: point(0, 0) +WITH square AS ( + SELECT + ST_Multi('SRID=4326;POLYGON((-1 1, 1 1, 1 -1, -1 -1, -1 1))'::geometry) as g +) +SELECT ST_AsText(cdb_crankshaft.CDB_PIA(g)) + FROM square + diff --git 
a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 7a65e31..3d5225a 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -2,84 +2,97 @@ import plpy import pysal_utils as pu +NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows ' + 'for null values and fill in appropriately.') -class AnalysisDataProvider: + +def verify_data(func): + """decorator to verify data result before returning to algorithm""" + def wrapper(*args, **kwargs): + """Error checking""" + try: + data = func(*args, **kwargs) + if not data: + plpy.error(NULL_VALUE_ERROR) + else: + return data + except Exception as err: + plpy.error('Analysis failed: {}'.format(err)) + + return [] + + return wrapper + + +class AnalysisDataProvider(object): + @verify_data def get_getis(self, w_type, params): """fetch data for getis ord's g""" - try: - query = pu.construct_neighbor_query(w_type, params) - result = plpy.execute(query) - # if there are no neighbors, exit - if len(result) == 0: - return pu.empty_zipped_array(4) - else: - return result - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + query = pu.construct_neighbor_query(w_type, params) + return plpy.execute(query) + @verify_data def get_markov(self, w_type, params): """fetch data for spatial markov""" - try: - query = pu.construct_neighbor_query(w_type, params) - data = plpy.execute(query) - - if len(data) == 0: - return pu.empty_zipped_array(4) - - return data - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + query = pu.construct_neighbor_query(w_type, params) + return plpy.execute(query) + @verify_data def get_moran(self, w_type, params): """fetch data for moran's i analyses""" - try: - query = pu.construct_neighbor_query(w_type, params) - data = plpy.execute(query) + query = pu.construct_neighbor_query(w_type, params) + return 
plpy.execute(query) - # if there are no neighbors, exit - if len(data) == 0: - return pu.empty_zipped_array(2) - return data - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % e) - return pu.empty_zipped_array(2) + @verify_data + def get_nonspatial_kmeans(self, params): + """ + Fetch data for non-spatial k-means. - def get_nonspatial_kmeans(self, query): - """fetch data for non-spatial kmeans""" - try: - data = plpy.execute(query) - return data - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + Inputs - a dict (params) with the following keys: + colnames: a (text) list of column names (e.g., + `['andy', 'cookie']`) + id_col: the name of the id column (e.g., `'cartodb_id'`) + subquery: the subquery for exposing the data (e.g., + SELECT * FROM favorite_things) + Output: + The result of a SQL query packaging the data for consumption within + `Kmeans().nonspatial`. Format will be a list of length one, + with the first element a dict with keys ('rowid', 'arr_col1', + 'arr_col2', ...) 
+ """ + agg_cols = ', '.join([ + 'array_agg({0}) As arr_col{1}'.format(val, idx+1) + for idx, val in enumerate(params['colnames']) + ]) + query = ''' + SELECT {cols}, array_agg({id_col}) As rowid + FROM ({subquery}) As a + '''.format(subquery=params['subquery'], + id_col=params['id_col'], + cols=agg_cols).strip() + return plpy.execute(query) + @verify_data def get_spatial_kmeans(self, params): """fetch data for spatial kmeans""" - query = ("SELECT " - "array_agg({id_col} ORDER BY {id_col}) as ids," - "array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs," - "array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys " - "FROM ({subquery}) As a " - "WHERE {geom_col} IS NOT NULL").format(**params) - try: - data = plpy.execute(query) - return data - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + query = ''' + SELECT + array_agg("{id_col}" ORDER BY "{id_col}") as ids, + array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs, + array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys + FROM ({subquery}) As a + WHERE "{geom_col}" IS NOT NULL + '''.format(**params) + return plpy.execute(query) + @verify_data def get_gwr(self, params): """fetch data for gwr analysis""" query = pu.gwr_query(params) - try: - query_result = plpy.execute(query) - return query_result - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + return plpy.execute(query) + @verify_data def get_gwr_predict(self, params): """fetch data for gwr predict""" query = pu.gwr_predict_query(params) - try: - query_result = plpy.execute(query) - return query_result - except plpy.SPIError, err: - plpy.error('Analysis failed: %s' % err) + return plpy.execute(query) diff --git a/src/py/crankshaft/crankshaft/clustering/getis.py b/src/py/crankshaft/crankshaft/clustering/getis.py index bef8f50..2bee3a2 100644 --- a/src/py/crankshaft/crankshaft/clustering/getis.py +++ b/src/py/crankshaft/crankshaft/clustering/getis.py @@ -12,7 +12,7 @@ from crankshaft.analysis_data_provider import 
AnalysisDataProvider # High level interface --------------------------------------- -class Getis: +class Getis(object): def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() @@ -31,13 +31,13 @@ class Getis: # geometries with attributes that are null are ignored # resulting in a collection of not as near neighbors if kNN is chosen - qvals = OrderedDict([("id_col", id_col), - ("attr1", attr), - ("geom_col", geom_col), - ("subquery", subquery), - ("num_ngbrs", num_ngbrs)]) + params = OrderedDict([("id_col", id_col), + ("attr1", attr), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) - result = self.data_provider.get_getis(w_type, qvals) + result = self.data_provider.get_getis(w_type, params) attr_vals = pu.get_attributes(result) # build PySAL weight object diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py index 1e49115..6d22d44 100644 --- a/src/py/crankshaft/crankshaft/clustering/kmeans.py +++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py @@ -4,7 +4,7 @@ import numpy as np from crankshaft.analysis_data_provider import AnalysisDataProvider -class Kmeans: +class Kmeans(object): def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() @@ -20,13 +20,94 @@ class Kmeans: "geom_col": "the_geom", "id_col": "cartodb_id"} - data = self.data_provider.get_spatial_kmeans(params) + result = self.data_provider.get_spatial_kmeans(params) # Unpack query response - xs = data[0]['xs'] - ys = data[0]['ys'] - ids = data[0]['ids'] + xs = result[0]['xs'] + ys = result[0]['ys'] + ids = result[0]['ids'] km = KMeans(n_clusters=no_clusters, n_init=no_init) labels = km.fit_predict(zip(xs, ys)) return zip(ids, labels) + + def nonspatial(self, subquery, colnames, no_clusters=5, + standardize=True, id_col='cartodb_id'): + """ + Arguments: + subquery (string): A SQL query to retrieve the data 
required to do the + k-means clustering analysis, like so: + SELECT * FROM iris_flower_data + colnames (list): a list of the column names which contain the data + of interest, like so: ['sepal_width', + 'petal_width', + 'sepal_length', + 'petal_length'] + no_clusters (int): number of clusters (greater than zero) + id_col (string): name of the input id_column + + Returns: + A list of tuples with the following columns: + cluster labels: a label for the cluster that the row belongs to + centers: center of the cluster that this row belongs to + silhouettes: silhouette measure for this value + rowid: row that these values belong to (corresponds to the value in + `id_col`) + """ + import json + from sklearn import metrics + + params = { + "colnames": colnames, + "subquery": subquery, + "id_col": id_col + } + + data = self.data_provider.get_nonspatial_kmeans(params) + + # fill array with values for k-means clustering + if standardize: + cluster_columns = _scale_data( + _extract_columns(data)) + else: + cluster_columns = _extract_columns(data) + + kmeans = KMeans(n_clusters=no_clusters, + random_state=0).fit(cluster_columns) + + centers = [json.dumps(dict(zip(colnames, c))) + for c in kmeans.cluster_centers_[kmeans.labels_]] + + silhouettes = metrics.silhouette_samples(cluster_columns, + kmeans.labels_, + metric='sqeuclidean') + + return zip(kmeans.labels_, + centers, + silhouettes, + [kmeans.inertia_] * kmeans.labels_.shape[0], + data[0]['rowid']) + + +# -- Preprocessing steps + +def _extract_columns(data): + """ + Extract the features from the query and pack them into a NumPy array + data (list of dicts): result of the kmeans request + """ + # number of columns minus rowid column + n_cols = len(data[0]) - 1 + return np.array([data[0]['arr_col{0}'.format(i+1)] + for i in xrange(n_cols)], + dtype=float).T + + +def _scale_data(features): + """ + Scale all input columns to center on 0 with a standard deviation of 1 + features (numpy matrix): features of dimension (n_samples, 
n_features) + """ + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + return scaler.fit_transform(features) diff --git a/src/py/crankshaft/crankshaft/clustering/moran.py b/src/py/crankshaft/crankshaft/clustering/moran.py index 0e12e3f..0d5753f 100644 --- a/src/py/crankshaft/crankshaft/clustering/moran.py +++ b/src/py/crankshaft/crankshaft/clustering/moran.py @@ -15,7 +15,7 @@ import crankshaft.pysal_utils as pu # High level interface --------------------------------------- -class Moran: +class Moran(object): def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() diff --git a/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py b/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py index b1b0bf9..6b02f6d 100644 --- a/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py +++ b/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py @@ -25,13 +25,6 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5): Construct PySAL weight from return value of query @param query_res dict-like: query results with attributes and neighbors """ - # if w_type.lower() == 'knn': - # row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs - # weights = {x['id']: row_normed_weights for x in query_res} - # else: - # weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors']) - # if len(x['neighbors']) > 0 - # else [] for x in query_res} neighbors = {x['id']: x['neighbors'] for x in query_res} print 'len of neighbors: %d' % len(neighbors) @@ -148,22 +141,21 @@ def knn(params): "attr_where_i": attr_where.replace("idx_replace", "i"), "attr_where_j": attr_where.replace("idx_replace", "j")} - query = "SELECT " \ - "i.\"{id_col}\" As id, " \ - "%(attr_select)s" \ - "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ - "FROM ({subquery}) As j " \ - "WHERE " \ - "i.\"{id_col}\" <> j.\"{id_col}\" AND " \ - "%(attr_where_j)s " \ - "ORDER BY " \ - "j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \ - "LIMIT 
{num_ngbrs})" \ - ") As neighbors " \ - "FROM ({subquery}) As i " \ - "WHERE " \ - "%(attr_where_i)s " \ - "ORDER BY i.\"{id_col}\" ASC;" % replacements + query = ''' + SELECT + i."{id_col}" As id, + %(attr_select)s + (SELECT ARRAY(SELECT j."{id_col}" + FROM ({subquery}) As j + WHERE i."{id_col}" <> j."{id_col}" AND + %(attr_where_j)s AND + j."{geom_col}" IS NOT NULL + ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC + LIMIT {num_ngbrs})) As neighbors + FROM ({subquery}) As i + WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL + ORDER BY i."{id_col}" ASC; + ''' % replacements return query.format(**params) @@ -180,19 +172,20 @@ def queen(params): "attr_where_i": attr_where.replace("idx_replace", "i"), "attr_where_j": attr_where.replace("idx_replace", "j")} - query = "SELECT " \ - "i.\"{id_col}\" As id, " \ - "%(attr_select)s" \ - "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ - "FROM ({subquery}) As j " \ - "WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \ - "ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \ - "%(attr_where_j)s)" \ - ") As neighbors " \ - "FROM ({subquery}) As i " \ - "WHERE " \ - "%(attr_where_i)s " \ - "ORDER BY i.\"{id_col}\" ASC;" % replacements + query = ''' + SELECT + i."{id_col}" As id, + %(attr_select)s + (SELECT ARRAY(SELECT j."{id_col}" + FROM ({subquery}) As j + WHERE i."{id_col}" <> j."{id_col}" AND + ST_Touches(i."{geom_col}", j."{geom_col}") AND + %(attr_where_j)s)) As neighbors + FROM ({subquery}) As i + WHERE + %(attr_where_i)s + ORDER BY i."{id_col}" ASC; + ''' % replacements return query.format(**params) @@ -256,15 +249,3 @@ def get_attributes(query_res, attr_num=1): """ return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float) - - -def empty_zipped_array(num_nones): - """ - prepare return values for cases of empty weights objects (no neighbors) - Input: - @param num_nones int: number of columns (e.g., 4) - Output: - [(None, None, None, None)] - """ - - return [tuple([None] * num_nones)] diff --git 
a/src/py/crankshaft/crankshaft/random_seeds.py b/src/py/crankshaft/crankshaft/random_seeds.py index 31958cb..c55ba14 100644 --- a/src/py/crankshaft/crankshaft/random_seeds.py +++ b/src/py/crankshaft/crankshaft/random_seeds.py @@ -2,6 +2,7 @@ import random import numpy + def set_random_seeds(value): """ Set the seeds of the RNGs (Random Number Generators) diff --git a/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py b/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py index 3ad8273..20daaf1 100644 --- a/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py +++ b/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py @@ -11,7 +11,7 @@ import crankshaft.pysal_utils as pu from crankshaft.analysis_data_provider import AnalysisDataProvider -class Markov: +class Markov(object): def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() @@ -61,14 +61,14 @@ class Markov: "subquery": subquery, "num_ngbrs": num_ngbrs} - query_result = self.data_provider.get_markov(w_type, params) + result = self.data_provider.get_markov(w_type, params) # build weight - weights = pu.get_weight(query_result, w_type) + weights = pu.get_weight(result, w_type) weights.transform = 'r' # prep time data - t_data = get_time_data(query_result, time_cols) + t_data = get_time_data(result, time_cols) sp_markov_result = ps.Spatial_Markov(t_data, weights, diff --git a/src/py/crankshaft/requirements.txt b/src/py/crankshaft/requirements.txt index 3f50cd7..88c0a9e 100644 --- a/src/py/crankshaft/requirements.txt +++ b/src/py/crankshaft/requirements.txt @@ -1,5 +1,5 @@ joblib==0.8.3 numpy==1.6.1 scipy==0.14.0 -pysal==1.11.2 +pysal==1.14.3 scikit-learn==0.14.1 diff --git a/src/py/crankshaft/setup.py b/src/py/crankshaft/setup.py index cd8ad99..ffbdc19 100644 --- a/src/py/crankshaft/setup.py +++ b/src/py/crankshaft/setup.py @@ -41,7 +41,7 @@ setup( # The choice of component versions is dictated by what's # provisioned in the production 
servers. # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation. - install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1'], + install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1'], requires=['pysal', 'numpy', 'sklearn'], diff --git a/src/py/crankshaft/test/mock_plpy.py b/src/py/crankshaft/test/mock_plpy.py index e8a279d..9c3340c 100644 --- a/src/py/crankshaft/test/mock_plpy.py +++ b/src/py/crankshaft/test/mock_plpy.py @@ -42,6 +42,9 @@ class MockPlPy: def info(self, msg): self.infos.append(msg) + def error(self, msg): + self.notices.append(msg) + def cursor(self, query): data = self.execute(query) return MockCursor(data) diff --git a/src/py/crankshaft/test/test_clustering_kmeans.py b/src/py/crankshaft/test/test_clustering_kmeans.py index 93633b0..c118d34 100644 --- a/src/py/crankshaft/test/test_clustering_kmeans.py +++ b/src/py/crankshaft/test/test_clustering_kmeans.py @@ -2,17 +2,12 @@ import unittest import numpy as np -# from mock_plpy import MockPlPy -# plpy = MockPlPy() -# -# import sys -# sys.modules['plpy'] = plpy from helper import fixture_file from crankshaft.clustering import Kmeans from crankshaft.analysis_data_provider import AnalysisDataProvider import crankshaft.clustering as cc - from crankshaft import random_seeds + import json from collections import OrderedDict @@ -24,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider): def get_spatial_kmeans(self, query): return self.mocked_result - def get_nonspatial_kmeans(self, query, standarize): + def get_nonspatial_kmeans(self, query): return self.mocked_result @@ -54,3 +49,39 @@ class KMeansTest(unittest.TestCase): self.assertEqual(len(np.unique(labels)), 2) self.assertEqual(len(c1), 20) self.assertEqual(len(c2), 20) + + +class KMeansNonspatialTest(unittest.TestCase): + """Testing class for k-means non-spatial""" + + def setUp(self): + self.params = 
{"subquery": "SELECT * FROM TABLE", + "n_clusters": 5} + + def test_kmeans_nonspatial(self): + """ + test for k-means non-spatial + """ + # data from: + # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans + data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]), + ("arr_col2", [2, 4, 0, 2, 4, 0]), + ("rowid", [1, 2, 3, 4, 5, 6])])] + + random_seeds.set_random_seeds(1234) + kmeans = Kmeans(FakeDataProvider(data_raw)) + clusters = kmeans.nonspatial('subquery', ['col1', 'col2'], 2) + + cl1 = clusters[0][0] + cl2 = clusters[3][0] + + for idx, val in enumerate(clusters): + if idx < 3: + self.assertEqual(val[0], cl1) + else: + self.assertEqual(val[0], cl2) + + # raises exception for no data + with self.assertRaises(Exception): + kmeans = Kmeans(FakeDataProvider([])) + kmeans.nonspatial('subquery', ['col1', 'col2'], 2) diff --git a/src/py/crankshaft/test/test_pysal_utils.py b/src/py/crankshaft/test/test_pysal_utils.py index 92b528b..be45164 100644 --- a/src/py/crankshaft/test/test_pysal_utils.py +++ b/src/py/crankshaft/test/test_pysal_utils.py @@ -70,80 +70,10 @@ class PysalUtilsTest(unittest.TestCase): self.assertEqual(pu.query_attr_where(self.params1), ans1) self.assertEqual(pu.query_attr_where(self.params_array), ans_array) - def test_knn(self): - """Test knn neighbors constructor""" - - ans1 = "SELECT i.\"cartodb_id\" As id, " \ - "i.\"andy\"::numeric As attr1, " \ - "i.\"jay_z\"::numeric As attr2, " \ - "(SELECT ARRAY(SELECT j.\"cartodb_id\" " \ - "FROM (SELECT * FROM a_list) As j " \ - "WHERE " \ - "i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \ - "j.\"andy\" IS NOT NULL AND " \ - "j.\"jay_z\" IS NOT NULL " \ - "ORDER BY " \ - "j.\"the_geom\" <-> i.\"the_geom\" ASC " \ - "LIMIT 321)) As neighbors " \ - "FROM (SELECT * FROM a_list) As i " \ - "WHERE i.\"andy\" IS NOT NULL AND " \ - "i.\"jay_z\" IS NOT NULL " \ - "ORDER BY i.\"cartodb_id\" ASC;" - - ans_array = "SELECT i.\"cartodb_id\" As id, " \ - 
"i.\"_2013_dec\"::numeric As attr1, " \ - "i.\"_2014_jan\"::numeric As attr2, " \ - "i.\"_2014_feb\"::numeric As attr3, " \ - "(SELECT ARRAY(SELECT j.\"cartodb_id\" " \ - "FROM (SELECT * FROM a_list) As j " \ - "WHERE i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \ - "j.\"_2013_dec\" IS NOT NULL AND " \ - "j.\"_2014_jan\" IS NOT NULL AND " \ - "j.\"_2014_feb\" IS NOT NULL " \ - "ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC " \ - "LIMIT 321)) As neighbors " \ - "FROM (SELECT * FROM a_list) As i " \ - "WHERE i.\"_2013_dec\" IS NOT NULL AND " \ - "i.\"_2014_jan\" IS NOT NULL AND " \ - "i.\"_2014_feb\" IS NOT NULL "\ - "ORDER BY i.\"cartodb_id\" ASC;" - - self.assertEqual(pu.knn(self.params1), ans1) - self.assertEqual(pu.knn(self.params_array), ans_array) - - def test_queen(self): - """Test queen neighbors constructor""" - - ans1 = "SELECT i.\"cartodb_id\" As id, " \ - "i.\"andy\"::numeric As attr1, " \ - "i.\"jay_z\"::numeric As attr2, " \ - "(SELECT ARRAY(SELECT j.\"cartodb_id\" " \ - "FROM (SELECT * FROM a_list) As j " \ - "WHERE " \ - "i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \ - "ST_Touches(i.\"the_geom\", " \ - "j.\"the_geom\") AND " \ - "j.\"andy\" IS NOT NULL AND " \ - "j.\"jay_z\" IS NOT NULL)" \ - ") As neighbors " \ - "FROM (SELECT * FROM a_list) As i " \ - "WHERE i.\"andy\" IS NOT NULL AND " \ - "i.\"jay_z\" IS NOT NULL " \ - "ORDER BY i.\"cartodb_id\" ASC;" - - self.assertEqual(pu.queen(self.params1), ans1) - - def test_construct_neighbor_query(self): - """Test construct_neighbor_query""" - - # Compare to raw knn query - self.assertEqual(pu.construct_neighbor_query('knn', self.params1), - pu.knn(self.params1)) - def test_get_attributes(self): """Test get_attributes""" - ## need to add tests + # need to add tests self.assertEqual(True, True) @@ -151,10 +81,3 @@ class PysalUtilsTest(unittest.TestCase): """Test get_weight""" self.assertEqual(True, True) - - def test_empty_zipped_array(self): - """Test empty_zipped_array""" - ans2 = [(None, None)] - ans4 = 
[(None, None, None, None)] - self.assertEqual(pu.empty_zipped_array(2), ans2) - self.assertEqual(pu.empty_zipped_array(4), ans4)