diff --git a/mgwr/diagnostics.py b/mgwr/diagnostics.py index 82810de..f2c0d06 100755 --- a/mgwr/diagnostics.py +++ b/mgwr/diagnostics.py @@ -11,34 +11,37 @@ def get_AICc(gwr): """ Get AICc value - + Gaussian: p61, (2.33), Fotheringham, Brunsdon and Charlton (2002) - + GWGLM: AICc=AIC+2k(k+1)/(n-k-1), Nakaya et al. (2005): p2704, (36) """ n = gwr.n k = gwr.tr_S + y = gwr.y + mu = gwr.mu #sigma2 = gwr.sigma2 if isinstance(gwr.family, Gaussian): aicc = -2.0 * gwr.llf + 2.0 * n * (k + 1.0) / ( n - k - 2.0) #equivalent to below but - #can't control denominator of sigma without altering GLM familt code + #can't control denominator of sigma without altering GLM family code #aicc = n*np.log(sigma2) + n*np.log(2.0*np.pi) + n*(n+k)/(n-k-2.0) elif isinstance(gwr.family, (Poisson, Binomial)): aicc = get_AIC(gwr) + 2.0 * k * (k + 1.0) / (n - k - 1.0) + #aicc = np.sum(gwr.family.resid_dev(y, mu)**2) + 2.0 * n * (k + 1.0) / (n - k - 2.0) return aicc def get_AIC(gwr): """ - Get AIC calue + Get AIC value Gaussian: p96, (4.22), Fotheringham, Brunsdon and Charlton (2002) GWGLM: AIC(G)=D(G) + 2K(G), where D and K denote the deviance and the effective number of parameters in the model with bandwidth G, respectively. - + """ k = gwr.tr_S #deviance = -2*log-likelihood diff --git a/mgwr/gwr.py b/mgwr/gwr.py index f771c43..3862e85 100755 --- a/mgwr/gwr.py +++ b/mgwr/gwr.py @@ -134,7 +134,7 @@ class GWR(GLM): spherical : boolean True for shperical coordinates (long-lat), False for projected coordinates (defalut). - + hat_matrix : boolean True to store full n by n hat matrix, False to not store full hat matrix to minimize memory footprint (defalut). 
@@ -516,10 +516,10 @@ class GWRResults(GLMResults): R2 : float R-squared for the entire model (1- RSS/TSS) - + adj_R2 : float adjusted R-squared for the entire model - + aic : float Akaike information criterion @@ -575,11 +575,11 @@ class GWRResults(GLMResults): pDev : float local percent of deviation accounted for; analogous to r-squared for GLM's - + D2 : float percent deviance explained for GLM, equivaleng to R2 for Gaussian. - + adj_D2 : float adjusted percent deviance explained, equivaleng to adjusted R2 for Gaussian. @@ -1438,8 +1438,8 @@ class MGWR(GWR): """ - def __init__(self, coords, y, X, selector, sigma2_v1=True, - kernel='bisquare', fixed=False, constant=True, + def __init__(self, coords, y, X, selector, family, offset=None, + sigma2_v1=True, kernel='bisquare', fixed=False, constant=True, spherical=False, hat_matrix=False): """ Initialize class @@ -1448,17 +1448,18 @@ def __init__(self, coords, y, X, selector, sigma2_v1=True, self.bws = self.selector.bw[0] #final set of bandwidth self.bws_history = selector.bw[1] #bws history in backfitting self.bw_init = self.selector.bw_init #initialization bandiwdth - self.family = Gaussian( - ) # manually set since we only support Gassian MGWR for now - GWR.__init__(self, coords, y, X, self.bw_init, family=self.family, - sigma2_v1=sigma2_v1, kernel=kernel, fixed=fixed, - constant=constant, spherical=spherical, + self.family = family + self.offset = offset + # manually set since we only support Gassian MGWR for now + GWR.__init__(self, coords, y, X, self.bw_init, self.family, + self.offset,sigma2_v1=sigma2_v1, kernel=kernel, + fixed=fixed, constant=constant, spherical=spherical, hat_matrix=hat_matrix) self.selector = selector self.sigma2_v1 = sigma2_v1 self.points = None self.P = None - self.offset = None + self.family = family self.exog_resid = None self.exog_scale = None self_fit_params = None @@ -1483,7 +1484,14 @@ def _chunk_compute_R(self, chunk_id=0): for i in range(n): wi = self._build_wi(i, 
self.bw_init).reshape(-1, 1) - xT = (self.X * wi).T + if isinstance(self.family, Poisson): + wi=wi.reshape(-1,1) + rslt = iwls(self.y, self.X, self.family, self.offset, None, wi=wi) + inv_xtx_xt = rslt[5] + w = rslt[3] + xT = (self.X * w).T + else: + xT = (self.X * wi).T P = np.linalg.solve(xT.dot(self.X), xT).dot(init_pR).T pR[i, :, :] = P * self.X[i] @@ -1502,8 +1510,16 @@ def _chunk_compute_R(self, chunk_id=0): for i in range(len(chunk_index_Aj)): index = chunk_index_Aj[i] wi = self._build_wi(index, self.bws_history[iter_i, j]) - xw = Xj * wi - pAj[i, :] = Xj[index] / np.sum(xw * Xj) * xw + if isinstance(self.family, Poisson): + Xj = Xj.reshape(-1,1) + wi = wi.reshape(-1,1) + rslt = iwls(self.y, Xj, self.family, self.offset, None, wi=wi) + + w = rslt[3] + xw = Xj * w + else: + xw = Xj * wi + pAj[i, :] = (Xj[index] / np.sum(xw * Xj) * xw).reshape(-1) pR[chunk_index_Aj, :, j] = pAj.dot(pRj_old) err = pRj_old - pR[:, :, j] @@ -1520,21 +1536,33 @@ def _chunk_compute_R(self, chunk_id=0): def fit(self, n_chunks=1, pool=None): """ Compute MGWR inference by chunk to reduce memory footprint. - + Parameters ---------- n_chunks : integer, optional - A number of chunks parameter to reduce memory usage. + A number of chunks parameter to reduce memory usage. e.g. n_chunks=2 should reduce overall memory usage by 2. pool : A multiprocessing Pool object to enable parallel fitting; default is None. 
- + Returns ------- : MGWRResults """ + #self.fit_params['ini_params'] = ini_params + #self.fit_params['tol'] = tol + #self.fit_params['max_iter'] = max_iter + params = self.selector.params - predy = np.sum(self.X * params, axis=1).reshape(-1, 1) + + if isinstance(self.family,Poisson): + predy = self.offset*(np.exp(np.sum(self.X * params, axis=1).reshape(-1, 1))) + + elif isinstance(self.family,Binomial): + predy = 1/(1+np.exp(-1*np.sum(self.X * params, axis=1).reshape(-1, 1))) + + else: + predy = np.sum(self.X * params, axis=1).reshape(-1, 1) try: from tqdm.autonotebook import tqdm #progress bar @@ -1692,7 +1720,7 @@ class MGWRResults(GWRResults): R2 : float R-squared for the entire model (1- RSS/TSS) - + adj_R2 : float adjusted R-squared for the entire model diff --git a/mgwr/search.py b/mgwr/search.py index fdd6662..78ce674 100755 --- a/mgwr/search.py +++ b/mgwr/search.py @@ -4,6 +4,7 @@ import numpy as np from copy import deepcopy +from spglm.family import Gaussian, Binomial, Poisson def golden_section(a, c, delta, function, tol, max_iter, int_score=False, @@ -164,8 +165,8 @@ def equal_interval(l_bound, u_bound, interval, function, int_score=False, return opt_val, opt_score, output -def multi_bw(init, y, X, n, k, family, tol, max_iter, rss_score, gwr_func, - bw_func, sel_func, multi_bw_min, multi_bw_max, bws_same_times, +def multi_bw(init, y, X, n, k, family, offset, tol, max_iter, rss_score, gwr_func, + gwr_func_g, bw_func, bw_func_g, sel_func, multi_bw_min, multi_bw_max, bws_same_times, verbose=False): """ Multiscale GWR bandwidth search procedure using iterative GAM backfitting @@ -180,7 +181,17 @@ def multi_bw(init, y, X, n, k, family, tol, max_iter, rss_score, gwr_func, err = optim_model.resid_response.reshape((-1, 1)) param = optim_model.params - XB = np.multiply(param, X) + if isinstance(family, Poisson): + XB = offset*np.exp(np.multiply(param,X)) + elif isinstance(family, Binomial): + #v = np.multiply(X, param) + #XB = 1 / (1 + np.exp(-1 * v)) + #XB = v 
+ ((1 / (mu * (1 - mu))) * (y - mu)) + XB = 1/(1+np.exp(-1*np.multiply(param,X))) + #XB=np.log(XB/(1-XB)) + else: + XB = np.multiply(param, X) + if rss_score: rss = np.sum((err)**2) iters = 0 @@ -205,6 +216,10 @@ def tqdm(x, desc=''): #otherwise, just passthrough the range temp_y = XB[:, j].reshape((-1, 1)) temp_y = temp_y + err temp_X = X[:, j].reshape((-1, 1)) + + #if isinstance(family, Binomial): + #bw_class = bw_func_g(temp_y, temp_X) + #else: bw_class = bw_func(temp_y, temp_X) if np.all(bw_stable_counter == bws_same_times): @@ -217,10 +232,16 @@ def tqdm(x, desc=''): #otherwise, just passthrough the range else: bw_stable_counter = np.ones(k) + #if isinstance(family, Binomial): + #optim_model = gwr_func_g(temp_y, temp_X, bw) + + #else: optim_model = gwr_func(temp_y, temp_X, bw) err = optim_model.resid_response.reshape((-1, 1)) param = optim_model.params.reshape((-1, )) + #new_XB[:,j] = 1/(1+np.exp(-1*np.sum(temp_X * param, axis=1).reshape(-1))) new_XB[:, j] = optim_model.predy.reshape(-1) + #new_XB[:, j] = np.log(new_XB[:,j]/(1-new_XB[:,j])) params[:, j] = param bws[j] = bw @@ -230,7 +251,14 @@ def tqdm(x, desc=''): #otherwise, just passthrough the range XB = new_XB if rss_score: - predy = np.sum(np.multiply(params, X), axis=1).reshape((-1, 1)) + if isinstance(family, Poisson): + predy = offset*(np.exp(np.sum(X * params, axis=1).reshape(-1, 1))) + + elif isinstance(family,Binomial): + predy = 1/(1+np.exp(-1*np.sum(X * params, axis=1).reshape(-1, 1))) + + else: + predy = np.sum(np.multiply(params, X), axis=1).reshape((-1, 1)) new_rss = np.sum((y - predy)**2) score = np.abs((new_rss - rss) / new_rss) rss = new_rss diff --git a/mgwr/sel_bw.py b/mgwr/sel_bw.py index 34d5587..d2d696c 100755 --- a/mgwr/sel_bw.py +++ b/mgwr/sel_bw.py @@ -136,7 +136,7 @@ class Sel_BW(object): >>> pov = np.array(data.by_col('PctPov')).reshape((-1,1)) >>> african_amer = np.array(data.by_col('PctBlack')).reshape((-1,1)) >>> X = np.hstack([rural, pov, african_amer]) - + Golden section 
search AICc - adaptive bisquare >>> bw = Sel_BW(coords, y, X).search(criterion='AICc') @@ -213,7 +213,7 @@ def search(self, search_method='golden_section', criterion='AICc', min value used in bandwidth search bw_max : float max value used in bandwidth search - multi_bw_min : list + multi_bw_min : list min values used for each covariate in mgwr bandwidth search. Must be either a single value or have one value for each covariate including the intercept @@ -374,12 +374,34 @@ def _mbw(self): bws_same_times = self.bws_same_times def gwr_func(y, X, bw): + family = self.family + #if isinstance(family, Binomial): + #family = Gaussian() + return GWR(coords, y, X, bw, family=family, kernel=kernel, + fixed=fixed, offset=offset, constant=False, + spherical=self.spherical, hat_matrix=False).fit( + lite=True, pool=self.pool) + + def gwr_func_g(y, X, bw): + family = self.family + family = Gaussian() return GWR(coords, y, X, bw, family=family, kernel=kernel, fixed=fixed, offset=offset, constant=False, spherical=self.spherical, hat_matrix=False).fit( lite=True, pool=self.pool) def bw_func(y, X): + family = self.family + #if isinstance(family, Binomial): + #family = Gaussian() + selector = Sel_BW(coords, y, X, X_glob=[], family=family, + kernel=kernel, fixed=fixed, offset=offset, + constant=False, spherical=self.spherical) + return selector + + def bw_func_g(y, X): + family = self.family + family = Gaussian() selector = Sel_BW(coords, y, X, X_glob=[], family=family, kernel=kernel, fixed=fixed, offset=offset, constant=False, spherical=self.spherical) @@ -391,9 +413,9 @@ def sel_func(bw_func, bw_min=None, bw_max=None): bw_min=bw_min, bw_max=bw_max, interval=interval, tol=tol, max_iter=max_iter, pool=self.pool, verbose=False) - self.bw = multi_bw(self.init_multi, y, X, n, k, family, self.tol_multi, - self.max_iter_multi, self.rss_score, gwr_func, - bw_func, sel_func, multi_bw_min, multi_bw_max, + self.bw = multi_bw(self.init_multi, y, X, n, k, family, offset, self.tol_multi, + 
self.max_iter_multi, self.rss_score, gwr_func,gwr_func_g, + bw_func,bw_func_g,sel_func, multi_bw_min, multi_bw_max, bws_same_times, verbose=self.verbose) def _init_section(self, X_glob, X_loc, coords, constant): diff --git a/notebooks/Binomial_MGWR_approaches_tried.ipynb b/notebooks/Binomial_MGWR_approaches_tried.ipynb new file mode 100644 index 0000000..e7bc82d --- /dev/null +++ b/notebooks/Binomial_MGWR_approaches_tried.ipynb @@ -0,0 +1,758 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://github.com/pysal/mgwr/pull/56" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"C:/Users/msachde1/Downloads/Research/Development/mgwr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from mgwr.gwr import GWR\n", + "from spglm.family import Gaussian, Binomial, Poisson\n", + "from mgwr.gwr import MGWR\n", + "from mgwr.sel_bw import Sel_BW\n", + "import multiprocessing as mp\n", + "pool = mp.Pool()\n", + "from scipy import linalg\n", + "import numpy.linalg as la\n", + "from scipy import sparse as sp\n", + "from scipy.sparse import linalg as spla\n", + "from spreg.utils import spdot, spmultiply\n", + "from scipy import special\n", + "import libpysal as ps\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from copy import deepcopy\n", + "import copy\n", + "from collections import namedtuple\n", + "import spglm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fundamental equation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By simple algebraic manipulation, the probability that Y=1 is:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +
"\\begin{align}\n", + "p &= \\frac{1}{1 + \\exp\\left(-\\sum_k \\beta_k x_{k,i}\\right)}\n", + "\\end{align}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Approaches tried:\n", + "\n", + "1. Changing XB to: `1 / (1 + np.exp (-1*np.sum(np.multiply(X,params),axis=1)))` - these are the predicted probabilities, which lie in (0,1)\n", + "\n", + "\n", + "2. Changing XB as above and writing a function to create temp_y as a binary variable using condition `1 if BXi > 0 else 0.`\n", + "\n", + "\n", + "3. Deriving temp_y as in IWLS for logistic regression, as below:\n", + "\n", + " `v = np.sum(np.multiply(X,params),axis=1)`\n", + " \n", + " `mu = 1/(1+(np.exp(-v)))`\n", + " \n", + " `z = v + (1/(mu * (1-mu)) * (y-mu))` -- this becomes the temp_y\n", + " \n", + " Then a simple linear regression can be run with z as the temporary dependent variable\n", + " \n", + " \n", + "4. Taken from GAM logistic model literature:\n", + " \n", + " `y=exp(b0+b1*x1+...+bm*xm)/{1+exp(b0+b1*x1+...+bm*xm)}`\n", + "\n", + " Applying the logistic link function to the probability p (ranging between 0 and 1):\n", + "\n", + " `p' = log {p/(1-p)}`\n", + "\n", + " By applying the logistic link function, we can now rewrite the model as:\n", + "\n", + " `p' = b0 + b1*X1 + ... + bm*Xm`\n", + "\n", + " Finally, we substitute the simple single-parameter additive terms to derive the generalized additive logistic model:\n", + "\n", + " `p' = b0 + f1(X1) + ... 
+ fm(Xm)`\n", + " \n", + " (http://www.statsoft.com/textbook/generalized-additive-models#gam)\n", + " \n", + " This is the current approach in the latest commit:\n", + " \n", + " `XB = 1 / (1 + np.exp (-1*(np.multiply(X,params))))`\n", + " \n", + " XB is now the probability and is normally distributed\n", + " \n", + " Run MGWR (Gaussian) on this as the dependent variable for the partial models.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Clearwater data - downloaded from link: https://sgsup.asu.edu/sparc/multiscale-gwr" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data_p = pd.read_csv(\"C:/Users/msachde1/Downloads/logistic_mgwr_data/landslides.csv\") " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | UserID | \n", + "X | \n", + "Y | \n", + "Elev | \n", + "Slope | \n", + "SinAspct | \n", + "CosAspct | \n", + "AbsSouth | \n", + "Landslid | \n", + "DistStrm | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "616168.5625 | \n", + "5201076.5 | \n", + "1450.475 | \n", + "27.44172 | \n", + "0.409126 | \n", + "-0.912478 | \n", + "24.1499 | \n", + "1 | \n", + "8.506 | \n", + "
| 1 | \n", + "2 | \n", + "624923.8125 | \n", + "5201008.5 | \n", + "1567.476 | \n", + "21.88343 | \n", + "-0.919245 | \n", + "-0.393685 | \n", + "66.8160 | \n", + "1 | \n", + "15.561 | \n", + "
| 2 | \n", + "3 | \n", + "615672.0000 | \n", + "5199187.5 | \n", + "1515.065 | \n", + "38.81030 | \n", + "-0.535024 | \n", + "-0.844837 | \n", + "32.3455 | \n", + "1 | \n", + "41.238 | \n", + "
| 3 | \n", + "4 | \n", + "615209.3125 | \n", + "5199112.0 | \n", + "1459.827 | \n", + "26.71631 | \n", + "-0.828548 | \n", + "-0.559918 | \n", + "55.9499 | \n", + "1 | \n", + "17.539 | \n", + "
| 4 | \n", + "5 | \n", + "616354.6875 | \n", + "5198945.5 | \n", + "1379.442 | \n", + "27.55271 | \n", + "-0.872281 | \n", + "-0.489005 | \n", + "60.7248 | \n", + "1 | \n", + "35.023 | \n", + "
| \n", + " | UserID | \n", + "X | \n", + "Y | \n", + "Elev | \n", + "Slope | \n", + "SinAspct | \n", + "CosAspct | \n", + "AbsSouth | \n", + "Landslid | \n", + "DistStrm | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "616168.5625 | \n", + "5201076.5 | \n", + "1450.475 | \n", + "27.44172 | \n", + "0.409126 | \n", + "-0.912478 | \n", + "24.1499 | \n", + "1 | \n", + "8.506 | \n", + "
| 1 | \n", + "2 | \n", + "624923.8125 | \n", + "5201008.5 | \n", + "1567.476 | \n", + "21.88343 | \n", + "-0.919245 | \n", + "-0.393685 | \n", + "66.8160 | \n", + "1 | \n", + "15.561 | \n", + "
| 2 | \n", + "3 | \n", + "615672.0000 | \n", + "5199187.5 | \n", + "1515.065 | \n", + "38.81030 | \n", + "-0.535024 | \n", + "-0.844837 | \n", + "32.3455 | \n", + "1 | \n", + "41.238 | \n", + "
| 3 | \n", + "4 | \n", + "615209.3125 | \n", + "5199112.0 | \n", + "1459.827 | \n", + "26.71631 | \n", + "-0.828548 | \n", + "-0.559918 | \n", + "55.9499 | \n", + "1 | \n", + "17.539 | \n", + "
| 4 | \n", + "5 | \n", + "616354.6875 | \n", + "5198945.5 | \n", + "1379.442 | \n", + "27.55271 | \n", + "-0.872281 | \n", + "-0.489005 | \n", + "60.7248 | \n", + "1 | \n", + "35.023 | \n", + "
| \n", + " | UserID | \n", + "X | \n", + "Y | \n", + "Elev | \n", + "Slope | \n", + "SinAspct | \n", + "CosAspct | \n", + "AbsSouth | \n", + "Landslid | \n", + "DistStrm | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "616168.5625 | \n", + "5201076.5 | \n", + "1450.475 | \n", + "27.44172 | \n", + "0.409126 | \n", + "-0.912478 | \n", + "24.1499 | \n", + "1 | \n", + "8.506 | \n", + "
| 1 | \n", + "2 | \n", + "624923.8125 | \n", + "5201008.5 | \n", + "1567.476 | \n", + "21.88343 | \n", + "-0.919245 | \n", + "-0.393685 | \n", + "66.8160 | \n", + "1 | \n", + "15.561 | \n", + "
| 2 | \n", + "3 | \n", + "615672.0000 | \n", + "5199187.5 | \n", + "1515.065 | \n", + "38.81030 | \n", + "-0.535024 | \n", + "-0.844837 | \n", + "32.3455 | \n", + "1 | \n", + "41.238 | \n", + "
| 3 | \n", + "4 | \n", + "615209.3125 | \n", + "5199112.0 | \n", + "1459.827 | \n", + "26.71631 | \n", + "-0.828548 | \n", + "-0.559918 | \n", + "55.9499 | \n", + "1 | \n", + "17.539 | \n", + "
| 4 | \n", + "5 | \n", + "616354.6875 | \n", + "5198945.5 | \n", + "1379.442 | \n", + "27.55271 | \n", + "-0.872281 | \n", + "-0.489005 | \n", + "60.7248 | \n", + "1 | \n", + "35.023 | \n", + "