pysal · jeffcsauer · Jul 20, 2020 · Aug 22, 2020 · Aug 28, 2020 · Aug 29, 2020
diff --git a/esda/local_join_count.py b/esda/local_join_count.py
@@ -0,0 +1,161 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator
+from libpysal import weights
+from esda.crand import (
+    crand as _crand_plus,
+    njit as _njit,
+    _prepare_univariate
+)
+
+
+class Local_Join_Count(BaseEstimator):
+
+    """Univariate Local Join Count Statistic"""
+
+    def __init__(self, connectivity=None, permutations=999, n_jobs=1, 
+                 keep_simulations=True, seed=None):
+        """
+        Initialize a Local_Join_Count estimator
+        Arguments
+        ---------
+        connectivity     : scipy.sparse matrix object
+                           the connectivity structure describing
+                           the relationships between observed units.
+                           Need not be row-standardized.
+        permutations     : int
+                           number of random permutations for calculation of pseudo
+                           p_values
+        n_jobs           : int
+                           Number of cores to be used in the conditional randomisation. If -1,
+                           all available cores are used.    
+        keep_simulations : Boolean
+                           (default=True)
+                           If True, the entire matrix of replications under the null 
+                           is stored in memory and accessible; otherwise, replications 
+                           are not saved
+        seed             : None/int
+                           Seed to ensure reproducibility of conditional randomizations. 
+                           Must be set here, and not outside of the function, since numba 
+                           does not correctly interpret external seeds 
+                           nor numpy.random.RandomState instances.              
+
+        Attributes
+        ----------
+        LJC             : numpy array
+                          array containing the univariate
+                          Local Join Count (LJC).
+        p_sim           : numpy array
+                          array containing the simulated
+                          p-values for each unit.
+
+        """
+
+        self.connectivity = connectivity
+        self.permutations = permutations
+        self.n_jobs = n_jobs
+        self.keep_simulations = keep_simulations
+        self.seed = seed
+
+    def fit(self, x):
+        """
+        Arguments
+        ---------
+        x               : numpy.ndarray
+                          array containing binary (0/1) data
+        Returns
+        -------
+        the fitted estimator.
+
+        Notes
+        -----
+        Technical details and derivations found in :cite:`AnselinLi2019`.
+
+        Examples
+        --------
+        >>> import libpysal
+        >>> w = libpysal.weights.lat2W(4, 4)
+        >>> x = np.ones(16)
+        >>> x[0:8] = 0
+        >>> LJC_uni = Local_Join_Count(connectivity=w).fit(x)
+        >>> LJC_uni.LJC
+        >>> LJC_uni.p_sim
+
+        Guerry data replicating GeoDa tutorial
+        >>> import libpysal
+        >>> import geopandas as gpd
+        >>> guerry = libpysal.examples.load_example('Guerry')
+        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
+        >>> guerry_ds['SELECTED'] = 0
+        >>> guerry_ds.loc[(guerry_ds['Donatns'] > 10997), 'SELECTED'] = 1
+        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
+        >>> LJC_uni = Local_Join_Count(connectivity=w).fit(guerry_ds['SELECTED'])
+        >>> LJC_uni.LJC
+        >>> LJC_uni.p_sim
+        """
+        # Need to ensure that the np.array() are of
+        # dtype='float' for numba
+        x = np.array(x, dtype='float')
+
+        w = self.connectivity
+        # Fill the diagonal with 0s
+        w = weights.util.fill_diagonal(w, val=0)
+        w.transform = 'b'
+
+        keep_simulations = self.keep_simulations
+        n_jobs = self.n_jobs
+        seed = self.seed
+
+        permutations = self.permutations
+
+        self.x = x
+        self.n = len(x)
+        self.w = w
+
+        self.LJC = self._statistic(x, w)
+
+        if permutations:
+            self.p_sim, self.rjoins = _crand_plus(
+                z=self.x, 
+                w=self.w, 
+                observed=self.LJC,
+                permutations=permutations, 
+                keep=keep_simulations, 
+                n_jobs=n_jobs,
+                stat_func=_ljc_uni
+            )
+            # Set p-values for those with LJC of 0 to NaN
+            self.p_sim[self.LJC == 0] = 'NaN'
+
+        del (self.n, self.keep_simulations, self.n_jobs, 
+             self.permutations, self.seed, self.w, self.x,
+             self.connectivity, self.rjoins)
+
+        return self
+
+    @staticmethod
+    def _statistic(x, w):
+        # Create adjacency list. Note that remove_symmetric=False - this is
+        # different from the esda.Join_Counts() function.
+        adj_list = w.to_adjlist(remove_symmetric=False)
+        zseries = pd.Series(x, index=w.id_order)
+        focal = zseries.loc[adj_list.focal].values
+        neighbor = zseries.loc[adj_list.neighbor].values
+        LJC = (focal == 1) & (neighbor == 1)
+        adj_list_LJC = pd.DataFrame(adj_list.focal.values,
+                                    LJC.astype('uint8')).reset_index()
+        adj_list_LJC.columns = ['LJC', 'ID']
+        adj_list_LJC = adj_list_LJC.groupby(by='ID').sum()
+        LJC = np.array(adj_list_LJC.LJC.values, dtype='float')
+        return (LJC)
+
+# --------------------------------------------------------------
+# Conditional Randomization Function Implementations
+# --------------------------------------------------------------
+
+# Note: scaling not used
+
+@_njit(fastmath=True)
+def _ljc_uni(i, z, permuted_ids, weights_i, scaling):
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
+    return zi * (zrand @ weights_i)
diff --git a/esda/local_join_count_bv.py b/esda/local_join_count_bv.py
@@ -0,0 +1,229 @@
+import numpy as np
+import pandas as pd
+import warnings
+from scipy import sparse
+from sklearn.base import BaseEstimator
+from libpysal import weights
+from esda.crand import (
+    crand as _crand_plus,
+    njit as _njit,
+    _prepare_univariate,
+    _prepare_bivariate
+)
+
+
+class Local_Join_Count_BV(BaseEstimator):
+
+    """Univariate Local Join Count Statistic"""
+
+    def __init__(self, connectivity=None, permutations=999, n_jobs=1, 
+                 keep_simulations=True, seed=None):
+        """
+        Initialize a Local_Join_Count_BV estimator
+        Arguments
+        ---------
+        connectivity     : scipy.sparse matrix object
+                           the connectivity structure describing
+                           the relationships between observed units.
+                           Need not be row-standardized.
+        permutations     : int
+                           number of random permutations for calculation of pseudo
+                           p_values
+        n_jobs           : int
+                           Number of cores to be used in the conditional randomisation. If -1,
+                           all available cores are used.    
+        keep_simulations : Boolean
+                           (default=True)
+                           If True, the entire matrix of replications under the null 
+                           is stored in memory and accessible; otherwise, replications 
+                           are not saved
+        seed             : None/int
+                           Seed to ensure reproducibility of conditional randomizations. 
+                           Must be set here, and not outside of the function, since numba 
+                           does not correctly interpret external seeds 
+                           nor numpy.random.RandomState instances.              
+
+        """
+
+        self.connectivity = connectivity
+        self.permutations = permutations
+        self.n_jobs = n_jobs
+        self.keep_simulations = keep_simulations
+        self.seed = seed
+
+    def fit(self, x, y, case="CLC"):
+        """
+        Arguments
+        ---------
+        x                : numpy.ndarray
+                           array containing binary (0/1) data
+        y                : numpy.ndarray
+                           array containing binary (0/1) data
+        case             : str
+                           "BJC" for bivariate local join count,
+                           "CLC" for co-location local join count.
+                           Details in :cite:`AnselinLi2019`.
+
+        Returns
+        -------
+        the fitted estimator.
+
+        Notes
+        -----
+        Technical details and derivations can be found in :cite:`AnselinLi2019`.
+
+        Examples
+        --------
+        >>> import libpysal
+        >>> w = libpysal.weights.lat2W(4, 4)
+        >>> x = np.ones(16)
+        >>> x[0:8] = 0
+        >>> y = [0,1,0,1,1,1,1,1,0,0,1,1,0,0,1,1]
+        >>> LJC_BV_C1 = Local_Join_Count_BV(connectivity=w).fit(x, y, case="BJC")
+        >>> LJC_BV_C2 = Local_Join_Count_BV(connectivity=w).fit(x, y, case="CLC")
+        >>> LJC_BV_C1.LJC
+        >>> LJC_BV_C1.p_sim
+        >>> LJC_BV_C2.LJC
+        >>> LJC_BV_C2.p_sim
+
+        Commpop data replicating GeoDa tutorial (Case 1)
+        >>> import libpysal
+        >>> import geopandas as gpd
+        >>> commpop = gpd.read_file("https://github.com/jeffcsauer/GSOC2020/raw/master/validation/data/commpop.gpkg")
+        >>> w = libpysal.weights.Queen.from_dataframe(commpop)
+        >>> LJC_BV_Case1 = Local_Join_Count_BV(connectivity=w).fit(commpop['popneg'], commpop['popplus'], case='BJC')
+        >>> LJC_BV_Case1.LJC
+        >>> LJC_BV_Case1.p_sim
+
+        Guerry data replicating GeoDa tutorial (Case 2)
+        >>> import libpysal
+        >>> import geopandas as gpd
+        >>> guerry = libpysal.examples.load_example('Guerry')
+        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
+        >>> guerry_ds['infq5'] = 0
+        >>> guerry_ds['donq5'] = 0
+        >>> guerry_ds.loc[(guerry_ds['Infants'] > 23574), 'infq5'] = 1
+        >>> guerry_ds.loc[(guerry_ds['Donatns'] > 10973), 'donq5'] = 1
+        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
+        >>> LJC_BV_Case2 = Local_Join_Count_BV(connectivity=w).fit(guerry_ds['infq5'], guerry_ds['donq5'], case='CLC')
+        >>> LJC_BV_Case2.LJC
+        >>> LJC_BV_Case2.p_sim
+        """
+        # Need to ensure that the np.array() are of
+        # dtype='float' for numba
+        x = np.array(x, dtype='float')
+        y = np.array(y, dtype='float')
+
+        w = self.connectivity
+        # Fill the diagonal with 0s
+        w = weights.util.fill_diagonal(w, val=0)
+        w.transform = 'b'
+
+        self.x = x
+        self.y = y
+        self.n = len(x)
+        self.w = w
+        self.case = case
+
+        keep_simulations = self.keep_simulations
+        n_jobs = self.n_jobs
+        seed = self.seed
+
+        permutations = self.permutations
+
+        self.LJC = self._statistic(x, y, w, case=case)
+
+        if permutations:
+            if case == "BJC":
+                self.p_sim, self.rjoins = _crand_plus(
+                    z=np.column_stack((x, y)),
+                    w=self.w, 
+                    observed=self.LJC,
+                    permutations=permutations, 
+                    keep=True, 
+                    n_jobs=n_jobs,
+                    stat_func=_ljc_bv_case1
+                )
+                # Set p-values for those with LJC of 0 to NaN
+                self.p_sim[self.LJC == 0] = 'NaN'
+            elif case == "CLC":
+                self.p_sim, self.rjoins = _crand_plus(
+                    z=np.column_stack((x, y)),
+                    w=self.w, 
+                    observed=self.LJC,
+                    permutations=permutations, 
+                    keep=True, 
+                    n_jobs=n_jobs,
+                    stat_func=_ljc_bv_case2
+                )
+                # Set p-values for those with LJC of 0 to NaN
+                self.p_sim[self.LJC == 0] = 'NaN'
+            else:
+                raise NotImplementedError(f'The requested LJC method ({case}) \
+                is not currently supported!')
+
+        del (self.n, self.keep_simulations, self.n_jobs, 
+             self.permutations, self.seed, self.w, self.x,
+             self.y, self.connectivity, self.rjoins)
+
+        return self
+
+    @staticmethod
+    def _statistic(x, y, w, case):
+        # Create adjacency list. Note that remove_symmetric=False - this is
+        # different from the esda.Join_Counts() function.
+        adj_list = w.to_adjlist(remove_symmetric=False)
+
+        # First, set up a series that maps the values
+        # to the weights table
+        zseries_x = pd.Series(x, index=w.id_order)
+        zseries_y = pd.Series(y, index=w.id_order)
+
+        # Map the values to the focal (i) values
+        focal_x = zseries_x.loc[adj_list.focal].values
+        focal_y = zseries_y.loc[adj_list.focal].values
+
+        # Map the values to the neighbor (j) values
+        neighbor_x = zseries_x.loc[adj_list.neighbor].values
+        neighbor_y = zseries_y.loc[adj_list.neighbor].values
+
+        if case == "BJC":
+            BJC = (focal_x == 1) & (focal_y == 0) & \
+                  (neighbor_x == 0) & (neighbor_y == 1)
+            adj_list_BJC = pd.DataFrame(adj_list.focal.values,
+                                        BJC.astype('uint8')).reset_index()
+            adj_list_BJC.columns = ['BJC', 'ID']
+            adj_list_BJC = adj_list_BJC.groupby(by='ID').sum()
+            return (np.array(adj_list_BJC.BJC.values, dtype='float'))
+        elif case == "CLC":
+            CLC = (focal_x == 1) & (focal_y == 1) & \
+                  (neighbor_x == 1) & (neighbor_y == 1)
+            adj_list_CLC = pd.DataFrame(adj_list.focal.values,
+                                        CLC.astype('uint8')).reset_index()
+            adj_list_CLC.columns = ['CLC', 'ID']
+            adj_list_CLC = adj_list_CLC.groupby(by='ID').sum()
+            return (np.array(adj_list_CLC.CLC.values, dtype='float'))
+        else:
+            raise NotImplementedError(f'The requested LJC method ({case}) \
+            is not currently supported!')
+
+# --------------------------------------------------------------
+# Conditional Randomization Function Implementations
+# --------------------------------------------------------------
+
+# Note: scaling not used
+
+@_njit(fastmath=True)
+def _ljc_bv_case1(i, z, permuted_ids, weights_i, scaling):
+    zx = z[:, 0]
+    zy = z[:, 1]
+    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, weights_i)
+    return zx[i] * (zyrand @ weights_i)
+
+@_njit(fastmath=True)
+def _ljc_bv_case2(i, z, permuted_ids, weights_i, scaling):
+    zx = z[:, 0]
+    zy = z[:, 1]
+    zxi, zxrand, zyi, zyrand = _prepare_bivariate(i, z, permuted_ids, weights_i)
+    zf = zxrand * zyrand
+    return zy[i] * (zf @ weights_i)