Merged
24 commits
9b80c2f
draft significance.py
JosiahParry Feb 14, 2024
5e6b05b
multiply two-sided by 2
JosiahParry Feb 14, 2024
1783ae9
add the two-sided percentile-based test and directed test with array …
ljwolf Feb 15, 2024
65552ca
fix imports
ljwolf Feb 15, 2024
920719c
add folding-based p-value
ljwolf Feb 15, 2024
d9ea095
swap to strict inequality
ljwolf Feb 15, 2024
c6e3a8a
add example and ruff
ljwolf Feb 15, 2024
fa2deaa
update significance directions
ljwolf Feb 23, 2024
1c59fa7
adding one below is sufficient
Mar 6, 2024
26a51c9
update significance implementation for final merge
ljwolf May 1, 2025
d3278d0
just report p-value, keep calculation internal to method
ljwolf May 1, 2025
3ff32b6
move to njit implementation
ljwolf May 1, 2025
b9f6161
move to significance machinery for crand
ljwolf May 1, 2025
3b2ee7d
review by @martinfleis: prep for tests and benchmarking
ljwolf May 2, 2025
708ac80
fix item() extraction in inner permutation loop
ljwolf May 2, 2025
54d5f7d
iterate to calculate the percentages
ljwolf May 2, 2025
7a2dcdf
update significance testing tests
ljwolf May 2, 2025
9d38da3
Merge branch 'main' into calc-sig
ljwolf May 2, 2025
1b53098
Merge branch 'main' of github.com:pysal/esda into calc-sig
ljwolf Sep 10, 2025
9de1b1e
make sure weights types are cast correctly for matmul
ljwolf Sep 10, 2025
1295dfc
add warning suppression when islands result in zero seI
ljwolf Sep 10, 2025
239e737
fix typing, shaping, and iteration issues in significance
ljwolf Sep 10, 2025
d53a905
update notebook, removing warning filter and numba disclaimer
ljwolf Sep 10, 2025
8dc453d
fix shaping and test validity
ljwolf Sep 16, 2025
83 changes: 49 additions & 34 deletions esda/crand.py
@@ -4,25 +4,13 @@

import os
import warnings

from .significance import _permutation_significance
import numpy as np

try:
from numba import boolean, jit, njit, prange
from numba import boolean, njit, prange
except (ImportError, ModuleNotFoundError):

def jit(*dec_args, **dec_kwargs): # noqa: ARG001
"""
decorator mimicking numba.jit
"""

def intercepted_function(f, *f_args, **f_kwargs): # noqa: ARG001
return f

return intercepted_function

njit = jit

from libpysal.common import jit as njit
prange = range
boolean = bool

@@ -35,7 +23,9 @@ def intercepted_function(f, *f_args, **f_kwargs): # noqa: ARG001


@njit(fastmath=True)
def vec_permutations(max_card: int, n: int, k_replications: int, seed: int):
def vec_permutations(
max_card: int, n: int, k_replications: int, seed: int
):
"""
Generate `max_card` permuted IDs, sampled from `n` without replacement,
`k_replications` times
@@ -75,6 +65,7 @@ def crand(
scaling=None,
seed=None,
island_weight=0,
alternative=None
):
"""
Conduct conditional randomization of a given input using the provided
@@ -147,6 +138,24 @@
f"conditional randomization. Recieved `z` of shape {z.shape}"
)

if alternative is None:
warnings.warn(
"The alternative hypothesis for conditional randomization"
" is changing in the next major release of esda. We recommend"
" setting alternative='two-sided', which will generally"
" double the p-value returned."
" To retain the current behavior, set alternative='directed'."
" We strongly recommend moving to alternative='two-sided'.",
DeprecationWarning,
)
# TODO: replace this with 'two-sided' by next major release
alternative = 'directed'
if alternative not in ("two-sided", "greater", "lesser", "directed", "folded"):
raise ValueError(
f"alternative='{alternative}' provided, but is not"
f" one of the supported options: 'two-sided', 'greater', 'lesser', 'directed', 'folded')"
)

# parallelise over permutations?
if seed is None:
seed = np.random.randint(12345, 12345000)
@@ -163,7 +172,7 @@
adj_matrix.setdiag(0)
adj_matrix.eliminate_zeros()
# extract the weights from a now no-self-weighted adj_matrix
other_weights = adj_matrix.data
other_weights = adj_matrix.data.astype(z.dtype) # cast is forced by @ in numba
# use the non-self weight as the cardinality, since
# this is the set we have to randomize.
# if there is a self-neighbor, we need to *not* shuffle the
@@ -185,7 +194,7 @@
n_jobs = 1

if n_jobs == 1:
larger, rlocals = compute_chunk(
p_sims, rlocals = compute_chunk(
0, # chunk start
z, # chunked z, for serial this is the entire data
z, # all z, for serial this is also the entire data
@@ -198,14 +207,15 @@
keep, # whether or not to keep the local statistics
stat_func,
island_weight,
alternative=alternative
)
else:
if n_jobs == -1:
n_jobs = os.cpu_count()
if n_jobs > len(z):
n_jobs = len(z)
# Parallel implementation
larger, rlocals = parallel_crand(
p_sims, rlocals = parallel_crand(
z,
observed,
cardinalities,
@@ -217,13 +227,10 @@
keep,
stat_func,
island_weight,
alternative=alternative
)

low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
p_sim = (larger + 1.0) / (permutations + 1.0)

return p_sim, rlocals
return p_sims, rlocals


@njit(parallel=False, fastmath=True)
@@ -240,6 +247,7 @@ def compute_chunk(
keep: bool,
stat_func,
island_weight: float,
alternative: str
):
"""
Compute conditional randomisation for a single chunk
@@ -302,11 +310,13 @@
the null of spatial randomness
"""
chunk_n = z_chunk.shape[0]
n = z.shape[0]
larger = np.zeros((chunk_n,), dtype=np.int64)
n_samples = z.shape[0]
p_permutations, k_max_card = permuted_ids.shape
p_sims = np.zeros((chunk_n,), dtype=np.float32)
rlocals = np.empty((chunk_n, permuted_ids.shape[0])) if keep else np.empty((1, 1))

mask = np.ones((n,), dtype=np.int8) == 1

mask = np.ones((n_samples,), dtype=np.int8) == 1
wloc = 0

for i in range(chunk_n):
@@ -323,10 +333,13 @@
wloc += cardinality
mask[chunk_start + i] = False
rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
p_sims[i] = _permutation_significance(
observed[i], rstats, alternative=alternative
).item()
if keep:
rlocals[i] = rstats
larger[i] = np.sum(rstats >= observed[i])
return larger, rlocals

return p_sims, rlocals


#######################################################################
@@ -449,6 +462,7 @@ def parallel_crand(
keep: bool,
stat_func,
island_weight,
alternative: str = 'directed'
):
"""
Conduct conditional randomization in parallel using numba
@@ -516,8 +530,8 @@
starts = np.arange(n_jobs + 1) * chunk_size
# ------------------------------------------------------------------
# Set up output holders
larger = np.zeros((n,), dtype=np.int64)
rlocals = np.empty((n, permuted_ids.shape[0])) if keep else np.empty((1, 1))

# ------------------------------------------------------------------
# Joblib parallel loop by chunks

@@ -536,14 +550,15 @@
with parallel_backend("loky", inner_max_num_threads=1):
worker_out = Parallel(n_jobs=n_jobs)(
delayed(compute_chunk)(
*pars, permuted_ids, scaling, keep, stat_func, island_weight
*pars, permuted_ids, scaling, keep, stat_func, island_weight, alternative
)
for pars in chunks
)
larger, rlocals = zip(*worker_out, strict=True)
larger = np.hstack(larger).squeeze()

p_sims, rlocals = zip(*worker_out)
p_sims = np.hstack(p_sims).squeeze()
rlocals = np.row_stack(rlocals).squeeze()
return larger, rlocals
return p_sims, rlocals


#######################################################################
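As a sanity check on this refactor, the block deleted from `crand()` (the `low_extreme` folding followed by `(larger + 1) / (permutations + 1)`) should compute exactly what the `'directed'` branch of the new significance machinery does. A minimal sketch, not part of the diff, comparing the two on made-up data:

```python
import numpy as np

from esda.significance import calculate_significance

rng = np.random.default_rng(12345)
observed = np.array([2.5, -0.1])    # two observed local statistics
rstats = rng.normal(size=(2, 999))  # 999 conditional permutations each

# the computation removed from crand():
larger = (rstats >= observed[:, None]).sum(axis=1)
low_extreme = (999 - larger) < larger
larger[low_extreme] = 999 - larger[low_extreme]
p_old = (larger + 1.0) / (999 + 1.0)

# the replacement path through esda.significance:
p_new = calculate_significance(observed, rstats, alternative="directed")

assert np.allclose(p_old, np.asarray(p_new).squeeze())
```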
12 changes: 8 additions & 4 deletions esda/moran.py
@@ -217,7 +217,8 @@ def __init__(
self.EI_sim = sim.sum() / permutations
self.seI_sim = np.array(sim).std()
self.VI_sim = self.seI_sim**2
self.z_sim = (self.I - self.EI_sim) / self.seI_sim
with np.errstate(divide='ignore'):
self.z_sim = (self.I - self.EI_sim) / self.seI_sim
if self.z_sim > 0:
self.p_z_sim = stats.norm.sf(self.z_sim)
else:
@@ -539,7 +540,8 @@ def __init__(self, x, y, w, transformation="r", permutations=PERMUTATIONS):
self.EI_sim = sim.sum() / permutations
self.seI_sim = np.array(sim).std()
self.VI_sim = self.seI_sim**2
self.z_sim = (self.I - self.EI_sim) / self.seI_sim
with np.errstate(divide='ignore'):
self.z_sim = (self.I - self.EI_sim) / self.seI_sim
if self.z_sim > 0:
self.p_z_sim = stats.norm.sf(self.z_sim)
else:
@@ -1348,7 +1350,8 @@ def __init__(
self.EI_sim = self.sim.mean(axis=0)
self.seI_sim = self.sim.std(axis=0)
self.VI_sim = self.seI_sim * self.seI_sim
self.z_sim = (self.Is - self.EI_sim) / self.seI_sim
with np.errstate(divide='ignore'):
self.z_sim = (self.Is - self.EI_sim) / self.seI_sim
self.p_z_sim = stats.norm.sf(np.abs(self.z_sim))
else:
self.sim = self.rlisas = None
@@ -1825,7 +1828,8 @@ def __init__(
self.EI_sim = sim.mean(axis=0)
self.seI_sim = sim.std(axis=0)
self.VI_sim = self.seI_sim * self.seI_sim
self.z_sim = (self.Is - self.EI_sim) / self.seI_sim
with np.errstate(divide='ignore'):
self.z_sim = (self.Is - self.EI_sim) / self.seI_sim
self.p_z_sim = stats.norm.sf(np.abs(self.z_sim))

def __calc(self):
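The `np.errstate(divide='ignore')` guards added above matter when an observation has no neighbours: an island yields a zero simulated standard deviation, so the z-score division would otherwise emit a RuntimeWarning. A minimal sketch, not part of the diff, with made-up numbers:

```python
import numpy as np

Is = np.array([0.3, 0.2])       # observed statistics
EI_sim = np.array([0.1, 0.0])   # simulated means
seI_sim = np.array([0.2, 0.0])  # second unit is an island: zero spread

with np.errstate(divide="ignore"):
    z_sim = (Is - EI_sim) / seI_sim  # -> [1.0, inf], no RuntimeWarning raised
```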
109 changes: 109 additions & 0 deletions esda/significance.py
@@ -0,0 +1,109 @@
import numpy as np
import warnings

try:
from numba import njit
except (ImportError, ModuleNotFoundError):
from libpysal.common import jit as njit


def calculate_significance(test_stat, reference_distribution, alternative="two-sided"):
"""
Calculate a pseudo p-value from a reference distribution.

Pseudo-p values are calculated using the formula (M + 1) / (R + 1), where R is the number of simulations
and M is the number of times that the simulated value was equal to, or more extreme than, the observed test statistic.
Comment on lines +14 to +15 (Contributor Author): TODO: describe the adjustment here


Parameters
----------
test_stat: float or numpy.ndarray
The observed test statistic, or a vector of observed test statistics
reference_distribution: numpy.ndarray
A numpy array containing simulated test statistics as a result of conditional permutation.
alternative: string
One of 'two-sided', 'lesser', 'greater', 'folded', or 'directed'. Indicates the alternative hypothesis.
- 'two-sided': the observed test statistic is in either tail of the reference distribution. This is an undirected alternative hypothesis.
- 'folded': the observed test statistic is an extreme value of the reference distribution folded about its mean. This is an undirected alternative hypothesis.
- 'lesser': the observed test statistic is small relative to the reference distribution. This is a directed alternative hypothesis.
- 'greater': the observed test statistic is large relative to the reference distribution. This is a directed alternative hypothesis.
- 'directed': the observed test statistic is in either tail of the reference distribution, but the tail is selected based on the test statistic. This is a directed alternative hypothesis whose direction depends on the data. It is not advised, and is included solely to reproduce past results.

Notes
-----

The directed p-value is half of the two-sided p-value and corresponds to running the
lesser and greater tests, then picking the smaller significance value. This is not advised,
since the resulting p-value will be uniformly too small.
"""
reference_distribution = np.atleast_2d(reference_distribution)
n_samples, p_permutations = reference_distribution.shape
test_stat = np.atleast_2d(test_stat).reshape(n_samples, -1)
if alternative not in (
'folded',
'two-sided',
'greater',
'lesser',
'directed'
):
raise ValueError(
f"alternative='{alternative}' provided, but is not"
f" one of the supported options: 'two-sided', 'greater', 'lesser', 'directed', 'folded')"
)
return _permutation_significance(
test_stat,
reference_distribution,
alternative=alternative
)

@njit(parallel=False, fastmath=False)
def _permutation_significance(test_stat, reference_distribution, alternative='two-sided'):
reference_distribution = np.atleast_2d(reference_distribution)
n_samples, p_permutations = reference_distribution.shape
if alternative == "directed":
larger = (reference_distribution >= test_stat).sum(axis=1)
low_extreme = (p_permutations - larger) < larger
larger[low_extreme] = p_permutations - larger[low_extreme]
p_value = (larger + 1.0) / (p_permutations + 1.0)
elif alternative == "lesser":
p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / (
p_permutations + 1
)
elif alternative == "greater":
p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / (
p_permutations + 1
)
elif alternative == "two-sided":
# find percentile p at which the test statistic sits
# find "synthetic" test statistic at 1-p
# count how many observations are outside of (p, 1-p)
# including the test statistic and its synthetic pair
lows = np.empty(n_samples).astype(reference_distribution.dtype)
highs = np.empty(n_samples).astype(reference_distribution.dtype)
for i in range(n_samples):
percentile_i = (reference_distribution[i] <= test_stat[i]).mean() * 100
p_low = np.minimum(percentile_i, 100-percentile_i)
lows[i] = np.percentile(
reference_distribution[i],
p_low
)
highs[i] = np.percentile(
reference_distribution[i],
100 - p_low
)
n_outside = (reference_distribution <= lows[:,None]).sum(axis=1)
n_outside += (reference_distribution >= highs[:,None]).sum(axis=1)
p_value = (n_outside + 1) / (p_permutations + 1)
elif alternative == "folded":
means = np.empty((n_samples,1)).astype(reference_distribution.dtype)
for i in range(n_samples):
means[i] = reference_distribution[i].mean()
folded_test_stat = np.abs(test_stat - means)
folded_reference_distribution = np.abs(reference_distribution - means)
p_value = ((folded_reference_distribution >= folded_test_stat).sum(axis=1) + 1) / (
p_permutations + 1
)
else:
p_value = np.ones((n_samples, ))*np.nan
return p_value
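
To illustrate the relationship the DeprecationWarning in `crand()` describes, here is a small numerical check, not part of the diff: for a roughly symmetric reference distribution, the 'two-sided' p-value comes out near double the 'directed' one, with 'folded' close behind.

```python
import numpy as np

from esda.significance import calculate_significance

rng = np.random.default_rng(0)
reference = rng.normal(size=(1, 9999))  # one unit, 9999 permutations
stat = np.array([2.0])

for alternative in ("directed", "two-sided", "folded"):
    p = calculate_significance(stat, reference, alternative=alternative)
    print(alternative, float(np.asarray(p).squeeze()))
# expect the two-sided value to be roughly twice the directed one
```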

