Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9d45fdf

Browse files
committed
add Tukey HSD functions and tests
1 parent 4831ddb commit 9d45fdf

5 files changed

Lines changed: 258 additions & 10 deletions

File tree

‎wqio/datacollections.py‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,13 @@ def f_test(self, **opts):
615615
"""
616616
return self.comparison_stat_allway(stats.f_oneway, statname="f-test", control=None, **opts)
617617

618+
def tukey_hsd(self) -> tuple[pandas.DataFrame, pandas.DataFrame]:
    """Run the Tukey HSD comparison on this collection's tidy data.

    The tidy data, result column, station column, parameter column and any
    other grouping columns are forwarded to ``utils.tukey_hsd``; its output
    is then summarized with ``utils.process_tukey_hsd_scores``.

    Returns
    -------
    pairwise : pandas.DataFrame
        The frame returned by ``utils.tukey_hsd``.
    summary : pandas.DataFrame
        The frame returned by ``utils.process_tukey_hsd_scores`` for
        ``pairwise``, keyed on the station and parameter columns.

    """
    pairwise = utils.tukey_hsd(
        self.tidy,
        self.rescol,
        self.stationcol,
        self.paramcol,
        *self.othergroups,
    )
    summary = utils.process_tukey_hsd_scores(
        pairwise,
        self.stationcol,
        self.paramcol,
    )
    return pairwise, summary
624+
618625
def theilslopes(self, logs=False):
619626
raise NotImplementedError
620627

‎wqio/tests/_data/wq.pkl‎

2.26 MB
Binary file not shown.

‎wqio/tests/test_datacollections.py‎

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,26 @@ def test_f_test(dc):
626626
pandas.testing.assert_frame_equal(result, expected)
627627

628628

629+
def test_tukey_hsd_smoke_test(dc):
    """Smoke-check the types and labels of both frames from ``dc.tukey_hsd()``."""
    expected_hsd_index = ["loc 1", "loc 2", "param"]
    expected_hsd_columns = [
        "HSD Stat",
        "p-value",
        "CI-Low",
        "CI-High",
        "is_diff",
        "sign_of_diff",
        "score",
    ]
    expected_score_index = ["param"]
    expected_score_columns = ["Inflow", "Outflow", "Reference"]

    hsd, scores = dc.tukey_hsd()

    # Both outputs must be dataframes before their labels are inspected
    for frame in (hsd, scores):
        assert isinstance(frame, pandas.DataFrame)

    assert hsd.index.names == expected_hsd_index
    assert hsd.columns.tolist() == expected_hsd_columns

    assert scores.index.names == expected_score_index
    assert scores.columns.tolist() == expected_score_columns
647+
648+
629649
@helpers.seed
630650
def test_theilslopes(dc):
631651
with helpers.raises(NotImplementedError):

‎wqio/tests/utils_tests/test_numutils.py‎

Lines changed: 103 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
from collections import namedtuple
33
from io import StringIO
44
from textwrap import dedent
5+
from typing import Any, Literal
56

67
import numpy
78
import numpy.testing as nptest
89
import pandas
910
import pandas.testing as pdtest
1011
import pytest
1112
import statsmodels.api as sm
13+
from numpy._typing._array_like import NDArray
14+
from pandas import DataFrame
1215
from scipy import stats
1316

1417
from wqio.tests import helpers
@@ -92,7 +95,7 @@ def test_process_p_vals(fxn, pval, expected, error_to_raise):
9295
(1.01, (None, None), ValueError),
9396
],
9497
)
95-
def test_translate_p_vals(pval, expected, as_emoji, error_to_raise):
98+
def test_translate_p_vals(pval, expected, as_emoji: bool, error_to_raise):
9699
with helpers.raises(error_to_raise):
97100
result = numutils.translate_p_vals(pval, as_emoji=as_emoji)
98101
assert result == expected[as_emoji]
@@ -125,7 +128,7 @@ def test_anderson_darling():
125128

126129

127130
@pytest.mark.parametrize("which", ["good", "bad"])
128-
def test_processAndersonDarlingResults(which):
131+
def test_processAndersonDarlingResults(which: Literal["good"] | Literal["bad"]):
129132
fieldnames = ["statistic", "critical_values", "significance_level"]
130133
AndersonResult = namedtuple("AndersonResult", fieldnames)
131134
ARs = {
@@ -215,7 +218,7 @@ def units_norm_data():
215218
return raw, expected
216219

217220

218-
def test_normalize_units(units_norm_data):
221+
def test_normalize_units(units_norm_data: tuple[DataFrame, DataFrame]):
219222
unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0}
220223

221224
targetunits = {"Lead, Total": "ug/L", "Cadmium, Total": "mg/L"}
@@ -226,7 +229,7 @@ def test_normalize_units(units_norm_data):
226229
pdtest.assert_frame_equal(result, expected)
227230

228231

229-
def test_normalize_units_bad_targetunits(units_norm_data):
232+
def test_normalize_units_bad_targetunits(units_norm_data: tuple[DataFrame, DataFrame]):
230233
unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0}
231234

232235
targetunits = {"Lead, Total": "ug/L"}
@@ -243,7 +246,7 @@ def test_normalize_units_bad_targetunits(units_norm_data):
243246
)
244247

245248

246-
def test_normalize_units_bad_normalization(units_norm_data):
249+
def test_normalize_units_bad_normalization(units_norm_data: tuple[DataFrame, DataFrame]):
247250
unitsmap = {"mg/L": 1e-3, "g/L": 1e0}
248251

249252
targetunits = {"Lead, Total": "ug/L", "Cadmium, Total": "mg/L"}
@@ -260,7 +263,7 @@ def test_normalize_units_bad_normalization(units_norm_data):
260263
)
261264

262265

263-
def test_normalize_units_bad_conversion(units_norm_data):
266+
def test_normalize_units_bad_conversion(units_norm_data: tuple[DataFrame, DataFrame]):
264267
unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0}
265268

266269
targetunits = {"Lead, Total": "ng/L", "Cadmium, Total": "mg/L"}
@@ -292,7 +295,7 @@ def test_test_pH2concentration(pH, expected, error):
292295

293296
@helpers.seed
294297
@pytest.mark.parametrize("error", [None, ValueError])
295-
def test_compute_theilslope_default(error):
298+
def test_compute_theilslope_default(error: types.NoneType | type[ValueError]):
296299
with helpers.raises(error):
297300
y = helpers.getTestROSData()["res"].values
298301
x = numpy.arange(len(y) - 1) if error else None
@@ -443,7 +446,7 @@ def fit_data():
443446
(None, "junk", ValueError),
444447
],
445448
)
446-
def test_fit_line(fit_data, fitlogs, fitprobs, error):
449+
def test_fit_line(fit_data: dict[str, NDArray[Any]], fitlogs, fitprobs, error):
447450
xy = {
448451
(None, None): (fit_data["zscores"], fit_data["data"]),
449452
("y", None): (fit_data["zscores"], fit_data["data"]),
@@ -483,13 +486,13 @@ def test_fit_line(fit_data, fitlogs, fitprobs, error):
483486
assert isinstance(res, sm.regression.linear_model.RegressionResultsWrapper)
484487

485488

486-
def test_fit_line_through_origin(fit_data):
489+
def test_fit_line_through_origin(fit_data: dict[str, NDArray[Any]]):
487490
x, y = fit_data["zscores"], fit_data["data"]
488491
x_, y_, res = numutils.fit_line(x, y, through_origin=True)
489492
assert res.params[0] == 0
490493

491494

492-
def test_fit_line_with_xhat(fit_data):
495+
def test_fit_line_with_xhat(fit_data: dict[str, NDArray[Any]]):
493496
x, y = fit_data["zscores"], fit_data["data"]
494497
x_, y_, res = numutils.fit_line(x, y, xhat=[-2, -1, 0, 1, 2])
495498
expected = [-0.566018, 4.774419, 10.114857, 15.455295, 20.795733]
@@ -799,3 +802,93 @@ def test_remove_outliers():
799802
x = numpy.random.normal(0, 4, size=37)
800803

801804
assert numutils.remove_outliers(x).shape == expected_shape
805+
806+
807+
def test_tukey_hsd_functions():
    """Check the Tukey HSD score table computed from the pickled ``wq`` data."""
    loc_columns = [f"Loc_{n}" for n in range(7)]

    # One row per chemical; values line up with Loc_0 ... Loc_6
    score_table = [
        ("Copper", (-2.0, 6.0, -2.0, -4.0, 3.0, -1.0, 0.0)),
        ("Di(2-ethylhexyl)phthalate", (3.0, 5.0, -2.0, -2.0, -2.0, -1.0, -1.0)),
        ("Indeno(1,2,3-cd)pyrene", (2.0, 0.0, 6.0, -2.0, -2.0, -4.0, 0.0)),
        ("Lead", (0.0, 6.0, -2.0, -3.0, 4.0, -3.0, -2.0)),
        ("Phenanthrene", (1.0, 0.0, 1.0, -3.0, 0.0, 0.0, 1.0)),
        ("Pyrene", (0.0, -1.0, 3.0, -2.0, -2.0, -2.0, 4.0)),
        ("Total Suspended Solids", (-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 6.0)),
        ("Zinc", (0.0, 1.0, -1.0, -6.0, -1.0, 4.0, 3.0)),
    ]
    expected_records = [
        {"chemical_name": chemical, **dict(zip(loc_columns, values))}
        for chemical, values in score_table
    ]
    expected = pandas.DataFrame(expected_records).set_index("chemical_name")

    wq = pandas.read_pickle(helpers.test_data_path("wq.pkl"))
    hsd = numutils.tukey_hsd(wq, "res", "location", "chemical_name")
    result = numutils.process_tukey_hsd_scores(hsd, "location", "chemical_name")

    pandas.testing.assert_frame_equal(result, expected, check_names=False)

‎wqio/utils/numutils.py‎

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import statsmodels.api as sm
99
from probscale.algo import _estimate_from_fit
1010
from scipy import stats
11+
from scipy.stats._hypotests import TukeyHSDResult
1112

1213
from wqio import validate
1314
from wqio.utils import misc
@@ -786,3 +787,130 @@ def _paired_stat_generator(
786787
stat = statfxn(x, y, **statopts)
787788
row.update({statname: stat[0], "pvalue": stat.pvalue})
788789
yield row
790+
791+
792+
def _tukey_res_to_df(
793+
names: list[str], hsd_res: list[TukeyHSDResult], group_prefix: str
794+
) -> pandas.DataFrame:
795+
"""Converts Scipy's TukeyHSDResult to a dataframe
796+
797+
Parameters
798+
----------
799+
names : list of str
800+
Name of the groups present in the Tukey HSD Results
801+
hsd_res : list of TukeyHSDResult
802+
List of Tukey results to be converted to a dateframe
803+
group_prefix : str (default = "Loc")
804+
Prefix that describes the nature of the groups
805+
806+
Returns
807+
-------
808+
hsd_df : pandas.DataFrame
809+
810+
"""
811+
rows = []
812+
for i, n1 in enumerate(names):
813+
for j, n2 in enumerate(names):
814+
if i != j:
815+
ci_bands = hsd_res.confidence_interval()
816+
row = {
817+
f"{group_prefix} 1": names[n1],
818+
f"{group_prefix} 2": names[n2],
819+
"HSD Stat": hsd_res.statistic[i, j],
820+
"p-value": hsd_res.pvalue[i, j],
821+
"CI-Low": ci_bands.low[i, j],
822+
"CI-High": ci_bands.high[i, j],
823+
}
824+
825+
rows.append(row)
826+
827+
df = pandas.DataFrame(rows).set_index([f"{group_prefix} 1", f"{group_prefix} 2"])
828+
return df
829+
830+
831+
def tukey_hsd(
    df: pandas.DataFrame,
    rescol: str,
    compcol: str,
    paramcol: str,
    *othergroups: str,
) -> pandas.DataFrame:
    """
    Run the Tukey HSD Test on a dataframe based on groupings

    Parameters
    ----------
    df : pandas.DataFrame
    rescol : str
        Name of the column that contains the values of interest
    compcol: str
        Name of the column that defines the groups to be compared
        (i.e., treatment vs control)
    paramcol: str
        Name of the column that contains the measured parameter
    *othergroups : str
        Names of any other columns that need to be considered when
        defining the groups to be considered.

    Returns
    -------
    hsd_df : pandas.DataFrame
        The pairwise Tukey results for every (paramcol, *othergroups)
        combination, stacked row-wise. In addition to the columns produced
        by ``_tukey_res_to_df`` it carries:

        * ``is_diff`` - 1 where the p-value is below 0.05, else 0
        * ``sign_of_diff`` - sign of the HSD statistic (-1, 0, or 1)
        * ``score`` - ``is_diff * sign_of_diff``

        The grouping columns are appended to the index.

    Notes
    -----
    Within each combination, only ``compcol`` groups with more than one
    observation are compared. Combinations left with fewer than two such
    groups cannot be tested and are omitted from the output.

    """
    groupcols = [paramcol, *othergroups]
    scores = []
    for name, group in df.groupby(by=groupcols):
        # pandas < 2.0 yields a bare scalar key when grouping by a one-item
        # list. Zipping that scalar (e.g. a string) against ``groupcols``
        # would pair the column with its first character, so normalize.
        if not isinstance(name, tuple):
            name = (name,)

        samples = {
            loc: subgroup[rescol].values
            for loc, subgroup in group.groupby(compcol)
            if subgroup.shape[0] > 1
        }

        # scipy's Tukey HSD needs at least two samples to compare; skip this
        # combination rather than letting it abort the whole analysis.
        if len(samples) < 2:
            continue

        subset_names = dict(enumerate(samples))
        res = stats.tukey_hsd(*samples.values())
        df_res = _tukey_res_to_df(subset_names, res, group_prefix=compcol)

        keys = dict(zip(groupcols, name))
        scores.append(
            df_res.assign(
                is_diff=lambda d: d["p-value"].lt(0.05).astype(int),
                sign_of_diff=lambda d: numpy.sign(d["HSD Stat"]).astype(int),
                score=lambda d: d["is_diff"] * d["sign_of_diff"],
                **keys,
            ).set_index(groupcols, append=True)
        )

    return pandas.concat(scores, ignore_index=False, axis="index")
879+
880+
881+
def process_tukey_hsd_scores(
    hsd_df: pandas.DataFrame, compcol: str, paramcol: str
) -> pandas.DataFrame:
    """
    Collapses a Tukey HSD results dataframe into one total score per
    group and parameter, describing how each group's magnitude ranks
    relative to the others.

    For every "<compcol> 1" group, the ``score`` values against all
    "<compcol> 2" groups are added up; pairs that are absent count as 0.

    Generally speaking:
      * -7 to -5 -> significantly lower
      * -5 to -3 -> moderately lower
      * -3 to -1 -> slightly lower
      * -1 to +1 -> neutral
      * +1 to +3 -> slightly higher
      * +3 to +5 -> moderately higher
      * +5 to +7 -> significantly higher

    Parameters
    ----------
    hsd_df : pandas.DataFrame
        Dataframe dumped by `tukey_hsd`. It must have a ``score`` column
        and index levels named "<compcol> 1", "<compcol> 2", and
        ``paramcol``.
    compcol : str
        Name of the column that defined the compared groups; used as the
        prefix of the two group index levels.
    paramcol : str
        Name of the index level that holds the measured parameter.

    Returns
    -------
    scores : pandas.DataFrame
        Total scores with the "<compcol> 1" groups spread across the
        columns and the remaining index levels as rows.

    """
    first_level = f"{compcol} 1"
    second_level = f"{compcol} 2"

    def _row_totals(block):
        return block.sum(axis="columns")

    # Wide table of pairwise scores: one column per second group
    pairwise = hsd_df["score"].unstack(level=second_level).fillna(0)

    totals = pairwise.groupby(
        level=paramcol, as_index=False, group_keys=False
    ).apply(_row_totals)

    return totals.unstack(level=first_level)

0 commit comments

Comments
 (0)