Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 18d727e

Browse files
authored
Merge pull request #188 from phobson/add-group-comparisons
Add group comparisons
2 parents a71f374 + c5c793f commit 18d727e

11 files changed

Lines changed: 747 additions & 134 deletions

File tree

.github/workflows/python-runtests-basic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616
strategy:
1717
matrix:
18-
python-version: ["3.9", "3.10", "3.11", "3.12"]
18+
python-version: ["3.10", "3.11", "3.12"]
1919

2020
steps:
2121
- uses: actions/checkout@v2

wqio/datacollections.py

Lines changed: 164 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@
2020
_Stat = namedtuple("_stat", ["stat", "pvalue"])
2121

2222

23-
def _dist_compare(x, y, stat_comp_func):
23+
def _dist_compare(x, y, stat_comp_func, **test_opts):
2424
if (len(x) == len(y)) and numpy.equal(x, y).all():
2525
return _Stat(numpy.nan, numpy.nan)
26-
return stat_comp_func(x, y, alternative="two-sided")
26+
27+
return stat_comp_func(x, y, **test_opts)
2728

2829

2930
class DataCollection:
@@ -130,6 +131,8 @@ def __init__(
130131
**{self.cencol: dataframe[self.qualcol].isin(self.ndval)}
131132
).reset_index()
132133

134+
self.pbarfxn = tqdm if (self.showpbar and tqdm) else utils.misc.no_op
135+
133136
@cache_readonly
134137
def tidy(self):
135138
if self.useros:
@@ -323,7 +326,7 @@ def mean(self):
323326

324327
@cache_readonly
325328
def std_dev(self):
326-
return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False)
329+
return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False, ddof=1)
327330

328331
def percentile(self, percentile):
329332
"""Return the percentiles (0 - 100) for the data."""
@@ -342,7 +345,7 @@ def logmean(self):
342345
@cache_readonly
343346
def logstd_dev(self):
344347
return self.generic_stat(
345-
lambda x, axis=0: numpy.std(numpy.log(x), axis=axis),
348+
lambda x, axis=0: numpy.std(numpy.log(x), axis=axis, ddof=1),
346349
use_bootstrap=False,
347350
statname="Log-std. dev.",
348351
)
@@ -359,65 +362,93 @@ def geostd_dev(self):
359362
geostd.columns.names = ["station", "Geo-std. dev."]
360363
return geostd
361364

362-
@cache_readonly
363-
def shapiro(self):
365+
def shapiro(self, **opts):
366+
"""
367+
Run the Shapiro-Wilk test for normality on the datasets.
368+
369+
Requires at least 3 observations in each dataset.
370+
371+
See `scipy.stats.shapiro` for info on kwargs you can pass.
372+
"""
364373
return self.generic_stat(
365374
stats.shapiro,
366375
use_bootstrap=False,
367376
has_pvalue=True,
368377
statname="shapiro",
369378
filterfxn=lambda x: x.shape[0] > 3,
379+
**opts,
370380
)
371381

372-
@cache_readonly
373-
def shapiro_log(self):
382+
def shapiro_log(self, **opts):
383+
"""
384+
Run the Shapiro-Wilk test for normality on log-transformed datasets.
385+
386+
Requires at least 3 observations in each dataset.
387+
388+
See `scipy.stats.shapiro` for info on kwargs you can pass.
389+
"""
374390
return self.generic_stat(
375391
lambda x: stats.shapiro(numpy.log(x)),
376392
use_bootstrap=False,
377393
has_pvalue=True,
378394
filterfxn=lambda x: x.shape[0] > 3,
379395
statname="log-shapiro",
396+
**opts,
380397
)
381398

382-
@cache_readonly
383-
def lilliefors(self):
399+
def lilliefors(self, **opts):
400+
"""
401+
Run the Lilliefors test for normality on the datasets.
402+
403+
Requires at least 3 observations in each dataset.
404+
405+
See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
406+
"""
384407
return self.generic_stat(
385408
sm.stats.lilliefors,
386409
use_bootstrap=False,
387410
has_pvalue=True,
388411
statname="lilliefors",
412+
**opts,
389413
)
390414

391-
@cache_readonly
392-
def lilliefors_log(self):
415+
def lilliefors_log(self, **opts):
416+
"""
417+
Run the Lilliefors test for normality on the log-transformed datasets.
418+
419+
Requires at least 3 observations in each dataset.
420+
421+
See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
422+
"""
393423
return self.generic_stat(
394424
lambda x: sm.stats.lilliefors(numpy.log(x)),
395425
use_bootstrap=False,
396426
has_pvalue=True,
397427
statname="log-lilliefors",
428+
**opts,
398429
)
399430

400-
@cache_readonly
401-
def anderson_darling(self):
431+
def anderson_darling(self, **opts):
402432
raise NotImplementedError
403433
return self.generic_stat(
404434
utils.anderson_darling,
405435
use_bootstrap=False,
406436
has_pvalue=True,
407437
statname="anderson-darling",
438+
**opts,
408439
)
409440

410-
@cache_readonly
411-
def anderson_darling_log(self):
441+
def anderson_darling_log(self, **opts):
412442
raise NotImplementedError
413443
return self.generic_stat(
414444
lambda x: utils.anderson_darling(numpy.log(x)),
415445
use_bootstrap=False,
416446
has_pvalue=True,
417447
statname="log-anderson-darling",
448+
**opts,
418449
)
419450

420-
def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
451+
def comparison_stat_twoway(self, statfxn, statname=None, paired=False, **statopts):
421452
"""Generic function to apply comparative hypothesis tests to
422453
the groups of the ``DataCollection``.
423454
@@ -430,7 +461,7 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
430461
statname : string, optional
431462
Name of the statistic. Included as a column name in the
432463
final dataframe.
433-
apired : bool, optional
464+
paired : bool, optional
434465
Set to ``True`` if ``statfxn`` requires paired data.
435466
**statopts : optional kwargs
436467
Additional keyword arguments that will be passed to
@@ -455,9 +486,9 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
455486
>>> dc = DataCollection(df, rescol='res', qualcol='qual',
456487
... stationcol='loc', paramcol='param',
457488
... ndval='<')
458-
>>> mwht = dc.comparison_stat(stats.mannwhitneyu,
459-
... statname='mann_whitney',
460-
... alternative='two-sided')
489+
>>> mwht = dc.comparison_stat_twoway(stats.mannwhitneyu,
490+
... statname='mann_whitney',
491+
... alternative='two-sided')
461492
462493
"""
463494

@@ -475,46 +506,134 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
475506
index_cols = meta_columns + station_columns
476507

477508
results = generator(
478-
data, meta_columns, self.stationcol, rescol, statfxn, statname=statname, **statopts
509+
data,
510+
meta_columns,
511+
self.stationcol,
512+
rescol,
513+
statfxn,
514+
statname=statname,
515+
pbarfxn=self.pbarfxn,
516+
**statopts,
479517
)
480518
return pandas.DataFrame.from_records(results).set_index(index_cols)
481519

482-
@cache_readonly
483-
def mann_whitney(self):
484-
return self.comparison_stat(
485-
partial(_dist_compare, stat_comp_func=stats.mannwhitneyu),
520+
def comparison_stat_allway(self, statfxn, statname, control=None, **statopts):
521+
results = utils.numutils._group_comp_stat_generator(
522+
self.tidy,
523+
self.groupcols_comparison,
524+
self.stationcol,
525+
self.rescol,
526+
statfxn,
527+
statname=statname,
528+
control=control,
529+
pbarfxn=self.pbarfxn,
530+
**statopts,
531+
)
532+
return pandas.DataFrame.from_records(results).set_index(self.groupcols_comparison)
533+
534+
def mann_whitney(self, **opts):
535+
"""
536+
Run the Mann-Whitney U test across datasets.
537+
538+
See `scipy.stats.mannwhitneyu` for available options.
539+
"""
540+
return self.comparison_stat_twoway(
541+
partial(_dist_compare, stat_comp_func=stats.mannwhitneyu, **opts),
486542
statname="mann_whitney",
487543
)
488544

489-
@cache_readonly
490-
def ranksums(self):
491-
return self.comparison_stat(stats.ranksums, statname="rank_sums")
545+
def ranksums(self, **opts):
546+
"""
547+
Run the unpaired Wilcoxon rank-sum test across datasets.
492548
493-
@cache_readonly
494-
def t_test(self):
495-
return self.comparison_stat(stats.ttest_ind, statname="t_test", equal_var=False)
549+
See `scipy.stats.ranksums` for available options.
550+
"""
551+
return self.comparison_stat_twoway(stats.ranksums, statname="rank_sums", **opts)
496552

497-
@cache_readonly
498-
def levene(self):
499-
return self.comparison_stat(stats.levene, statname="levene", center="median")
553+
def t_test(self, **opts):
554+
"""
555+
Run the T-test for independent scores.
500556
501-
@cache_readonly
502-
def wilcoxon(self):
503-
return self.comparison_stat(
557+
See `scipy.stats.ttest_ind` for available options.
558+
"""
559+
return self.comparison_stat_twoway(stats.ttest_ind, statname="t_test", **opts)
560+
561+
def levene(self, **opts):
562+
"""
563+
Run the Levene test for equal variances
564+
565+
See `scipy.stats.levene` for available options.
566+
"""
567+
return self.comparison_stat_twoway(stats.levene, statname="levene", **opts)
568+
569+
def wilcoxon(self, **opts):
570+
"""
571+
Run the paired Wilcoxon rank-sum test across paired dataset.
572+
573+
See `scipy.stats.wilcoxon` for available options.
574+
"""
575+
return self.comparison_stat_twoway(
504576
partial(_dist_compare, stat_comp_func=stats.wilcoxon),
505577
statname="wilcoxon",
506578
paired=True,
579+
**opts,
507580
)
508581

509-
@cache_readonly
510-
def kendall(self):
511-
return self.comparison_stat(stats.kendalltau, statname="kendalltau", paired=True)
582+
def kendall(self, **opts):
583+
"""
584+
Run the paired Kendall-tau test across paired dataset.
512585
513-
@cache_readonly
514-
def spearman(self):
515-
return self.comparison_stat(stats.spearmanr, statname="spearmanrho", paired=True)
586+
See `scipy.stats.kendalltau` for available options.
587+
"""
588+
return self.comparison_stat_twoway(
589+
stats.kendalltau, statname="kendalltau", paired=True, **opts
590+
)
591+
592+
def spearman(self, **opts):
593+
"""
594+
Run the paired Spearman-rho test across paired dataset.
595+
596+
See `scipy.stats.spearmanr` for available options.
597+
"""
598+
return self.comparison_stat_twoway(
599+
stats.spearmanr, statname="spearmanrho", paired=True, **opts
600+
)
601+
602+
def kruskal_wallis(self, **opts):
603+
"""
604+
Run the paired Kruskal-Wallis H-test across paired dataset.
605+
606+
See `scipy.stats.kruskal` for available options.
607+
"""
608+
return self.comparison_stat_allway(stats.kruskal, statname="K-W H", control=None, **opts)
609+
610+
def f_test(self, **opts):
611+
"""
612+
One-way ANOVA test across datasets
613+
614+
See `scipy.stats.f_oneway` for available options.
615+
"""
616+
return self.comparison_stat_allway(stats.f_oneway, statname="f-test", control=None, **opts)
617+
618+
def tukey_hsd(self) -> tuple[pandas.DataFrame, pandas.DataFrame]:
619+
"""
620+
Tukey Honestly Significant Difference (HSD) test across stations + other groups
621+
for each parameter
622+
"""
623+
hsd = utils.tukey_hsd(
624+
self.tidy, self.rescol, self.stationcol, self.paramcol, *self.othergroups
625+
)
626+
scores = utils.process_tukey_hsd_scores(hsd, self.stationcol, self.paramcol)
627+
return hsd, scores
628+
629+
def dunn(self):
630+
"""
631+
Dunn test across the different stations for each pollutant
632+
"""
633+
return self.tidy.groupby(by=[self.paramcol]).apply(
634+
lambda g: utils.dunn_test(g, self.rescol, self.stationcol, *self.othergroups).scores
635+
)
516636

517-
@cache_readonly
518637
def theilslopes(self, logs=False):
519638
raise NotImplementedError
520639

wqio/features.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,20 +1599,18 @@ def scatterplot(
15991599
raise ValueError(f"`eqn_pos` must be on of {list.positions.keys()}")
16001600
# annotate axes with stats
16011601

1602+
slope = utils.sig_figs(modelres.params[1], n=3)
1603+
icept = utils.sig_figs(modelres.params[0], n=3)
16021604
ax.annotate(
1603-
r"$\log(y) = {} \, \log(x) + {}$".format(
1604-
utils.sig_figs(modelres.params[1], n=3),
1605-
utils.sig_figs(modelres.params[0], n=3),
1606-
),
1605+
rf"$\log(y) = {slope} \, \log(x) + {icept}$",
16071606
(txt_x, txt_y),
16081607
xycoords="axes fraction",
16091608
)
16101609

1610+
slope_pval = utils.process_p_vals(modelres.pvalues[1])
1611+
icept_pval = utils.process_p_vals(modelres.pvalues[0])
16111612
ax.annotate(
1612-
"Slope p-value: {}\nIntercept p-value: {}".format(
1613-
utils.process_p_vals(modelres.pvalues[1]),
1614-
utils.process_p_vals(modelres.pvalues[0]),
1615-
),
1613+
f"Slope p-value: {slope_pval}\nIntercept p-value: {icept_pval}",
16161614
(txt_x, txt_y - vert_offset),
16171615
xycoords="axes fraction",
16181616
)

wqio/tests/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import warnings
2-
3-
from pkg_resources import resource_filename
2+
from importlib import resources
43

54
from wqio.tests.helpers import requires
65

@@ -12,7 +11,7 @@
1211

1312
@requires(pytest, "pytest")
1413
def test(*args):
15-
options = [resource_filename("wqio", "")]
14+
options = [str(resources.files("wqio"))]
1615
options.extend(list(args))
1716
return pytest.main(options)
1817

wqio/tests/_data/sed.pkl

380 KB
Binary file not shown.

wqio/tests/_data/wq.pkl

2.26 MB
Binary file not shown.

0 commit comments

Comments
 (0)