2020_Stat = namedtuple ("_stat" , ["stat" , "pvalue" ])
2121
2222
23- def _dist_compare (x , y , stat_comp_func ):
23+ def _dist_compare (x , y , stat_comp_func , ** test_opts ):
2424 if (len (x ) == len (y )) and numpy .equal (x , y ).all ():
2525 return _Stat (numpy .nan , numpy .nan )
26- return stat_comp_func (x , y , alternative = "two-sided" )
26+
27+ return stat_comp_func (x , y , ** test_opts )
2728
2829
2930class DataCollection :
@@ -130,6 +131,8 @@ def __init__(
130131 ** {self .cencol : dataframe [self .qualcol ].isin (self .ndval )}
131132 ).reset_index ()
132133
134+ self .pbarfxn = tqdm if (self .showpbar and tqdm ) else utils .misc .no_op
135+
133136 @cache_readonly
134137 def tidy (self ):
135138 if self .useros :
@@ -323,7 +326,7 @@ def mean(self):
323326
324327 @cache_readonly
325328 def std_dev (self ):
326- return self .generic_stat (numpy .std , statname = "std. dev." , use_bootstrap = False )
329+ return self .generic_stat (numpy .std , statname = "std. dev." , use_bootstrap = False , ddof = 1 )
327330
328331 def percentile (self , percentile ):
329332 """Return the percentiles (0 - 100) for the data."""
@@ -342,7 +345,7 @@ def logmean(self):
342345 @cache_readonly
343346 def logstd_dev (self ):
344347 return self .generic_stat (
345- lambda x , axis = 0 : numpy .std (numpy .log (x ), axis = axis ),
348+ lambda x , axis = 0 : numpy .std (numpy .log (x ), axis = axis , ddof = 1 ),
346349 use_bootstrap = False ,
347350 statname = "Log-std. dev." ,
348351 )
@@ -359,65 +362,93 @@ def geostd_dev(self):
359362 geostd .columns .names = ["station" , "Geo-std. dev." ]
360363 return geostd
361364
362- @cache_readonly
363- def shapiro (self ):
365+ def shapiro (self , ** opts ):
366+ """
367+ Run the Shapiro-Wilk test for normality on the datasets.
368+
369+ Requires more than 3 observations in each dataset.
370+
371+ See `scipy.stats.shapiro` for info on kwargs you can pass.
372+ """
364373 return self .generic_stat (
365374 stats .shapiro ,
366375 use_bootstrap = False ,
367376 has_pvalue = True ,
368377 statname = "shapiro" ,
369378 filterfxn = lambda x : x .shape [0 ] > 3 ,
379+ ** opts ,
370380 )
371381
372- @cache_readonly
373- def shapiro_log (self ):
382+ def shapiro_log (self , ** opts ):
383+ """
384+ Run the Shapiro-Wilk test for normality on log-transformed datasets.
385+
386+ Requires more than 3 observations in each dataset.
387+
388+ See `scipy.stats.shapiro` for info on kwargs you can pass.
389+ """
374390 return self .generic_stat (
375391 lambda x : stats .shapiro (numpy .log (x )),
376392 use_bootstrap = False ,
377393 has_pvalue = True ,
378394 filterfxn = lambda x : x .shape [0 ] > 3 ,
379395 statname = "log-shapiro" ,
396+ ** opts ,
380397 )
381398
382- @cache_readonly
383- def lilliefors (self ):
399+ def lilliefors (self , ** opts ):
400+ """
401+ Run the Lilliefors test for normality on the datasets.
402+
403+ Requires at least 3 observations in each dataset.
404+
405+ See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
406+ """
384407 return self .generic_stat (
385408 sm .stats .lilliefors ,
386409 use_bootstrap = False ,
387410 has_pvalue = True ,
388411 statname = "lilliefors" ,
412+ ** opts ,
389413 )
390414
391- @cache_readonly
392- def lilliefors_log (self ):
415+ def lilliefors_log (self , ** opts ):
416+ """
417+ Run the Lilliefors test for normality on the log-transformed datasets.
418+
419+ Requires at least 3 observations in each dataset.
420+
421+ See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass.
422+ """
393423 return self .generic_stat (
394424 lambda x : sm .stats .lilliefors (numpy .log (x )),
395425 use_bootstrap = False ,
396426 has_pvalue = True ,
397427 statname = "log-lilliefors" ,
428+ ** opts ,
398429 )
399430
400- @cache_readonly
401- def anderson_darling (self ):
431+ def anderson_darling (self , ** opts ):
402432 raise NotImplementedError
403433 return self .generic_stat (
404434 utils .anderson_darling ,
405435 use_bootstrap = False ,
406436 has_pvalue = True ,
407437 statname = "anderson-darling" ,
438+ ** opts ,
408439 )
409440
410- @cache_readonly
411- def anderson_darling_log (self ):
441+ def anderson_darling_log (self , ** opts ):
412442 raise NotImplementedError
413443 return self .generic_stat (
414444 lambda x : utils .anderson_darling (numpy .log (x )),
415445 use_bootstrap = False ,
416446 has_pvalue = True ,
417447 statname = "log-anderson-darling" ,
448+ ** opts ,
418449 )
419450
420- def comparison_stat (self , statfxn , statname = None , paired = False , ** statopts ):
451+ def comparison_stat_twoway (self , statfxn , statname = None , paired = False , ** statopts ):
421452 """Generic function to apply comparative hypothesis tests to
422453 the groups of the ``DataCollection``.
423454
@@ -430,7 +461,7 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
430461 statname : string, optional
431462 Name of the statistic. Included as a column name in the
432463 final dataframe.
433- apired : bool, optional
464+ paired : bool, optional
434465 Set to ``True`` if ``statfxn`` requires paired data.
435466 **statopts : optional kwargs
436467 Additional keyword arguments that will be passed to
@@ -455,9 +486,9 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
455486 >>> dc = DataCollection(df, rescol='res', qualcol='qual',
456487 ... stationcol='loc', paramcol='param',
457488 ... ndval='<')
458- >>> mwht = dc.comparison_stat (stats.mannwhitneyu,
459- ... statname='mann_whitney',
460- ... alternative='two-sided')
489+ >>> mwht = dc.comparison_stat_twoway (stats.mannwhitneyu,
490+ ... statname='mann_whitney',
491+ ... alternative='two-sided')
461492
462493 """
463494
@@ -475,46 +506,134 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts):
475506 index_cols = meta_columns + station_columns
476507
477508 results = generator (
478- data , meta_columns , self .stationcol , rescol , statfxn , statname = statname , ** statopts
509+ data ,
510+ meta_columns ,
511+ self .stationcol ,
512+ rescol ,
513+ statfxn ,
514+ statname = statname ,
515+ pbarfxn = self .pbarfxn ,
516+ ** statopts ,
479517 )
480518 return pandas .DataFrame .from_records (results ).set_index (index_cols )
481519
482- @cache_readonly
483- def mann_whitney (self ):
484- return self .comparison_stat (
485- partial (_dist_compare , stat_comp_func = stats .mannwhitneyu ),
520+ def comparison_stat_allway (self , statfxn , statname , control = None , ** statopts ):
521+ results = utils .numutils ._group_comp_stat_generator (
522+ self .tidy ,
523+ self .groupcols_comparison ,
524+ self .stationcol ,
525+ self .rescol ,
526+ statfxn ,
527+ statname = statname ,
528+ control = control ,
529+ pbarfxn = self .pbarfxn ,
530+ ** statopts ,
531+ )
532+ return pandas .DataFrame .from_records (results ).set_index (self .groupcols_comparison )
533+
534+ def mann_whitney (self , ** opts ):
535+ """
536+ Run the Mann-Whitney U test across datasets.
537+
538+ See `scipy.stats.mannwhitneyu` for available options.
539+ """
540+ return self .comparison_stat_twoway (
541+ partial (_dist_compare , stat_comp_func = stats .mannwhitneyu , ** opts ),
486542 statname = "mann_whitney" ,
487543 )
488544
489- @ cache_readonly
490- def ranksums ( self ):
491- return self . comparison_stat ( stats . ranksums , statname = "rank_sums" )
545+ def ranksums ( self , ** opts ):
546+ """
547+ Run the unpaired Wilcoxon rank-sum test across datasets.
492548
493- @ cache_readonly
494- def t_test ( self ):
495- return self .comparison_stat (stats .ttest_ind , statname = "t_test " , equal_var = False )
549+ See `scipy.stats.ranksums` for available options.
550+ """
551+ return self .comparison_stat_twoway (stats .ranksums , statname = "rank_sums " , ** opts )
496552
497- @ cache_readonly
498- def levene ( self ):
499- return self . comparison_stat ( stats . levene , statname = "levene" , center = "median" )
553+ def t_test ( self , ** opts ):
554+ """
555+ Run the T-test for independent scores.
500556
501- @cache_readonly
502- def wilcoxon (self ):
503- return self .comparison_stat (
557+ See `scipy.stats.ttest_ind` for available options.
558+ """
559+ return self .comparison_stat_twoway (stats .ttest_ind , statname = "t_test" , ** opts )
560+
561+ def levene (self , ** opts ):
562+ """
563+ Run the Levene test for equal variances
564+
565+ See `scipy.stats.levene` for available options.
566+ """
567+ return self .comparison_stat_twoway (stats .levene , statname = "levene" , ** opts )
568+
569+ def wilcoxon (self , ** opts ):
570+ """
571+ Run the paired Wilcoxon signed-rank test across paired datasets.
572+
573+ See `scipy.stats.wilcoxon` for available options.
574+ """
575+ return self .comparison_stat_twoway (
504576 partial (_dist_compare , stat_comp_func = stats .wilcoxon ),
505577 statname = "wilcoxon" ,
506578 paired = True ,
579+ ** opts ,
507580 )
508581
509- @ cache_readonly
510- def kendall ( self ):
511- return self . comparison_stat ( stats . kendalltau , statname = "kendalltau" , paired = True )
582+ def kendall ( self , ** opts ):
583+ """
584+ Run the paired Kendall-tau test across paired datasets.
512585
513- @cache_readonly
514- def spearman (self ):
515- return self .comparison_stat (stats .spearmanr , statname = "spearmanrho" , paired = True )
586+ See `scipy.stats.kendalltau` for available options.
587+ """
588+ return self .comparison_stat_twoway (
589+ stats .kendalltau , statname = "kendalltau" , paired = True , ** opts
590+ )
591+
592+ def spearman (self , ** opts ):
593+ """
594+ Run the paired Spearman-rho test across paired datasets.
595+
596+ See `scipy.stats.spearmanr` for available options.
597+ """
598+ return self .comparison_stat_twoway (
599+ stats .spearmanr , statname = "spearmanrho" , paired = True , ** opts
600+ )
601+
602+ def kruskal_wallis (self , ** opts ):
603+ """
604+ Run the Kruskal-Wallis H-test across datasets.
605+
606+ See `scipy.stats.kruskal` for available options.
607+ """
608+ return self .comparison_stat_allway (stats .kruskal , statname = "K-W H" , control = None , ** opts )
609+
610+ def f_test (self , ** opts ):
611+ """
612+ One-way ANOVA test across datasets
613+
614+ See `scipy.stats.f_oneway` for available options.
615+ """
616+ return self .comparison_stat_allway (stats .f_oneway , statname = "f-test" , control = None , ** opts )
617+
618+ def tukey_hsd (self ) -> tuple [pandas .DataFrame , pandas .DataFrame ]:
619+ """
620+ Tukey Honestly Significant Difference (HSD) test across stations + other groups
621+ for each parameter
622+ """
623+ hsd = utils .tukey_hsd (
624+ self .tidy , self .rescol , self .stationcol , self .paramcol , * self .othergroups
625+ )
626+ scores = utils .process_tukey_hsd_scores (hsd , self .stationcol , self .paramcol )
627+ return hsd , scores
628+
629+ def dunn (self ):
630+ """
631+ Dunn test across the different stations for each pollutant
632+ """
633+ return self .tidy .groupby (by = [self .paramcol ]).apply (
634+ lambda g : utils .dunn_test (g , self .rescol , self .stationcol , * self .othergroups ).scores
635+ )
516636
517- @cache_readonly
518637 def theilslopes (self , logs = False ):
519638 raise NotImplementedError
520639
0 commit comments