Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 8bf8653

Browse files
committed
REF/TST: split up boxplot function into a calculator (cbook.boxplot_stats) and a drawer (axes.bxp).
Existing function now relies on these two functions.
1 parent 350320f commit 8bf8653

17 files changed

Lines changed: 656 additions & 331 deletions

lib/matplotlib/axes/_axes.py

Lines changed: 199 additions & 223 deletions
Large diffs are not rendered by default.

lib/matplotlib/cbook.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,6 +1832,145 @@ def delete_masked_points(*args):
18321832
return margs
18331833

18341834

1835+
def boxplot_stats(X, whis=1.5, bootstrap=None):
    """
    Return a list of dictionaries of statistics used to draw a series of
    box and whisker plots.

    Users can skip this function and pass a user-defined list of
    dictionaries (with the keys listed under `Returns`) to the
    `axes.bxp` method instead of relying on this function for the
    calculations.

    Parameters
    ----------
    X : array-like
        Data that will be represented in the boxplots. Should have 2 or
        fewer dimensions. A flat sequence or 1-D array is treated as a
        single dataset; a sequence of sequences yields one dataset per
        element; a 2-D array yields one dataset per column, except that
        an array with a single row or a single column is treated as one
        dataset.

    whis : float (default = 1.5)
        Determines the reach of the whiskers past the first and third
        quartiles (e.g., Q3 + whis*IQR). Beyond the whiskers, data are
        considered outliers and are plotted as individual points. Set
        this to an unreasonably high value to force the whiskers to
        show the min and max data. (IQR = interquartile range, Q3-Q1)

    bootstrap : int or None (default)
        Number of times the confidence intervals around the median
        should be bootstrapped (percentile method). If None, a
        Gaussian-based asymptotic approximation is used instead.

    Returns
    -------
    bxpstats : list of dict
        One dictionary per dataset, with the following keys:

        ========   ===================================================
        Key        Value
        ========   ===================================================
        mean       arithmetic mean of the data
        med        50th percentile (median)
        q1         25th percentile
        q3         75th percentile
        iqr        interquartile range, ``q3 - q1``
        cilo       lower bound of the confidence interval around the
                   median
        cihi       upper bound of the confidence interval around the
                   median
        whislo     lowest data value >= ``q1 - whis*iqr`` (or ``q1``)
        whishi     highest data value <= ``q3 + whis*iqr`` (or ``q3``)
        outliers   array of the data values below ``whislo`` or above
                   ``whishi``
        ========   ===================================================

    Raises
    ------
    ValueError
        If `X` has a ``shape`` attribute that is not 1- or 2-dimensional.
    """

    def _bootstrap_median(data, N=5000):
        # Percentile-method bootstrap of the median: resample `data`
        # with replacement N times and use the 2.5th and 97.5th
        # percentiles of the resampled medians as a 95% interval.
        M = len(data)
        estimate = np.empty(N)
        for n in range(N):
            # randint's upper bound is exclusive, so indices span
            # 0..M-1 (this replaces the deprecated random_integers).
            bs_index = np.random.randint(0, M, M)
            estimate[n] = np.percentile(data[bs_index], 50)
        return np.percentile(estimate, [2.5, 97.5])

    def _compute_conf_interval(data, med, iqr, bootstrap):
        if bootstrap is not None:
            # Bootstrap estimate of the notch locations.
            notch_min, notch_max = _bootstrap_median(data, N=bootstrap)
        else:
            # Estimate notch locations using Gaussian-based
            # asymptotic approximation.
            #
            # For discussion: McGill, R., Tukey, J.W.,
            # and Larsen, W.A. (1978) "Variations of
            # Boxplots", The American Statistician, 32:12-16.
            half_width = 1.57 * iqr / np.sqrt(len(data))
            notch_min = med - half_width
            notch_max = med + half_width
        return notch_min, notch_max

    # Convert X to a list of datasets.
    if hasattr(X, 'shape'):
        ndim = len(X.shape)
        if ndim == 1:
            # Either a sequence of array-likes (one dataset each) or a
            # flat array of values; the flat case is re-wrapped below.
            if hasattr(X[0], 'shape'):
                X = list(X)
            else:
                X = [X]
        elif ndim == 2:
            nrows, ncols = X.shape
            if nrows == 1 or ncols == 1:
                # A single row or column is one dataset; it is
                # flattened to 1-D inside the loop below.
                X = [X]
            else:
                X = [X[:, i] for i in range(ncols)]
        else:
            raise ValueError("input `X` must have 2 or fewer dimensions")

    # A flat sequence of scalars is a single dataset.
    if not hasattr(X[0], '__len__'):
        X = [X]

    # Output is a list of dicts, one per dataset.
    bxpstats = []
    for x in X:
        # Work on a flat array so that the boolean masks passed to
        # np.compress are 1-D and fancy indexing works even when the
        # dataset was supplied as a plain list or a (1, n) array.
        x = np.ravel(x)
        stats = {}

        # arithmetic mean
        stats['mean'] = np.mean(x)

        # medians and quartiles
        q1, med, q3 = np.percentile(x, [25, 50, 75])
        stats['q1'], stats['med'], stats['q3'] = q1, med, q3

        # interquartile range
        iqr = q3 - q1
        stats['iqr'] = iqr

        # conf. interval around median
        stats['cilo'], stats['cihi'] = _compute_conf_interval(
            x, med, iqr, bootstrap
        )

        # highest non-outlier; fall back to q3 if nothing qualifies
        hival = q3 + whis * iqr
        wiskhi = np.compress(x <= hival, x)
        if len(wiskhi) == 0 or np.max(wiskhi) < q3:
            stats['whishi'] = q3
        else:
            stats['whishi'] = np.max(wiskhi)

        # lowest non-outlier; fall back to q1 if nothing qualifies
        loval = q1 - whis * iqr
        wisklo = np.compress(x >= loval, x)
        if len(wisklo) == 0 or np.min(wisklo) > q1:
            stats['whislo'] = q1
        else:
            stats['whislo'] = np.min(wisklo)

        # compute a single array of outliers (low ones first)
        stats['outliers'] = np.hstack([
            np.compress(x < stats['whislo'], x),
            np.compress(x > stats['whishi'], x)
        ])

        bxpstats.append(stats)

    return bxpstats
1972+
1973+
18351974
# FIXME I don't think this is used anywhere
18361975
def unmasked_index_ranges(mask, compressed=True):
18371976
"""
69 Bytes
Binary file not shown.
-826 Bytes
Loading

0 commit comments

Comments
 (0)