From 388ee595330e2375a1e4c8187c17de7ea9fb2f6f Mon Sep 17 00:00:00 2001
From: Varun Nayyar
Date: Sat, 15 Aug 2015 03:14:24 +1000
Subject: [PATCH] ENH: Adding in automatic number of bins estimation for
 np.histogram.

Users can now pass in bins='auto' (or 'scott', 'fd', 'rice', 'sturges')
and have the corresponding rule-of-thumb estimator provide a decent
estimate of the optimal number of bins for the given data.
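For example, a minimal sketch of the new interface (the exact bin count
depends on the data and the chosen estimator):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> data = rng.normal(size=1000)
    >>> hist, edges = np.histogram(data, bins='auto')
    >>> hist.size == edges.size - 1
    True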
---
 doc/release/1.11.0-notes.rst          |   4 +-
 numpy/lib/function_base.py            | 171 +++++++++++++++++++++++++-
 numpy/lib/tests/test_function_base.py |  90 ++++++++++++++
 3 files changed, 263 insertions(+), 2 deletions(-)

diff --git a/doc/release/1.11.0-notes.rst b/doc/release/1.11.0-notes.rst
index 7bb0c6fda319..b616799a1185 100644
--- a/doc/release/1.11.0-notes.rst
+++ b/doc/release/1.11.0-notes.rst
@@ -20,7 +20,9 @@ Compatibility notes
 
 New Features
 ============
-
+* `np.histogram` now provides plugin estimators for automatically estimating the optimal
+  number of bins. Passing one of ['auto', 'fd', 'scott', 'rice', 'sturges'] as the
+  argument to 'bins' results in the corresponding estimator being used.
 
 Improvements
 ============
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 19040807b9f1..820168663f22 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -27,6 +27,7 @@
 from numpy.core.multiarray import digitize, bincount, interp as compiled_interp
 from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
 from numpy.compat import long
+from numpy.compat.py3k import basestring
 
 # Force range to be a generator, for np.delete's usage.
 if sys.version_info[0] < 3:
@@ -75,6 +76,91 @@ def iterable(y):
     return 1
 
 
+def _hist_optim_numbins_estimator(a, estimator):
+    """
+    A helper function to be called from histogram to deal with estimating
+    the optimal number of bins.
+
+    estimator : str
+        If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'],
+        this function will choose the appropriate estimator and return its
+        estimate for the optimal number of bins.
+    """
+    assert isinstance(estimator, basestring)
+    # this private function should not be called otherwise
+
+    if a.size == 0:
+        return 1
+
+    def sturges(x):
+        """
+        Sturges Estimator
+        A very simplistic estimator based on the assumption of normality
+        of the data. Performance is poor for non-normal data, which
+        becomes especially obvious for large datasets. Depends only on
+        the size of the data.
+        """
+        return np.ceil(np.log2(x.size)) + 1
+
+    def rice(x):
+        """
+        Rice Estimator
+        Another simple estimator, with no normality assumption. It has
+        better performance for large data, but tends to overestimate the
+        number of bins. The number of bins is proportional to the cube
+        root of the data size (asymptotically optimal). Depends only on
+        the size of the data.
+        """
+        return np.ceil(2 * x.size ** (1.0 / 3))
+
+    def scott(x):
+        """
+        Scott Estimator
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of the data
+        size (asymptotically optimal).
+        """
+        h = 3.5 * x.std() * x.size ** (-1.0 / 3)
+        if h > 0:
+            return np.ceil(x.ptp() / h)
+        return 1
+
+    def fd(x):
+        """
+        Freedman Diaconis rule, using the interquartile range (IQR) for
+        the binwidth. Considered a variation of the Scott rule with more
+        robustness, as the IQR is less affected by outliers than the
+        standard deviation. However, the IQR depends on fewer points than
+        the standard deviation, so it is less accurate, especially for
+        long-tailed distributions.
+
+        If the IQR is 0, we return 1 for the number of bins.
+        The binwidth is inversely proportional to the cube root of the
+        data size (asymptotically optimal).
+        """
+        iqr = np.subtract(*np.percentile(x, [75, 25]))
+
+        if iqr > 0:
+            h = (2 * iqr * x.size ** (-1.0 / 3))
+            return np.ceil(x.ptp() / h)
+
+        # If iqr is 0, default number of bins is 1
+        return 1
+
+    def auto(x):
+        """
+        The FD estimator is usually the most robust method, but its
+        estimate tends to be too small for small datasets. The Sturges
+        estimator is quite good for small (<1000) datasets and is the
+        default in R. This method gives good off-the-shelf behaviour.
+        """
+        return max(fd(x), sturges(x))
+
+    optimal_numbins_methods = {'sturges': sturges, 'rice': rice,
+                               'scott': scott, 'fd': fd, 'auto': auto}
+    try:
+        estimator_func = optimal_numbins_methods[estimator.lower()]
+    except KeyError:
+        raise ValueError(
+            "{0} not a valid method for `bins`".format(estimator))
+    else:
+        # these methods return floats, np.histogram requires an int
+        return int(estimator_func(a))
+
+
 def histogram(a, bins=10, range=None, normed=False, weights=None,
               density=None):
     """
@@ -84,11 +170,37 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     ----------
     a : array_like
         Input data. The histogram is computed over the flattened array.
-    bins : int or sequence of scalars, optional
+    bins : int or sequence of scalars or str, optional
         If `bins` is an int, it defines the number of equal-width
         bins in the given range (10, by default). If `bins` is a
        sequence, it defines the bin edges, including the rightmost
         edge, allowing for non-uniform bin widths.
+
+        .. versionadded:: 1.11.0
+
+        If `bins` is a string from the list below, `histogram` will use
+        the chosen method to calculate the optimal number of bins (see
+        Notes for more detail on the estimators). For visualisation, we
+        suggest using the 'auto' option.
+
+        'auto'
+            Maximum of the 'sturges' and 'fd' estimators. Provides good
+            all-around performance.
+
+        'fd' (Freedman Diaconis Estimator)
+            Robust (resilient to outliers) estimator that takes into
+            account data variability and data size.
+
+        'scott'
+            Less robust estimator that takes into account data
+            variability and data size.
+
+        'rice'
+            Estimator that does not take variability into account, only
+            data size. Commonly overestimates the number of bins required.
+
+        'sturges'
+            R's default method, which only accounts for data size. Only
+            optimal for gaussian data and underestimates the number of
+            bins for large non-gaussian datasets.
+
     range : (float, float), optional
         The lower and upper range of the bins.  If not provided, range
         is simply ``(a.min(), a.max())``.  Values outside the range are
@@ -140,6 +252,48 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     second ``[2, 3)``.  The last bin, however, is ``[3, 4]``, which
     *includes* 4.
 
+    .. versionadded:: 1.11.0
+
+    The methods to estimate the optimal number of bins are well
+    established in the literature, and are inspired by the choices R
+    provides for histogram visualisation. Note that having the number of
+    bins proportional to :math:`n^{1/3}` is asymptotically optimal,
+    which is why it appears in most estimators. These are simply plug-in
+    methods that give good starting points for the number of bins. In
+    the equations below, :math:`h` is the binwidth and :math:`n_h` is
+    the number of bins.
+
+    'Auto' (maximum of the 'Sturges' and 'FD' estimators)
+        A compromise to get a good value. For small datasets the Sturges
+        value will usually be chosen, while larger datasets will usually
+        default to FD.
+        Avoids the overly conservative behaviour of FD and Sturges for
+        small and large datasets respectively. The switchover point is
+        usually ``x.size`` ~ 1000.
+
+    'FD' (Freedman Diaconis Estimator)
+        .. math:: h = 2 \\frac{IQR}{n^{1/3}}
+
+        The binwidth is proportional to the interquartile range (IQR)
+        and inversely proportional to the cube root of ``a.size``. Can
+        be too conservative for small datasets, but is quite good for
+        large datasets. The IQR is very robust to outliers.
+
+    'Scott'
+        .. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
+
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of ``a.size``.
+        Can be too conservative for small datasets, but is quite good
+        for large datasets. The standard deviation is not very robust
+        to outliers. Values are very similar to the Freedman Diaconis
+        estimator in the absence of outliers.
+
+    'Rice'
+        .. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
+
+        The number of bins is only proportional to the cube root of
+        ``a.size``. It tends to overestimate the number of bins and it
+        does not take data variability into account.
+
+    'Sturges'
+        .. math:: n_h = \\left\\lceil \\log_{2}n + 1 \\right\\rceil
+
+        The number of bins is one more than the base-2 log of
+        ``a.size``. This estimator assumes normality of the data and is
+        too conservative for larger, non-normal datasets. It is the
+        default method in R's ``hist``.
+
     Examples
     --------
     >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
@@ -158,6 +312,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     >>> np.sum(hist*np.diff(bin_edges))
     1.0
 
+    .. versionadded:: 1.11.0
+
+    Automated Bin Selection Methods example, using two-peak random data
+    with 2000 points:
+
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.RandomState(10)  # deterministic random data
+    >>> a = np.hstack((rng.normal(size=1000),
+    ...                rng.normal(loc=5, scale=2, size=1000)))
+    >>> plt.hist(a, bins='auto')  # plt.hist passes its arguments to np.histogram
+    >>> plt.title("Histogram with 'auto' bins")
+    >>> plt.show()
     """
 
     a = asarray(a)
@@ -175,6 +339,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         raise AttributeError(
             'max must be larger than min in range parameter.')
 
+    if isinstance(bins, basestring):
+        bins = _hist_optim_numbins_estimator(a, bins)
+        # if `bins` was a string for an automatic method, the above call
+        # replaces it with the calculated number of bins
+
     # Histogram is an integer or a float array depending on the weights.
     if weights is None:
         ntype = np.dtype(np.intp)
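As a quick cross-check of the rules above (a standalone sketch, not part
of the patch, and assuming the patch is applied; the expected counts
follow directly from the Sturges and Rice formulas for n = 1000):

    >>> import numpy as np
    >>> x = np.arange(1000.)
    >>> np.histogram(x, bins='sturges')[0].size  # ceil(log2(1000)) + 1
    11
    >>> np.histogram(x, bins='rice')[0].size  # ceil(2 * 1000 ** (1./3))
    20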
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index b29012bcbcd5..b127f65f618b 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1234,6 +1234,96 @@ def test_empty(self):
         assert_array_equal(b, np.array([0, 1]))
 
 
+class TestHistogramOptimBinNums(TestCase):
+    """
+    Provide test coverage for the provided estimators of the optimal
+    number of bins.
+    """
+
+    def test_empty(self):
+        estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+        # check it can deal with empty data
+        for estimator in estimator_list:
+            a, b = histogram([], bins=estimator)
+            assert_array_equal(a, np.array([0]))
+            assert_array_equal(b, np.array([0, 1]))
+
+    def test_simple(self):
+        """
+        Straightforward testing with a mixture of linspace data (for
+        consistency). All test values have been precomputed and the
+        values shouldn't change.
+        """
+        # some basic sanity checking, with some fixed data:
+        # check for the correct number of bins
+        basic_test = {50:   {'fd': 4,  'scott': 4,  'rice': 8,
+                             'sturges': 7,  'auto': 7},
+                      500:  {'fd': 8,  'scott': 8,  'rice': 16,
+                             'sturges': 10, 'auto': 10},
+                      5000: {'fd': 17, 'scott': 17, 'rice': 35,
+                             'sturges': 14, 'auto': 17}}
+
+        for testlen, expectedResults in basic_test.items():
+            # create some sort of non-uniform data to test with
+            # (two-peak uniform mixture)
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x = np.hstack((x1, x2))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator)
+                assert_equal(len(a), numbins,
+                             err_msg="For the {0} estimator with "
+                                     "datasize of {1}".format(estimator, testlen))
+
+    def test_small(self):
+        """
+        Smaller datasets have the potential to cause issues with the
+        data-adaptive methods, especially the FD method. All bin numbers
+        have been precalculated.
+        """
+        small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1},
+                     2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2},
+                     3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3}}
+
+        for testlen, expectedResults in small_dat.items():
+            testdat = np.arange(testlen)
+            for estimator, expbins in expectedResults.items():
+                a, b = np.histogram(testdat, estimator)
+                assert_equal(len(a), expbins,
+                             err_msg="For the {0} estimator with "
+                                     "datasize of {1}".format(estimator, testlen))
+
+    def test_incorrect_methods(self):
+        """
+        Check that a ValueError is raised when an unknown string is
+        passed in.
+        """
+        check_list = ['mad', 'freeman', 'histograms', 'IQR']
+        for estimator in check_list:
+            assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+    def test_novariance(self):
+        """
+        Check that methods handle data with no variance. Primarily for
+        Scott and FD, as the standard deviation and IQR are both 0 in
+        this case.
+        """
+        novar_dataset = np.ones(100)
+        novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10,
+                            'sturges': 8, 'auto': 8}
+
+        for estimator, numbins in novar_resultdict.items():
+            a, b = np.histogram(novar_dataset, estimator)
+            assert_equal(len(a), numbins,
+                         err_msg="{0} estimator, No Variance test".format(estimator))
+
+    def test_outlier(self):
+        """
+        Check the FD and Scott estimators in the presence of outliers.
+        The FD estimator determines a smaller binwidth, since it is less
+        affected by outliers. Since the range is so (artificially) large,
+        this means more bins, most of which will be empty, but the data
+        of interest is usually unaffected. The Scott estimator is more
+        affected and returns fewer bins, despite most of the variance
+        being in one area of the data.
+        """
+        xcenter = np.linspace(-10, 10, 50)
+        outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+        outlier_resultdict = {'fd': 21, 'scott': 5}
+
+        for estimator, numbins in outlier_resultdict.items():
+            a, b = np.histogram(outlier_dataset, estimator)
+            assert_equal(len(a), numbins)
+
+
 class TestHistogramdd(TestCase):
 
     def test_simple(self):
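For reference, the test_simple fixture for 50 points can be exercised
directly; with the patch applied, the bin counts match the precomputed
table in the test (a sketch of the expected behaviour, not part of the
patch):

    >>> import numpy as np
    >>> x = np.hstack((np.linspace(-10, -1, 20), np.linspace(1, 10, 30)))
    >>> [np.histogram(x, bins=est)[0].size
    ...  for est in ('fd', 'scott', 'rice', 'sturges', 'auto')]
    [4, 4, 8, 7, 7]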