From a528977da85bdb1c49353df74514e0253eb7f786 Mon Sep 17 00:00:00 2001 From: Robert Kern Date: Sun, 15 May 2016 14:51:32 +0100 Subject: [PATCH 1/2] TST: Failing test for histogram. --- numpy/lib/tests/test_function_base.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index ea10cbc7614d..323ba0bbd82f 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1276,7 +1276,18 @@ def test_finite_range(self): histogram(vals, range=[0.25,0.75]) assert_raises(ValueError, histogram, vals, range=[np.nan,0.75]) assert_raises(ValueError, histogram, vals, range=[0.25,np.inf]) - + + def test_bin_edge_cases(self): + # Ensure that floating-point computations correctly place edge cases. + arr = np.array([337, 404, 739, 806, 1007, 1811, 2012]) + hist, edges = np.histogram(arr, bins=8296, range=(2, 2280)) + mask = hist > 0 + left_edges = edges[:-1][mask] + right_edges = edges[1:][mask] + for x, left, right in zip(arr, left_edges, right_edges): + self.assertGreaterEqual(x, left) + self.assertLess(x, right) + class TestHistogramOptimBinNums(TestCase): """ From 20d5553f9985e95c26c621a8fd1e8c1ef4fd4639 Mon Sep 17 00:00:00 2001 From: Robert Kern Date: Tue, 24 May 2016 11:05:48 +0100 Subject: [PATCH 2/2] ENH: correct initial index estimate in histogram. --- numpy/lib/function_base.py | 21 ++++++++++++++++----- numpy/lib/tests/test_function_base.py | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 44e0d5ce6b55..092697f05fbc 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -565,6 +565,9 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, # Pre-compute histogram scaling factor norm = bins / (mx - mn) + # Compute the bin edges for potential correction. + bin_edges = linspace(mn, mx, bins + 1, endpoint=True) + # We iterate over blocks here for two reasons: the first is that for # large arrays, it is actually faster (for example for a 10^8 array it # is 2x as fast) and it results in a memory footprint 3x lower in the @@ -583,14 +586,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, tmp_a = tmp_a[keep] if tmp_w is not None: tmp_w = tmp_w[keep] - tmp_a = tmp_a.astype(float) - tmp_a -= mn + tmp_a_data = tmp_a.astype(float) + tmp_a = tmp_a_data - mn tmp_a *= norm # Compute the bin indices, and for values that lie exactly on mx we # need to subtract one indices = tmp_a.astype(np.intp) - indices[indices == bins] -= 1 + equals_endpoint = (indices == bins) + indices[equals_endpoint] -= 1 + + # The index computation is not guaranteed to give exactly + # consistent results within ~1 ULP of the bin edges. + decrement = tmp_a_data < bin_edges[indices] + indices[decrement] -= 1 + increment = (tmp_a_data >= bin_edges[indices + 1]) & ~equals_endpoint + indices[increment] += 1 # We now compute the histogram using bincount if ntype.kind == 'c': @@ -599,8 +610,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, else: n += np.bincount(indices, weights=tmp_w, minlength=bins).astype(ntype) - # We now compute the bin edges since these are returned - bins = linspace(mn, mx, bins + 1, endpoint=True) + # Rename the bin edges for return. + bins = bin_edges else: bins = asarray(bins) if (np.diff(bins) < 0).any(): diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 323ba0bbd82f..40769961f6f8 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1285,8 +1285,8 @@ def test_bin_edge_cases(self): left_edges = edges[:-1][mask] right_edges = edges[1:][mask] for x, left, right in zip(arr, left_edges, right_edges): - self.assertGreaterEqual(x, left) - self.assertLess(x, right) + assert_(x >= left) + assert_(x < right) class TestHistogramOptimBinNums(TestCase):