From f54454fe1b98fce217fb7352dd6d2a115bb6f82b Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Mon, 13 Nov 2023 02:04:59 +0200 Subject: [PATCH 1/2] BUG: Fix FP overflow error in division when the divisor is scalar The bug occurred when SIMD partial load was involved, due to filling remaining lanes of the dividend vector with ones, which leads to raised overflow warnings when the divisor is denormal. This patch replaces the remaining lanes with nans rather than ones to fix this issue. --- .../src/umath/loops_arithm_fp.dispatch.c.src | 4 +++- numpy/core/tests/test_umath.py | 20 +++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index 30111258d646..c32239dc12df 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -138,8 +138,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_store_@sfx@((@type@*)(dst + vstep), r1); } for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { - #if @is_div@ || @is_mul@ + #if @is_mul@ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + #elif @is_div@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@); #else npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); #endif diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 59c670ffed29..963e740d8dcb 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -17,7 +17,8 @@ assert_, assert_equal, assert_raises, assert_raises_regex, assert_array_equal, assert_almost_equal, assert_array_almost_equal, assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings, - _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL + _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL, + IS_PYPY ) from 
numpy.testing._private.utils import _glibc_older_than @@ -1825,6 +1826,18 @@ def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape): with assert_no_warnings(): ufunc(array) + @pytest.mark.parametrize("dtype", ('e', 'f', 'd')) + def test_divide_spurious_fpexception(self, dtype): + dt = np.dtype(dtype) + dt_info = np.finfo(dt) + subnorm = dt_info.smallest_subnormal + # Verify a bug fix caused due to filling the remaining lanes of the + # partially loaded dividend SIMD vector with ones, which leads to + # raising an overflow warning when the divisor is denormal. + # see https://github.com/numpy/numpy/issues/25097 + with assert_no_warnings(): + np.zeros(128 + 1, dtype=dt) / subnorm + class TestFPClass: @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1, 2, 4, 5, 6, 7, 8, 9, 10]) @@ -4180,7 +4193,10 @@ def test_against_cmath(self): for p in points: a = complex(func(np.complex_(p))) b = cfunc(p) - assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b)) + assert_( + abs(a - b) < atol, + "%s %s: %s; cmath: %s" % (fname, p, a, b) + ) @pytest.mark.xfail( # manylinux2014 uses glibc2.17 From c6e29c8bfd3861dcecc181a6077972e2e1a8d61c Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Wed, 15 Nov 2023 17:44:54 +0200 Subject: [PATCH 2/2] ENH: Disable SIMD single-precision division optimization on armv7 The decision is based on the lack of native SIMD support for this operation on the armhf architecture, and on the associated difficulty of evaluating the performance benefits of the emulated SIMD intrinsic versus native scalar division. 
--- .../src/umath/loops_arithm_fp.dispatch.c.src | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index c32239dc12df..0d0de90125f6 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -74,7 +74,29 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) #endif return; } -#if @VECTOR@ +#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 + /** + * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD + * support for single-precision floating-point division. Only scalar division is + * supported natively, and without hardware for performance and accuracy comparison, + * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus + * native scalar division. + * + * The `npyv_div_f32` universal intrinsic emulates the division operation using an + * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced + * precision. However, this approach has limitations: + * + * - It can cause unexpected floating-point overflows in special cases, such as when + * the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097). + * + * - The precision may vary between the emulated SIMD and scalar division due to + * non-uniform branches (non-contiguous) in the code, leading to precision + * inconsistencies. + * + * - Considering the necessity of multiple Newton-Raphson iterations, the performance + * gain may not sufficiently offset these drawbacks. + */ +#elif @VECTOR@ if (len > npyv_nlanes_@sfx@*2 && !is_mem_overlap(src0, ssrc0, dst, sdst, len) && !is_mem_overlap(src1, ssrc1, dst, sdst, len)