From 8446a47e5c4041fad5b49c11f2b5fd293e6d30ff Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Mon, 13 Nov 2023 02:04:59 +0200 Subject: [PATCH 1/2] BUG: Fix FP overflow error in division when the divisor is scalar The bug occurred when SIMD partial load was involved, due to filling remaining lanes of the dividend vector with ones, which leads to raised overflow warnings when the divisor is denormal. This patch replaces the remaining lanes with nans rather than ones to fix this issue. --- .../src/umath/loops_arithm_fp.dispatch.c.src | 4 +++- numpy/_core/tests/test_umath.py | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src index 30111258d646..c32239dc12df 100644 --- a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src @@ -138,8 +138,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_store_@sfx@((@type@*)(dst + vstep), r1); } for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { - #if @is_div@ || @is_mul@ + #if @is_mul@ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + #elif @is_div@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@); #else npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); #endif diff --git a/numpy/_core/tests/test_umath.py b/numpy/_core/tests/test_umath.py index 39a4f7534e50..b57caf07ca0d 100644 --- a/numpy/_core/tests/test_umath.py +++ b/numpy/_core/tests/test_umath.py @@ -17,7 +17,7 @@ assert_, assert_equal, assert_raises, assert_raises_regex, assert_array_equal, assert_almost_equal, assert_array_almost_equal, assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings, - _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL, + _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL, IS_PYPY ) from 
numpy.testing._private.utils import _glibc_older_than @@ -1818,6 +1818,18 @@ def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape): with assert_no_warnings(): ufunc(array) + @pytest.mark.parametrize("dtype", ('e', 'f', 'd')) + def test_divide_spurious_fpexception(self, dtype): + dt = np.dtype(dtype) + dt_info = np.finfo(dt) + subnorm = dt_info.smallest_subnormal + # Verify a bug fix caused due to filling the remaining lanes of the + # partially loaded dividend SIMD vector with ones, which leads to + # raising an overflow warning when the divisor is denormal. + # see https://github.com/numpy/numpy/issues/25097 + with assert_no_warnings(): + np.zeros(128 + 1, dtype=dt) / subnorm + class TestFPClass: @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1, 2, 4, 5, 6, 7, 8, 9, 10]) @@ -4218,7 +4230,7 @@ def test_against_cmath(self): a = complex(func(np.complex128(p))) b = cfunc(p) assert_( - abs(a - b) < atol, + abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b) ) @@ -4788,7 +4800,7 @@ def test_different_docstring_fails(self): # test for attributes (which are C-level defined) with assert_raises(RuntimeError): ncu.add_docstring(np.ndarray.flat, "different docstring") - + # And typical functions: def func(): """docstring""" From 3ed0e5ad76a2084b7c65a1d97ac9e9628c6490d1 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Wed, 15 Nov 2023 17:44:54 +0200 Subject: [PATCH 2/2] ENH: Disable SIMD single-precision division optimization on armv7 The decision is based on the lack of native SIMD support for this operation on the armhf architecture, and on the resulting difficulty of evaluating the performance benefits of the emulated SIMD intrinsic versus native scalar division. 
--- .../src/umath/loops_arithm_fp.dispatch.c.src | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src index c32239dc12df..0d0de90125f6 100644 --- a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src @@ -74,7 +74,29 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) #endif return; } -#if @VECTOR@ +#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 + /** + * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD + * support for single-precision floating-point division. Only scalar division is + * supported natively, and without hardware for performance and accuracy comparison, + * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus + * native scalar division. + * + * The `npyv_div_f32` universal intrinsic emulates the division operation using an + * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced + * precision. However, this approach has limitations: + * + * - It can cause unexpected floating-point overflows in special cases, such as when + * the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097). + * + * - The precision may vary between the emulated SIMD and scalar division due to + * non-uniform branches (non-contiguous) in the code, leading to precision + * inconsistencies. + * + * - Considering the necessity of multiple Newton-Raphson iterations, the performance + * gain may not sufficiently offset these drawbacks. + */ +#elif @VECTOR@ if (len > npyv_nlanes_@sfx@*2 && !is_mem_overlap(src0, ssrc0, dst, sdst, len) && !is_mem_overlap(src1, ssrc1, dst, sdst, len)