From 8446a47e5c4041fad5b49c11f2b5fd293e6d30ff Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Mon, 13 Nov 2023 02:04:59 +0200
Subject: [PATCH 1/2] BUG: Fix FP overflow error in division when the divisor
 is scalar

  The bug occurred when a SIMD partial load was involved: the
  remaining lanes of the dividend vector were filled with ones,
  which raised spurious overflow warnings when the divisor is
  denormal.

  This patch fills the remaining lanes with NaNs rather than
  ones to fix this issue.
---
 .../src/umath/loops_arithm_fp.dispatch.c.src   |  4 +++-
 numpy/_core/tests/test_umath.py                | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
index 30111258d646..c32239dc12df 100644
--- a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -138,8 +138,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_store_@sfx@((@type@*)(dst + vstep), r1);
             }
             for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
-            #if @is_div@ || @is_mul@
+            #if @is_mul@
                 npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+            #elif @is_div@
+                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@);
             #else
                 npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
             #endif
diff --git a/numpy/_core/tests/test_umath.py b/numpy/_core/tests/test_umath.py
index 39a4f7534e50..b57caf07ca0d 100644
--- a/numpy/_core/tests/test_umath.py
+++ b/numpy/_core/tests/test_umath.py
@@ -17,7 +17,7 @@
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
     assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
-    _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL, 
+    _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL,
     IS_PYPY
     )
 from numpy.testing._private.utils import _glibc_older_than
@@ -1818,6 +1818,18 @@ def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape):
         with assert_no_warnings():
             ufunc(array)
 
+    @pytest.mark.parametrize("dtype", ('e', 'f', 'd'))
+    def test_divide_spurious_fpexception(self, dtype):
+        dt = np.dtype(dtype)
+        dt_info = np.finfo(dt)
+        subnorm = dt_info.smallest_subnormal
+        # Regression test: the remaining lanes of the partially loaded
+        # dividend SIMD vector used to be filled with ones, which raised a
+        # spurious overflow warning when the divisor is denormal.
+        # see https://github.com/numpy/numpy/issues/25097
+        with assert_no_warnings():
+            np.zeros(128 + 1, dtype=dt) / subnorm
+
 class TestFPClass:
     @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1,
                                 2, 4, 5, 6, 7, 8, 9, 10])
@@ -4218,7 +4230,7 @@ def test_against_cmath(self):
                 a = complex(func(np.complex128(p)))
                 b = cfunc(p)
                 assert_(
-                    abs(a - b) < atol, 
+                    abs(a - b) < atol,
                     "%s %s: %s; cmath: %s" % (fname, p, a, b)
                 )
 
@@ -4788,7 +4800,7 @@ def test_different_docstring_fails(self):
         # test for attributes (which are C-level defined)
         with assert_raises(RuntimeError):
             ncu.add_docstring(np.ndarray.flat, "different docstring")
-            
+
         # And typical functions:
         def func():
             """docstring"""

From 3ed0e5ad76a2084b7c65a1d97ac9e9628c6490d1 Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Wed, 15 Nov 2023 17:44:54 +0200
Subject: [PATCH 2/2] ENH: Disable SIMD single-precision division optimization
 on armv7

  The decision is based on the lack of native SIMD support for
  this operation on the armhf architecture, and on the difficulty of
  evaluating the performance and accuracy benefits of the emulated
  SIMD intrinsic versus native scalar division.
---
 .../src/umath/loops_arithm_fp.dispatch.c.src  | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
index c32239dc12df..0d0de90125f6 100644
--- a/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -74,7 +74,29 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     #endif
         return;
     }
-#if @VECTOR@
+#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif @VECTOR@
     if (len > npyv_nlanes_@sfx@*2 &&
         !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
         !is_mem_overlap(src1, ssrc1, dst, sdst, len)