From f54454fe1b98fce217fb7352dd6d2a115bb6f82b Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Mon, 13 Nov 2023 02:04:59 +0200
Subject: [PATCH 1/2] BUG: Fix FP overflow error in division when the divisor
 is scalar

  The bug occurred when SIMD partial load was involved,
  due to filling remaining lanes of the dividend vector
  with ones, which leads to raised overflow warnings
  when the divisor is denormal.

  This patch fills the remaining lanes with NaNs rather than
  ones: a quiet NaN dividend propagates through the division
  without raising any floating-point exception.
---
 .../src/umath/loops_arithm_fp.dispatch.c.src  |  4 +++-
 numpy/core/tests/test_umath.py                | 20 +++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index 30111258d646..c32239dc12df 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -138,8 +138,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_store_@sfx@((@type@*)(dst + vstep), r1);
             }
             for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
-            #if @is_div@ || @is_mul@
+            #if @is_mul@
                 npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+            #elif @is_div@
+                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@);
             #else
                 npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
             #endif
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 59c670ffed29..963e740d8dcb 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -17,7 +17,8 @@
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
     assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
-    _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL
+    _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL,
+    IS_PYPY
     )
 from numpy.testing._private.utils import _glibc_older_than
 
@@ -1825,6 +1826,18 @@ def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape):
         with assert_no_warnings():
             ufunc(array)
 
+    @pytest.mark.parametrize("dtype", ('e', 'f', 'd'))
+    def test_divide_spurious_fpexception(self, dtype):
+        dt = np.dtype(dtype)
+        dt_info = np.finfo(dt)
+        subnorm = dt_info.smallest_subnormal
+        # Regression test: the remaining lanes of a partially loaded
+        # dividend SIMD vector used to be filled with ones, which raised a
+        # spurious overflow warning when the divisor is denormal.
+        # see https://github.com/numpy/numpy/issues/25097
+        with assert_no_warnings():
+            np.zeros(128 + 1, dtype=dt) / subnorm
+
 class TestFPClass:
     @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1,
                                 2, 4, 5, 6, 7, 8, 9, 10])
@@ -4180,7 +4193,10 @@ def test_against_cmath(self):
             for p in points:
                 a = complex(func(np.complex_(p)))
                 b = cfunc(p)
-                assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b))
+                assert_(
+                    abs(a - b) < atol,
+                    "%s %s: %s; cmath: %s" % (fname, p, a, b)
+                )
 
     @pytest.mark.xfail(
         # manylinux2014 uses glibc2.17

From c6e29c8bfd3861dcecc181a6077972e2e1a8d61c Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Wed, 15 Nov 2023 17:44:54 +0200
Subject: [PATCH 2/2] ENH: Disable SIMD single-precision division optimization
 on armv7

  The decision is based on the lack of native SIMD support for
  this operation in the armhf architecture, and on the difficulty of
  evaluating the performance and accuracy benefits of the emulated SIMD
  intrinsic versus native scalar division.
---
 .../src/umath/loops_arithm_fp.dispatch.c.src  | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index c32239dc12df..0d0de90125f6 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -74,7 +74,29 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     #endif
         return;
     }
-#if @VECTOR@
+#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif @VECTOR@
     if (len > npyv_nlanes_@sfx@*2 &&
         !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
         !is_mem_overlap(src1, ssrc1, dst, sdst, len)