From 50752aa920be32b74c1a7d0e4242e84b15ffa73c Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Mon, 8 Mar 2021 14:17:19 +0530
Subject: [PATCH 1/9] ENH, SIMD: Added integer dispatch

---
 .../src/umath/loops_arithmetic.dispatch.c.src | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 numpy/core/src/umath/loops_arithmetic.dispatch.c.src

diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
new file mode 100644
index 000000000000..0e68f1b7b26e
--- /dev/null
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -0,0 +1,131 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41 avx2 avx512_skx
+ ** vsx2
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+#include<signal.h> 
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+//###############################################################################
+//## Unsigned Integers
+//###############################################################################
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#ifdef NPY_SIMD
+/**begin repeat
+ *  #sfx = u8, u16, u32, u64#
+ */
+
+static void simd_divide_by_scalar_contig_contig_@sfx@
+(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst,
+ int len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        npyv_@sfx@ a = npyv_load_@sfx@(src);
+        npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
+        npyv_store_@sfx@(dst, c);
+    }
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_@sfx@ a = *src;
+        *dst = a / scalar;
+    }
+    npyv_cleanup();
+}
+
+/**end repeat**/
+#endif
+
+
+
+// XXX Need to see what can be done for 64 bits
+/**begin repeat
+ * Unsigned types
+ *  #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ *  #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ *  #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+#if NPY_BITSOF_@SIGNED_TYPE@ <= 8
+    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8
+#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16
+    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16
+#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32
+    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32
+#else
+    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64
+#endif
+static NPY_INLINE int
+run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    BINARY_DEFS
+
+    if (n == 0) {
+        return 1;
+    }
+
+    const @type@ in2 = *(@type@ *)ip2;
+    if (in2 == 0) {
+        npy_set_floatstatus_divbyzero();
+        BINARY_LOOP_SLIDING {
+            *((@type@ *)op1) = 0;
+        }
+        return 1;
+    }
+#if defined NPY_SIMD
+    #ifdef NPY_HAVE_AVX512F
+        const npy_intp vector_size_bytes = 64;
+    #elif defined NPY_HAVE_AVX2
+        const npy_intp vector_size_bytes = 32;
+    #else
+        const npy_intp vector_size_bytes = 16;
+    #endif
+    // XXX Implement other loops
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) {
+        simd_divide_by_scalar_@type@(ip1, in2, op1, n);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/ 
+
+/**begin repeat
+ * Unsigned types
+ *  #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ *  #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 /= *(@type@ *)ip2;
+        }
+        *((@type@ *)iop1) = io1;
+    }
+    else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) {
+        BINARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            if (in2 == 0) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            }
+            *((@type@ *)op1) = in1 / in2;
+        }
+    }
+}
+/**end repeat**/

From 6b2fb9e6a567e24a8940d0c8d78410a310c531a1 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Mon, 8 Mar 2021 14:18:23 +0530
Subject: [PATCH 2/9] ENH, SIMD: Use integer dispatch

---
 numpy/core/src/umath/loops.c.src | 16 ----------------
 numpy/core/src/umath/loops.h.src | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 68e209fe9312..04665dc5296e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1014,22 +1014,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
 }
 
-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else {
-            *((@type@ *)op1)= in1/in2;
-        }
-    }
-}
-
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index b3a19be12d62..0301aa5ed7b8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -53,6 +53,17 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  *****************************************************************************
  */
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithmetic.dispatch.h"
+#endif
+
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
+     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
 /**begin repeat
  * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
@@ -141,7 +152,7 @@ NPY_NO_EXPORT void
 @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-@S@@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

From f2cb33bcf60e72924b46dd652af64d0af8da2508 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Mon, 8 Mar 2021 14:19:18 +0530
Subject: [PATCH 3/9] ENH, SIMD: Add dispatch to build process

---
 numpy/core/code_generators/generate_umath.py | 2 +-
 numpy/core/setup.py                          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index b5305fbfce98..2e5548b6924a 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -325,7 +325,7 @@ def english_upper(s):
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.floor_divide'),
           'PyUFunc_DivisionTypeResolver',
-          TD(intfltcmplx),
+          TD(intfltcmplx, cfunc_alias='divide', dispatch=[('loops_arithmetic', 'BHILQ')]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 8c34a3286d72..df405bcaf487 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -931,6 +931,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
+            join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),

From 453043c10a1d343a0ecc16c9a88bcfd0dfdfd4ce Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Mon, 8 Mar 2021 14:19:40 +0530
Subject: [PATCH 4/9] MAINT, SIMD: Add loops_arithmetic.dispatch.c to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 05df19335be2..736597b6b7af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -215,5 +215,6 @@ numpy/core/src/_simd/_simd_inc.h
 # umath module
 numpy/core/src/umath/loops_unary_fp.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
+numpy/core/src/umath/loops_arithmetic.dispatch.c
 numpy/core/src/umath/loops_trigonometric.dispatch.c
 numpy/core/src/umath/loops_exponent_log.dispatch.c

From 71e84dcd2ec1a59b6426f05b9095a3a2fd51c01d Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Wed, 10 Mar 2021 20:05:34 +0530
Subject: [PATCH 5/9] MAINT: Fixed dispatch in generate_umath

---
 numpy/core/code_generators/generate_umath.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 2e5548b6924a..57c811ff3306 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -245,6 +245,8 @@ def english_upper(s):
 O = 'O'
 P = 'P'
 ints = 'bBhHiIlLqQ'
+sints = 'bhilq'
+uints = 'BHILQ'
 times = 'Mm'
 timedeltaonly = 'm'
 intsO = ints + O
@@ -325,7 +327,9 @@ def english_upper(s):
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.floor_divide'),
           'PyUFunc_DivisionTypeResolver',
-          TD(intfltcmplx, cfunc_alias='divide', dispatch=[('loops_arithmetic', 'BHILQ')]),
+          TD(uints, cfunc_alias='divide',
+              dispatch=[('loops_arithmetic', 'BHILQ')]),
+          TD(sints + flts + cmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),

From bbb143646cbaad2866ed401ca3c795f083285f78 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Sat, 20 Mar 2021 16:22:06 +0530
Subject: [PATCH 6/9] SIMD, MAINT: Refined kernel and inner ufunc functions

---
 .../src/umath/loops_arithmetic.dispatch.c.src | 109 +++++++-----------
 1 file changed, 43 insertions(+), 66 deletions(-)

diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 0e68f1b7b26e..a012d50dd72c 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -1,6 +1,6 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 sse41 avx2 avx512_skx
+ ** sse2 sse41 avx2 avx512f avx512_skx
  ** vsx2
  ** neon
  **/
@@ -12,26 +12,26 @@
 #include "loops_utils.h"
 #include "loops.h"
 #include "lowlevel_strided_loops.h"
-#include<signal.h> 
 // Provides the various *_LOOP macros
 #include "fast_loop_macros.h"
 
 //###############################################################################
-//## Unsigned Integers
+//## Division
 //###############################################################################
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
-#ifdef NPY_SIMD
+#if NPY_SIMD
 /**begin repeat
  *  #sfx = u8, u16, u32, u64#
  */
-
-static void simd_divide_by_scalar_contig_contig_@sfx@
-(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst,
- int len)
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
-    const int vstep = npyv_nlanes_@sfx@;
+    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep            = npyv_nlanes_@sfx@;
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
@@ -39,92 +39,69 @@ static void simd_divide_by_scalar_contig_contig_@sfx@
         npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
         npyv_store_@sfx@(dst, c);
     }
+
     for (; len > 0; --len, ++src, ++dst) {
         const npyv_lanetype_@sfx@ a = *src;
         *dst = a / scalar;
     }
+
     npyv_cleanup();
 }
-
 /**end repeat**/
 #endif
 
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
 
-
-// XXX Need to see what can be done for 64 bits
 /**begin repeat
  * Unsigned types
- *  #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
- *  #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
- *  #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ *  #type  = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ *  #TYPE  = UBYTE,     USHORT,     UINT,     ULONG,     ULONGLONG#
+ *  #STYPE = BYTE,      SHORT,      INT,      LONG,      LONGLONG#
  */
-#if NPY_BITSOF_@SIGNED_TYPE@ <= 8
-    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8
-#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16
-    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16
-#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32
-    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32
-#else
-    #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64
-#endif
-static NPY_INLINE int
-run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    BINARY_DEFS
-
-    if (n == 0) {
-        return 1;
-    }
-
-    const @type@ in2 = *(@type@ *)ip2;
-    if (in2 == 0) {
-        npy_set_floatstatus_divbyzero();
-        BINARY_LOOP_SLIDING {
-            *((@type@ *)op1) = 0;
-        }
-        return 1;
-    }
-#if defined NPY_SIMD
-    #ifdef NPY_HAVE_AVX512F
-        const npy_intp vector_size_bytes = 64;
-    #elif defined NPY_HAVE_AVX2
-        const npy_intp vector_size_bytes = 32;
-    #else
-        const npy_intp vector_size_bytes = 16;
-    #endif
-    // XXX Implement other loops
-    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) {
-        simd_divide_by_scalar_@type@(ip1, in2, op1, n);
-        return 1;
-    }
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@STYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_u@len@
+/**end repeat1**/
 #endif
-    return 0;
-}
-/**end repeat**/ 
 
-/**begin repeat
- * Unsigned types
- *  #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
- *  #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
- */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
         BINARY_REDUCE_LOOP(@type@) {
-            io1 /= *(@type@ *)ip2;
+            const @type@ d = *(@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
         }
         *((@type@ *)iop1) = io1;
     }
-    else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) {
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
         BINARY_LOOP {
             const @type@ in1 = *(@type@ *)ip1;
             const @type@ in2 = *(@type@ *)ip2;
-            if (in2 == 0) {
+            if (NPY_UNLIKELY(in2 == 0)) {
                 npy_set_floatstatus_divbyzero();
                 *((@type@ *)op1) = 0;
+            } else{
+                *((@type@ *)op1) = in1 / in2;
             }
-            *((@type@ *)op1) = in1 / in2;
         }
     }
 }

From c78d9a0bb1429f3c4d56d8687ae54cbbe7158838 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Sat, 20 Mar 2021 17:01:43 +0530
Subject: [PATCH 7/9] TST: Division tests for unsigned ints

---
 numpy/core/tests/test_umath.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2249c866caf5..b31b84d0cc2b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -250,13 +250,22 @@ def test_division_int(self):
         assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
 
     @pytest.mark.parametrize("input_dtype",
-            [np.int8, np.int16, np.int32, np.int64])
+            np.sctypes['int'] + np.sctypes['uint'])
     def test_division_int_boundary(self, input_dtype):
         iinfo = np.iinfo(input_dtype)
 
+        # Unsigned:
+        # Create list with 0, 25th, 50th, 75th percentile and max
+        if iinfo.min == 0:
+            lst = [0, iinfo.max//4, iinfo.max//2,
+                    int(iinfo.max/1.33), iinfo.max]
+            divisors = [iinfo.max//4, iinfo.max//2,
+                    int(iinfo.max/1.33), iinfo.max]
+        # Signed:
         # Create list with min, 25th percentile, 0, 75th percentile, max
-        lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
-        divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
+        else:
+            lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
+            divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
         a = np.array(lst, dtype=input_dtype)
 
         for divisor in divisors:
@@ -926,7 +935,7 @@ def test_log_values(self):
             assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
             assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
-        # See https://github.com/numpy/numpy/issues/18005 
+        # See https://github.com/numpy/numpy/issues/18005
         with assert_no_warnings():
             a = np.array(1e9, dtype='float32')
             np.log(a)

From a2c5af9c4f170cd452645a5d938d93ed24f246fa Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Mon, 22 Mar 2021 20:57:54 +0530
Subject: [PATCH 8/9] BENCH: Benchmarks for unsigned ints (#18075)

---
 benchmarks/benchmarks/bench_ufunc.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 13b7382a1708..b036581e1aae 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -135,18 +135,19 @@ def time_less_than_scalar2(self, dtype):
 
 
 class CustomScalarFloorDivideInt(Benchmark):
-    params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0])
+    params = (np.sctypes['int'] + np.sctypes['uint'], [8, -8, 43, -43])
     param_names = ['dtype', 'divisors']
-    max_value = 10**7
-    min_value = -10**7
 
     def setup(self, dtype, divisor):
+        if dtype in np.sctypes['uint'] and divisor < 0:
+            raise NotImplementedError(
+                    "Skipping test for negative divisor with unsigned type")
+
         iinfo = np.iinfo(dtype)
-        self.x = np.arange(
-                max(iinfo.min, self.min_value),
-                min(iinfo.max, self.max_value), dtype=dtype)
+        self.x = np.random.randint(
+                    iinfo.min, iinfo.max, size=10000, dtype=dtype)
 
-    def time_floor_divide_int(self, dtpye, divisor):
+    def time_floor_divide_int(self, dtype, divisor):
         self.x // divisor
 
 

From 4d2e4847823d3d3c9b7380f8ee7bc1799bd070f9 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Tue, 6 Apr 2021 09:04:53 +0530
Subject: [PATCH 9/9] SIMD: Use scalar division for Armv7, Aarch64, and
 IBM/Power

Co-authored-by: Sayed Adel <seiko@imavr.com>
---
 numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index a012d50dd72c..7e9f464636c5 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -69,7 +69,17 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     #define TO_SIMD_SFX(X) X##_u@len@
 /**end repeat1**/
 #endif
-
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures is expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division;
+ * note that neither the infrastructure nor NPYV supports VSX4 yet.
+ */
+#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {