From 762538cf6c00c02d16c29a0c380fd0a91bc9e96a Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 18 Jan 2022 04:31:01 +0200 Subject: [PATCH 1/2] ENH, SIMD: improve argmax/argmin performance for all integers, f32 and f64 data types on all supported architectures via universal intrinsics. --- benchmarks/benchmarks/bench_reduce.py | 14 +- numpy/core/setup.py | 4 +- .../src/multiarray/argfunc.dispatch.c.src | 392 ++++++++++++++++++ numpy/core/src/multiarray/arraytypes.c.src | 127 ++---- .../{arraytypes.h => arraytypes.h.src} | 21 + numpy/core/tests/test_multiarray.py | 155 +++++-- 6 files changed, 599 insertions(+), 114 deletions(-) create mode 100644 numpy/core/src/multiarray/argfunc.dispatch.c.src rename numpy/core/src/multiarray/{arraytypes.h => arraytypes.h.src} (56%) diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py index 81316c492327..ca07bd180c0e 100644 --- a/benchmarks/benchmarks/bench_reduce.py +++ b/benchmarks/benchmarks/bench_reduce.py @@ -73,7 +73,8 @@ def time_max(self, dtype): np.fmax.reduce(self.d) class ArgMax(Benchmark): - params = [np.float32, np.float64, bool] + params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, + np.int64, np.uint64, np.float32, np.float64, bool] param_names = ['dtype'] def setup(self, dtype): @@ -82,6 +83,17 @@ def setup(self, dtype): def time_argmax(self, dtype): np.argmax(self.d) +class ArgMin(Benchmark): + params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, + np.int64, np.uint64, np.float32, np.float64, bool] + param_names = ['dtype'] + + def setup(self, dtype): + self.d = np.ones(200000, dtype=dtype) + + def time_argmin(self, dtype): + np.argmin(self.d) + class SmallReduction(Benchmark): def setup(self): self.d = np.ones(100, dtype=np.float32) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 46b77d4776f4..219bca57cbfa 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -830,7 +830,7 @@ def gl_if_msvc(build_cmd): multiarray_deps = [ join('src', 'multiarray', 'abstractdtypes.h'), join('src', 'multiarray', 'arrayobject.h'), - join('src', 'multiarray', 'arraytypes.h'), + join('src', 'multiarray', 'arraytypes.h.src'), join('src', 'multiarray', 'arrayfunction_override.h'), join('src', 'multiarray', 'array_coercion.h'), join('src', 'multiarray', 'array_method.h'), @@ -892,7 +892,9 @@ def gl_if_msvc(build_cmd): join('src', 'multiarray', 'abstractdtypes.c'), join('src', 'multiarray', 'alloc.c'), join('src', 'multiarray', 'arrayobject.c'), + join('src', 'multiarray', 'arraytypes.h.src'), join('src', 'multiarray', 'arraytypes.c.src'), + join('src', 'multiarray', 'argfunc.dispatch.c.src'), join('src', 'multiarray', 'array_coercion.c'), join('src', 'multiarray', 'array_method.c'), join('src', 'multiarray', 'array_assign_scalar.c'), diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src new file mode 100644 index 000000000000..39222bc9a3ad --- /dev/null +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -0,0 +1,392 @@ +/* -*- c -*- */ +/*@targets + ** $maxopt baseline + ** sse2 sse42 xop avx2 avx512_skx + ** vsx2 + ** neon asimd + **/ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "numpy/npy_math.h" + +#include "arraytypes.h" + +#define MIN(a,b) (((a)<(b))?(a):(b)) + +#if NPY_SIMD +#if NPY_SIMD > 512 || NPY_SIMD < 0 + #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD" + // TODO: add special loop for large SIMD width. 
+ // i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width + // or maybe expand the indices to 32|64-bit vectors(slower). +#endif +/**begin repeat + * #sfx = u8, s8, u16, s16# + * #usfx = u8, u8, u16, u16# + * #bsfx = b8, b8, b16, b16# + * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2# + */ +/**begin repeat1 + * #intrin = cmpgt, cmplt# + * #func = argmax, argmin# + * #op = >, <# + */ +static inline npy_intp +simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) +{ + npyv_lanetype_@sfx@ s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep*4; + npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); + const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); + const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); + const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); + + const npy_intp max_block = @idx_max@*wstep & -wstep; + npy_intp len0 = len & -wstep; + while (i < len0) { + npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); + npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); + npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@(); + + npy_intp n = i + MIN(len0 - i, max_block); + npy_intp ik = i, i2 = 0; + for (; i < n; i += wstep, ++i2) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); + npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); + npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); + npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); + npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); + npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); + npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); + npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); + + npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); + npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); + npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); + acc = npyv_select_@sfx@(m_acc, x_dcba, acc); + acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices); + acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale); + } + // reduce + npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@]; + npyv_store_@sfx@(dacc, acc); + npyv_store_@usfx@(dacc_i, acc_indices); + npyv_store_@usfx@(dacc_s, acc_indices_scale); + + for (int vi = 0; vi < vstep; ++vi) { + if (dacc[vi] @op@ s_acc) { + s_acc = dacc[vi]; + ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + if (s_acc == dacc[vi] && ret_idx > idx) { + ret_idx = idx; + } + } + } + for (; i < len; ++i) { + npyv_lanetype_@sfx@ a = ip[i]; + if (a @op@ s_acc) { + s_acc = a; + ret_idx = i; + } + } + return ret_idx; +} +/**end repeat1**/ +/**end repeat**/ +#endif + +/**begin repeat + * #sfx = u32, s32, u64, s64, f32, f64# + * #usfx = u32, u32, u64, u64, u32, u64# + * #bsfx = b32, b32, b64, b64, b32, b64# + * #is_fp = 0*4, 1*2# + * #is_idx32 = 1*2, 0*2, 1, 0# + * 
#chk_simd = NPY_SIMD*5, NPY_SIMD_F64# + */ +#if @chk_simd@ +/**begin repeat1 + * #intrin = cmpgt, cmplt# + * #func = argmax, argmin# + * #op = >, <# + * #iop = <, ># + */ +static inline npy_intp +simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) +{ + npyv_lanetype_@sfx@ s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep*4; + // loop by a scalar will perform better for small arrays + if (len < wstep) { + goto scalar_loop; + } + npy_intp len0 = len; + // guard against wraparound vector addition for 32-bit indices + // in case of the array length is larger than 16gb +#if @is_idx32@ + if (len0 > NPY_MAX_UINT32) { + len0 = NPY_MAX_UINT32; + } +#endif + // create index for vector indices + npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); + const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); + const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); + const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); + // initialize vector accumulator for highest values and its indexes + npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); + npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); + for (npy_intp n = len0 & -wstep; i < n; i += wstep) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); + npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); + npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); + npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); + npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); + npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); + npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); + npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); + + npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); + npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); + npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); + acc = npyv_select_@sfx@(m_acc, x_dcba, acc); + acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices); + + #if @is_fp@ + npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); + npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b); + npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c); + npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d); + npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); + npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); + npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); + if (nnan != ((1LL << vstep) - 1)) { + npy_uint64 nnan_4[4]; + nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); + nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); + nnan_4[2] = npyv_tobits_@bsfx@(nnan_c); + nnan_4[3] = npyv_tobits_@bsfx@(nnan_d); + for (int ni = 0; ni < 4; ++ni) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan_4[ni] >> vi) & 1)) { + return i + ni*vstep + vi; + } + } + } + } + #endif + } + for (npy_intp n = len0 & -vstep; i < n; i += vstep) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc); + acc = npyv_select_@sfx@(m_acc, a, acc); + acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices); + 
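+        // For the float paths, an ordered cmpgt/cmplt comparison against a
+        // NaN lane is always false, so a NaN can never win the selects above;
+        // the @is_fp@ block below restores the scalar semantics (the first
+        // NaN is the answer) by rescanning the loaded vector and returning
+        // the index of the first NaN lane.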
#if @is_fp@ + npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); + npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); + if (nnan != ((1LL << vstep) - 1)) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan >> vi) & 1)) { + return i + vi; + } + } + } + #endif + } + + // reduce + npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; + npyv_store_@usfx@(dacc_i, acc_indices); + npyv_store_@sfx@(dacc, acc); + + s_acc = dacc[0]; + ret_idx = dacc_i[0]; + for (int vi = 1; vi < vstep; ++vi) { + if (dacc[vi] @op@ s_acc) { + s_acc = dacc[vi]; + ret_idx = (npy_intp)dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { + ret_idx = dacc_i[vi]; + } + } +scalar_loop: + for (; i < len; ++i) { + npyv_lanetype_@sfx@ a = ip[i]; + #if @is_fp@ + if (!(a @iop@= s_acc)) { // negated, for correct nan handling + #else + if (a @op@ s_acc) { + #endif + s_acc = a; + ret_idx = i; + #if @is_fp@ + if (npy_isnan(s_acc)) { + // nan encountered, it's maximal + return ret_idx; + } + #endif + } + } + return ret_idx; +} +/**end repeat1**/ +#endif // chk_simd +/**end repeat**/ + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_float, npy_double, npy_longdouble# + * + * #is_fp = 0*10, 1*3# + * #is_unsigned = 1*5, 0*5, 0*3# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigned@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #func = argmax, argmin# + * #op = >, <# + * #iop = <, ># + */ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) +(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ +#if @is_fp@ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } +#endif +#ifdef TO_SIMD_SFX + *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); +#else + @type@ mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + @type@ a = ip[i]; + #if @is_fp@ + if (!(a @iop@= mp)) { // negated, for correct nan handling + #else + if (a @op@ mp) { + #endif + mp = a; + *mindx = i; + #if @is_fp@ + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + #endif + } + } +#endif // TO_SIMD_SFX + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) +(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) + +{ + npy_intp i = 0; +#if NPY_SIMD + const npyv_u8 zero = npyv_zero_u8(); + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + npyv_u8 a = npyv_load_u8(ip + i + vstep*0); + npyv_u8 b = npyv_load_u8(ip + i + vstep*1); + npyv_u8 c = npyv_load_u8(ip + i + vstep*2); + npyv_u8 d = npyv_load_u8(ip + i + vstep*3); + npyv_b8 m_a = npyv_cmpeq_u8(a, zero); + npyv_b8 m_b = npyv_cmpeq_u8(b, zero); + npyv_b8 
m_c = npyv_cmpeq_u8(c, zero); + npyv_b8 m_d = npyv_cmpeq_u8(d, zero); + npyv_b8 m_ab = npyv_and_b8(m_a, m_b); + npyv_b8 m_cd = npyv_and_b8(m_c, m_d); + npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd)); + #if NPY_SIMD == 512 + if (m != NPY_MAX_UINT64) { + #else + if ((npy_int64)m != ((1LL << vstep) - 1)) { + #endif + break; + } + } +#endif // NPY_SIMD + for (; i < len; ++i) { + if (ip[i]) { + *mindx = i; + return 0; + } + } + *mindx = 0; + return 0; +} diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 71401c60e8d0..1dc6c9bb1feb 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -27,12 +27,6 @@ #include "arrayobject.h" #include "alloc.h" #include "typeinfo.h" -#if defined(__ARM_NEON__) || defined (__ARM_NEON) -#include -#endif -#ifdef NPY_HAVE_SSE2_INTRINSICS -#include -#endif #include "npy_longdouble.h" #include "numpyos.h" @@ -42,7 +36,7 @@ #include "npy_cblas.h" #include "npy_buffer.h" - +#include "arraytypes.h" /* * Define a stack allocated dummy array with only the minimum information set: * 1. The descr, the main field interesting here. @@ -3176,77 +3170,21 @@ finish: ** ARGFUNC ** ***************************************************************************** */ -#if defined(__ARM_NEON__) || defined (__ARM_NEON) - int32_t _mm_movemask_epi8_neon(uint8x16_t input) - { - int8x8_t m0 = vcreate_s8(0x0706050403020100ULL); - uint8x16_t v0 = vshlq_u8(vshrq_n_u8(input, 7), vcombine_s8(m0, m0)); - uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); - return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); - } -#endif -#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b)) -static int -BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind, - PyArrayObject *NPY_UNUSED(aip)) - -{ - npy_intp i = 0; - /* memcmp like logical_and on i386 is maybe slower for small arrays */ -#ifdef NPY_HAVE_SSE2_INTRINSICS - const __m128i zero = _mm_setzero_si128(); - for (; i < n - (n % 32); i+=32) { - __m128i d1 = _mm_loadu_si128((__m128i*)&ip[i]); - __m128i d2 = _mm_loadu_si128((__m128i*)&ip[i + 16]); - d1 = _mm_cmpeq_epi8(d1, zero); - d2 = _mm_cmpeq_epi8(d2, zero); - if (_mm_movemask_epi8(_mm_min_epu8(d1, d2)) != 0xFFFF) { - break; - } - } -#else - #if defined(__ARM_NEON__) || defined (__ARM_NEON) - uint8x16_t zero = vdupq_n_u8(0); - for(; i < n - (n % 32); i+=32) { - uint8x16_t d1 = vld1q_u8((uint8_t *)&ip[i]); - uint8x16_t d2 = vld1q_u8((uint8_t *)&ip[i + 16]); - d1 = vceqq_u8(d1, zero); - d2 = vceqq_u8(d2, zero); - if(_mm_movemask_epi8_neon(vminq_u8(d1, d2)) != 0xFFFF) { - break; - } - } - #endif -#endif - for (; i < n; i++) { - if (ip[i]) { - *max_ind = i; - return 0; - } - } - *max_ind = 0; - return 0; -} +#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b)) /**begin repeat * - * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, + * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE, * DATETIME, TIMEDELTA# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble, + * #type = npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# - * #isfloat = 0*10, 1*7, 0*2# - * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2# - * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8# - * #iscomplex = 
0*14, 1*3, 0*2# - * #incr = ip++*14, ip+=2*3, ip++*2# - * #isdatetime = 0*17, 1*2# + * #isfloat = 1*4, 0*2# + * #isnan = npy_half_isnan, npy_isnan*3, nop*2# + * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5# + * #iscomplex = 0, 1*3, 0*2# + * #incr = ip++, ip+=2*3, ip++*2# + * #isdatetime = 0*4, 1*2# */ static int @fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind, @@ -3337,22 +3275,16 @@ BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind, /**begin repeat * - * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, + * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE, * DATETIME, TIMEDELTA# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble, + * #type = npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# - * #isfloat = 0*10, 1*7, 0*2# - * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2# - * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8# - * #iscomplex = 0*14, 1*3, 0*2# - * #incr = ip++*14, ip+=2*3, ip++*2# - * #isdatetime = 0*17, 1*2# + * #isfloat = 1*4, 0*2# + * #isnan = npy_half_isnan, npy_isnan*3, nop*2# + * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5# + * #iscomplex = 0, 1*3, 0*2# + * #incr = ip++, ip+=2*3, ip++*2# + * #isdatetime = 0*4, 1*2# */ static int @fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind, @@ -3409,7 +3341,7 @@ static int *min_ind = i; break; } -#endif +#endif if (!@le@(mp, *ip)) { /* negated, for correct nan handling */ mp = *ip; *min_ind = i; @@ -4494,6 +4426,27 @@ set_typeinfo(PyObject *dict) PyArray_Descr *dtype; PyObject *cobj, *key; + // SIMD runtime dispatching + #ifndef NPY_DISABLE_OPTIMIZATION + #include "argfunc.dispatch.h" + #endif + /**begin repeat + * #FROM = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #NAME = Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong, + * Float, Double, LongDouble# + */ + /**begin repeat1 + * #func = argmax, argmin# + */ + NPY_CPU_DISPATCH_CALL_XB(_Py@NAME@_ArrFuncs.@func@ = (PyArray_ArgFunc*)@FROM@_@func@); + /**end repeat1**/ + /**end repeat**/ + NPY_CPU_DISPATCH_CALL_XB(_PyBool_ArrFuncs.argmax = (PyArray_ArgFunc*)BOOL_argmax); + /* * Override the base class for all types, eventually all of this logic * should be defined on the class and inherited to the scalar. 
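Note on the scalar loops above: the comparison is written negated, `!(@le@(mp, *ip))`, because an ordered comparison with a NaN operand is always false, so a NaN both wins immediately and ends the search. A minimal plain-C sketch of the semantics every argmax kernel in this patch must preserve — the first occurrence of the extreme value wins, and the first NaN beats everything after it; `argmax_f64_ref` is a hypothetical reference name, not part of the patch:

    #include <stddef.h>
    #include <math.h>

    /* reference semantics only; the dispatched kernels must agree with this */
    static size_t
    argmax_f64_ref(const double *ip, size_t n)
    {
        double mp = ip[0];
        size_t idx = 0;
        if (isnan(mp)) {
            return 0;  /* NaN encountered; it's maximal */
        }
        for (size_t i = 1; i < n; ++i) {
            /* negated <=: taken when ip[i] > mp or when ip[i] is NaN */
            if (!(ip[i] <= mp)) {
                mp = ip[i];
                idx = i;
                if (isnan(mp)) {
                    break;  /* nothing after the first NaN can win */
                }
            }
        }
        return idx;
    }

The SIMD kernels arrive at the same result from the opposite direction: a NaN lane can never win a `cmpgt`/`cmplt` select, so the separate `npyv_notnan` scans in argfunc.dispatch.c.src are what restore the first-NaN rule.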
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h.src similarity index 56% rename from numpy/core/src/multiarray/arraytypes.h rename to numpy/core/src/multiarray/arraytypes.h.src index b3a13b297da1..4c7487189b5a 100644 --- a/numpy/core/src/multiarray/arraytypes.h +++ b/numpy/core/src/multiarray/arraytypes.h.src @@ -28,4 +28,25 @@ small_correlate(const char * d_, npy_intp dstride, npy_intp nk, enum NPY_TYPES ktype, char * out_, npy_intp ostride); +#ifndef NPY_DISABLE_OPTIMIZATION + #include "argfunc.dispatch.h" +#endif +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = byte, ubyte, short, ushort, int, uint, + * long, ulong, longlong, ulonglong, + * float, double, longdouble# + */ +/**begin repeat1 + * #func = argmax, argmin# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@, + (npy_@type@ *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)) +/**end repeat1**/ +/**end repeat**/ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax, + (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)) + #endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */ diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 4e23625470b1..137fb02ec838 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -4190,7 +4190,8 @@ class TestArgmaxArgminCommon: sizes = [(), (3,), (3, 2), (2, 3), (3, 3), (2, 3, 4), (4, 3, 2), (1, 2, 3, 4), (2, 3, 4, 1), - (3, 4, 1, 2), (4, 1, 2, 3)] + (3, 4, 1, 2), (4, 1, 2, 3), + (64,), (128,), (256,)] @pytest.mark.parametrize("size, axis", itertools.chain(*[[(size, axis) for axis in list(range(-len(size), len(size))) + [None]] @@ -4304,9 +4305,9 @@ def test_output_shape(self, method): @pytest.mark.parametrize('ndim', [0, 1]) @pytest.mark.parametrize('method', ['argmax', 'argmin']) def test_ret_is_out(self, ndim, method): - a = np.ones((4,) + (3,)*ndim) + a = np.ones((4,) + (256,)*ndim) arg_method = getattr(a, method) - out = np.empty((3,)*ndim, dtype=np.intp) + out = np.empty((256,)*ndim, dtype=np.intp) ret = arg_method(axis=0, out=out) assert ret is out @@ -4357,12 +4358,44 @@ def test_object_with_NULLs(self, method, vals): assert_equal(arg_method(), 1) class TestArgmax: - - nan_arr = [ - ([0, 1, 2, 3, np.nan], 4), - ([0, 1, 2, np.nan, 3], 3), - ([np.nan, 0, 1, 2, 3], 0), - ([np.nan, 0, np.nan, 2, 3], 0), + usg_data = [ + ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 0), + ([3, 3, 3, 3, 2, 2, 2, 2], 0), + ([0, 1, 2, 3, 4, 5, 6, 7], 7), + ([7, 6, 5, 4, 3, 2, 1, 0], 0) + ] + sg_data = usg_data + [ + ([1, 2, 3, 4, -4, -3, -2, -1], 3), + ([1, 2, 3, 4, -1, -2, -3, -4], 3) + ] + darr = [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(usg_data, ( + np.uint8, np.uint16, np.uint32, np.uint64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(sg_data, ( + np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(( + ([0, 1, 2, 3, np.nan], 4), + ([0, 1, 2, np.nan, 3], 3), + ([np.nan, 0, 1, 2, 3], 0), + ([np.nan, 0, np.nan, 2, 3], 0), + # To hit the tail of SIMD multi-level(x4, x1) inner loops + # on varient SIMD widthes + ([1] * (2*5-1) + [np.nan], 2*5-1), + ([1] * (4*5-1) + [np.nan], 4*5-1), + ([1] * (8*5-1) + [np.nan], 8*5-1), + ([1] * (16*5-1) + [np.nan], 16*5-1), + ([1] * (32*5-1) + [np.nan], 32*5-1) 
+ ), ( + np.float32, np.float64 + )) + )] + nan_arr = darr + [ ([0, 1, 2, 3, complex(0, np.nan)], 4), ([0, 1, 2, 3, complex(np.nan, 0)], 4), ([0, 1, 2, complex(np.nan, 0), 3], 3), @@ -4432,28 +4465,80 @@ def test_combinations(self, data): assert_equal(np.argmax(arr), pos, err_msg="%r" % arr) assert_equal(arr[np.argmax(arr)], val, err_msg="%r" % arr) + # add padding to test SIMD loops + rarr = np.repeat(arr, 129) + rpos = pos * 129 + assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr) + + padd = np.repeat(np.min(arr), 513) + rarr = np.concatenate((arr, padd)) + rpos = pos + assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr) + + def test_maximum_signed_integers(self): a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**15 - 1, -2**15], dtype=np.int16) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**31 - 1, -2**31], dtype=np.int32) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**63 - 1, -2**63], dtype=np.int64) assert_equal(np.argmax(a), 1) - + a.repeat(129) + assert_equal(np.argmax(a), 1) class TestArgmin: - - nan_arr = [ - ([0, 1, 2, 3, np.nan], 4), - ([0, 1, 2, np.nan, 3], 3), - ([np.nan, 0, 1, 2, 3], 0), - ([np.nan, 0, np.nan, 2, 3], 0), + usg_data = [ + ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 8), + ([3, 3, 3, 3, 2, 2, 2, 2], 4), + ([0, 1, 2, 3, 4, 5, 6, 7], 0), + ([7, 6, 5, 4, 3, 2, 1, 0], 7) + ] + sg_data = usg_data + [ + ([1, 2, 3, 4, -4, -3, -2, -1], 4), + ([1, 2, 3, 4, -1, -2, -3, -4], 7) + ] + darr = [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(usg_data, ( + np.uint8, np.uint16, np.uint32, np.uint64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(sg_data, ( + np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(( + ([0, 1, 2, 3, np.nan], 4), + ([0, 1, 2, np.nan, 3], 3), + ([np.nan, 0, 1, 2, 3], 0), + ([np.nan, 0, np.nan, 2, 3], 0), + # To hit the tail of SIMD multi-level(x4, x1) inner loops + # on varient SIMD widthes + ([1] * (2*5-1) + [np.nan], 2*5-1), + ([1] * (4*5-1) + [np.nan], 4*5-1), + ([1] * (8*5-1) + [np.nan], 8*5-1), + ([1] * (16*5-1) + [np.nan], 16*5-1), + ([1] * (32*5-1) + [np.nan], 32*5-1) + ), ( + np.float32, np.float64 + )) + )] + nan_arr = darr + [ ([0, 1, 2, 3, complex(0, np.nan)], 4), ([0, 1, 2, 3, complex(np.nan, 0)], 4), ([0, 1, 2, complex(np.nan, 0), 3], 3), @@ -4512,30 +4597,50 @@ class TestArgmin: ([False, True, False, True, True], 0), ] - def test_combinations(self): - for arr, pos in self.nan_arr: - with suppress_warnings() as sup: - sup.filter(RuntimeWarning, - "invalid value encountered in reduce") - min_val = np.min(arr) + @pytest.mark.parametrize('data', nan_arr) + def test_combinations(self, data): + arr, pos = data + with suppress_warnings() as sup: + sup.filter(RuntimeWarning, + "invalid value encountered in reduce") + min_val = np.min(arr) + + assert_equal(np.argmin(arr), pos, err_msg="%r" % arr) + assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr) - assert_equal(np.argmin(arr), pos, err_msg="%r" % arr) - assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr) + # add padding to test SIMD loops + rarr = np.repeat(arr, 129) + rpos 
= pos * 129 + assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr) + + padd = np.repeat(np.max(arr), 513) + rarr = np.concatenate((arr, padd)) + rpos = pos + assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr) def test_minimum_signed_integers(self): a = np.array([1, -2**7, -2**7 + 1, 2**7 - 1], dtype=np.int8) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**15, -2**15 + 1, 2**15 - 1], dtype=np.int16) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**31, -2**31 + 1, 2**31 - 1], dtype=np.int32) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**63, -2**63 + 1, 2**63 - 1], dtype=np.int64) assert_equal(np.argmin(a), 1) - + a.repeat(129) + assert_equal(np.argmin(a), 1) class TestMinMax: From 52787cc75a462466cd3e41ec12c4776eff4c98de Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 18 Jan 2022 06:53:30 +0200 Subject: [PATCH 2/2] add npyv_cleanup() to avoid the AVX-SSE transition penalty. --- numpy/core/src/multiarray/argfunc.dispatch.c.src | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index 39222bc9a3ad..cbfaebdb4796 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -323,6 +323,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) #endif #ifdef TO_SIMD_SFX *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); + npyv_cleanup(); #else @type@ mp = *ip; *mindx = 0; @@ -380,6 +381,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) break; } } + npyv_cleanup(); #endif // NPY_SIMD for (; i < len; ++i) { if (ip[i]) {
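Note on PATCH 2/2: on x86, 256-bit AVX instructions leave the upper halves of the YMM registers live, and legacy-SSE encodings executed afterwards pay a state-transition penalty until those bits are cleared; that matters here because the dispatched kernels return into code that may be compiled for baseline SSE. A rough sketch of what `npyv_cleanup()` boils down to on an AVX build — an illustrative assumption with a hypothetical name, not the literal NumPy definition (on pre-AVX and non-x86 targets the call is effectively a no-op):

    #include <immintrin.h>

    static inline void
    simd_cleanup_sketch(void)
    {
    #ifdef __AVX__
        /* zero bits 255:128 of every YMM register so that later
         * legacy-SSE instructions avoid the AVX->SSE transition stall */
        _mm256_zeroupper();
    #endif
    }

Placing the call at each dispatched entry point's exit, as the two hunks above do, costs one cheap instruction per call and spares an SSE-compiled caller the stall.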