From 762538cf6c00c02d16c29a0c380fd0a91bc9e96a Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 18 Jan 2022 04:31:01 +0200 Subject: [PATCH 1/2] ENH, SIMD: improve argmax/argmin performance for all integers, f32 and f64 data types on all supported architectures via universal intrinsics. --- benchmarks/benchmarks/bench_reduce.py | 14 +- numpy/core/setup.py | 4 +- .../src/multiarray/argfunc.dispatch.c.src | 392 ++++++++++++++++++ numpy/core/src/multiarray/arraytypes.c.src | 127 ++---- .../{arraytypes.h => arraytypes.h.src} | 21 + numpy/core/tests/test_multiarray.py | 155 +++++-- 6 files changed, 599 insertions(+), 114 deletions(-) create mode 100644 numpy/core/src/multiarray/argfunc.dispatch.c.src rename numpy/core/src/multiarray/{arraytypes.h => arraytypes.h.src} (56%) diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py index 81316c492327..ca07bd180c0e 100644 --- a/benchmarks/benchmarks/bench_reduce.py +++ b/benchmarks/benchmarks/bench_reduce.py @@ -73,7 +73,8 @@ def time_max(self, dtype): np.fmax.reduce(self.d) class ArgMax(Benchmark): - params = [np.float32, np.float64, bool] + params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, + np.int64, np.uint64, np.float32, np.float64, bool] param_names = ['dtype'] def setup(self, dtype): @@ -82,6 +83,17 @@ def setup(self, dtype): def time_argmax(self, dtype): np.argmax(self.d) +class ArgMin(Benchmark): + params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, + np.int64, np.uint64, np.float32, np.float64, bool] + param_names = ['dtype'] + + def setup(self, dtype): + self.d = np.ones(200000, dtype=dtype) + + def time_argmin(self, dtype): + np.argmin(self.d) + class SmallReduction(Benchmark): def setup(self): self.d = np.ones(100, dtype=np.float32) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 46b77d4776f4..219bca57cbfa 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -830,7 +830,7 @@ def gl_if_msvc(build_cmd): multiarray_deps = [ join('src', 'multiarray', 'abstractdtypes.h'), join('src', 'multiarray', 'arrayobject.h'), - join('src', 'multiarray', 'arraytypes.h'), + join('src', 'multiarray', 'arraytypes.h.src'), join('src', 'multiarray', 'arrayfunction_override.h'), join('src', 'multiarray', 'array_coercion.h'), join('src', 'multiarray', 'array_method.h'), @@ -892,7 +892,9 @@ def gl_if_msvc(build_cmd): join('src', 'multiarray', 'abstractdtypes.c'), join('src', 'multiarray', 'alloc.c'), join('src', 'multiarray', 'arrayobject.c'), + join('src', 'multiarray', 'arraytypes.h.src'), join('src', 'multiarray', 'arraytypes.c.src'), + join('src', 'multiarray', 'argfunc.dispatch.c.src'), join('src', 'multiarray', 'array_coercion.c'), join('src', 'multiarray', 'array_method.c'), join('src', 'multiarray', 'array_assign_scalar.c'), diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src new file mode 100644 index 000000000000..39222bc9a3ad --- /dev/null +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -0,0 +1,392 @@ +/* -*- c -*- */ +/*@targets + ** $maxopt baseline + ** sse2 sse42 xop avx2 avx512_skx + ** vsx2 + ** neon asimd + **/ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "numpy/npy_math.h" + +#include "arraytypes.h" + +#define MIN(a,b) (((a)<(b))?(a):(b)) + +#if NPY_SIMD +#if NPY_SIMD > 512 || NPY_SIMD < 0 + #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD" + // TODO: add special loop for large SIMD width. 
+ // i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width + // or maybe expand the indices to 32|64-bit vectors(slower). +#endif +/**begin repeat + * #sfx = u8, s8, u16, s16# + * #usfx = u8, u8, u16, u16# + * #bsfx = b8, b8, b16, b16# + * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2# + */ +/**begin repeat1 + * #intrin = cmpgt, cmplt# + * #func = argmax, argmin# + * #op = >, <# + */ +static inline npy_intp +simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) +{ + npyv_lanetype_@sfx@ s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep*4; + npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); + const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); + const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); + const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); + + const npy_intp max_block = @idx_max@*wstep & -wstep; + npy_intp len0 = len & -wstep; + while (i < len0) { + npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); + npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); + npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@(); + + npy_intp n = i + MIN(len0 - i, max_block); + npy_intp ik = i, i2 = 0; + for (; i < n; i += wstep, ++i2) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); + npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); + npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); + npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); + npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); + npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); + npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); + npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); + + npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); + npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); + npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); + acc = npyv_select_@sfx@(m_acc, x_dcba, acc); + acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices); + acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale); + } + // reduce + npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@]; + npyv_store_@sfx@(dacc, acc); + npyv_store_@usfx@(dacc_i, acc_indices); + npyv_store_@usfx@(dacc_s, acc_indices_scale); + + for (int vi = 0; vi < vstep; ++vi) { + if (dacc[vi] @op@ s_acc) { + s_acc = dacc[vi]; + ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + if (s_acc == dacc[vi] && ret_idx > idx) { + ret_idx = idx; + } + } + } + for (; i < len; ++i) { + npyv_lanetype_@sfx@ a = ip[i]; + if (a @op@ s_acc) { + s_acc = a; + ret_idx = i; + } + } + return ret_idx; +} +/**end repeat1**/ +/**end repeat**/ +#endif + +/**begin repeat + * #sfx = u32, s32, u64, s64, f32, f64# + * #usfx = u32, u32, u64, u64, u32, u64# + * #bsfx = b32, b32, b64, b64, b32, b64# + * #is_fp = 0*4, 1*2# + * #is_idx32 = 1*2, 0*2, 1, 0# + * 
#chk_simd = NPY_SIMD*5, NPY_SIMD_F64# + */ +#if @chk_simd@ +/**begin repeat1 + * #intrin = cmpgt, cmplt# + * #func = argmax, argmin# + * #op = >, <# + * #iop = <, ># + */ +static inline npy_intp +simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) +{ + npyv_lanetype_@sfx@ s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep*4; + // loop by a scalar will perform better for small arrays + if (len < wstep) { + goto scalar_loop; + } + npy_intp len0 = len; + // guard against wraparound vector addition for 32-bit indices + // in case of the array length is larger than 16gb +#if @is_idx32@ + if (len0 > NPY_MAX_UINT32) { + len0 = NPY_MAX_UINT32; + } +#endif + // create index for vector indices + npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); + const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); + const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); + const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); + // initialize vector accumulator for highest values and its indexes + npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); + npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); + for (npy_intp n = len0 & -wstep; i < n; i += wstep) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); + npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); + npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); + npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); + npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); + npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); + npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); + npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); + + npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); + npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); + npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); + acc = npyv_select_@sfx@(m_acc, x_dcba, acc); + acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices); + + #if @is_fp@ + npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); + npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b); + npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c); + npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d); + npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); + npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); + npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); + if (nnan != ((1LL << vstep) - 1)) { + npy_uint64 nnan_4[4]; + nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); + nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); + nnan_4[2] = npyv_tobits_@bsfx@(nnan_c); + nnan_4[3] = npyv_tobits_@bsfx@(nnan_d); + for (int ni = 0; ni < 4; ++ni) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan_4[ni] >> vi) & 1)) { + return i + ni*vstep + vi; + } + } + } + } + #endif + } + for (npy_intp n = len0 & -vstep; i < n; i += vstep) { + npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); + npyv_@sfx@ a = npyv_load_@sfx@(ip + i); + npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc); + acc = npyv_select_@sfx@(m_acc, a, acc); + acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices); + 
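+        // For the float paths, an ordered cmpgt/cmplt comparison against a
+        // NaN lane is always false, so a NaN can never win the selects above;
+        // the @is_fp@ block below restores the scalar semantics (the first
+        // NaN is the answer) by rescanning the loaded vector and returning
+        // the index of the first NaN lane.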
#if @is_fp@ + npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); + npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); + if (nnan != ((1LL << vstep) - 1)) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan >> vi) & 1)) { + return i + vi; + } + } + } + #endif + } + + // reduce + npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; + npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; + npyv_store_@usfx@(dacc_i, acc_indices); + npyv_store_@sfx@(dacc, acc); + + s_acc = dacc[0]; + ret_idx = dacc_i[0]; + for (int vi = 1; vi < vstep; ++vi) { + if (dacc[vi] @op@ s_acc) { + s_acc = dacc[vi]; + ret_idx = (npy_intp)dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { + ret_idx = dacc_i[vi]; + } + } +scalar_loop: + for (; i < len; ++i) { + npyv_lanetype_@sfx@ a = ip[i]; + #if @is_fp@ + if (!(a @iop@= s_acc)) { // negated, for correct nan handling + #else + if (a @op@ s_acc) { + #endif + s_acc = a; + ret_idx = i; + #if @is_fp@ + if (npy_isnan(s_acc)) { + // nan encountered, it's maximal + return ret_idx; + } + #endif + } + } + return ret_idx; +} +/**end repeat1**/ +#endif // chk_simd +/**end repeat**/ + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_float, npy_double, npy_longdouble# + * + * #is_fp = 0*10, 1*3# + * #is_unsigned = 1*5, 0*5, 0*3# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigned@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #func = argmax, argmin# + * #op = >, <# + * #iop = <, ># + */ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) +(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ +#if @is_fp@ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } +#endif +#ifdef TO_SIMD_SFX + *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); +#else + @type@ mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + @type@ a = ip[i]; + #if @is_fp@ + if (!(a @iop@= mp)) { // negated, for correct nan handling + #else + if (a @op@ mp) { + #endif + mp = a; + *mindx = i; + #if @is_fp@ + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + #endif + } + } +#endif // TO_SIMD_SFX + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) +(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) + +{ + npy_intp i = 0; +#if NPY_SIMD + const npyv_u8 zero = npyv_zero_u8(); + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + npyv_u8 a = npyv_load_u8(ip + i + vstep*0); + npyv_u8 b = npyv_load_u8(ip + i + vstep*1); + npyv_u8 c = npyv_load_u8(ip + i + vstep*2); + npyv_u8 d = npyv_load_u8(ip + i + vstep*3); + npyv_b8 m_a = npyv_cmpeq_u8(a, zero); + npyv_b8 m_b = npyv_cmpeq_u8(b, zero); + npyv_b8 
m_c = npyv_cmpeq_u8(c, zero); + npyv_b8 m_d = npyv_cmpeq_u8(d, zero); + npyv_b8 m_ab = npyv_and_b8(m_a, m_b); + npyv_b8 m_cd = npyv_and_b8(m_c, m_d); + npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd)); + #if NPY_SIMD == 512 + if (m != NPY_MAX_UINT64) { + #else + if ((npy_int64)m != ((1LL << vstep) - 1)) { + #endif + break; + } + } +#endif // NPY_SIMD + for (; i < len; ++i) { + if (ip[i]) { + *mindx = i; + return 0; + } + } + *mindx = 0; + return 0; +} diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 71401c60e8d0..1dc6c9bb1feb 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -27,12 +27,6 @@ #include "arrayobject.h" #include "alloc.h" #include "typeinfo.h" -#if defined(__ARM_NEON__) || defined (__ARM_NEON) -#include -#endif -#ifdef NPY_HAVE_SSE2_INTRINSICS -#include -#endif #include "npy_longdouble.h" #include "numpyos.h" @@ -42,7 +36,7 @@ #include "npy_cblas.h" #include "npy_buffer.h" - +#include "arraytypes.h" /* * Define a stack allocated dummy array with only the minimum information set: * 1. The descr, the main field interesting here. @@ -3176,77 +3170,21 @@ finish: ** ARGFUNC ** ***************************************************************************** */ -#if defined(__ARM_NEON__) || defined (__ARM_NEON) - int32_t _mm_movemask_epi8_neon(uint8x16_t input) - { - int8x8_t m0 = vcreate_s8(0x0706050403020100ULL); - uint8x16_t v0 = vshlq_u8(vshrq_n_u8(input, 7), vcombine_s8(m0, m0)); - uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); - return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); - } -#endif -#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b)) -static int -BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind, - PyArrayObject *NPY_UNUSED(aip)) - -{ - npy_intp i = 0; - /* memcmp like logical_and on i386 is maybe slower for small arrays */ -#ifdef NPY_HAVE_SSE2_INTRINSICS - const __m128i zero = _mm_setzero_si128(); - for (; i < n - (n % 32); i+=32) { - __m128i d1 = _mm_loadu_si128((__m128i*)&ip[i]); - __m128i d2 = _mm_loadu_si128((__m128i*)&ip[i + 16]); - d1 = _mm_cmpeq_epi8(d1, zero); - d2 = _mm_cmpeq_epi8(d2, zero); - if (_mm_movemask_epi8(_mm_min_epu8(d1, d2)) != 0xFFFF) { - break; - } - } -#else - #if defined(__ARM_NEON__) || defined (__ARM_NEON) - uint8x16_t zero = vdupq_n_u8(0); - for(; i < n - (n % 32); i+=32) { - uint8x16_t d1 = vld1q_u8((uint8_t *)&ip[i]); - uint8x16_t d2 = vld1q_u8((uint8_t *)&ip[i + 16]); - d1 = vceqq_u8(d1, zero); - d2 = vceqq_u8(d2, zero); - if(_mm_movemask_epi8_neon(vminq_u8(d1, d2)) != 0xFFFF) { - break; - } - } - #endif -#endif - for (; i < n; i++) { - if (ip[i]) { - *max_ind = i; - return 0; - } - } - *max_ind = 0; - return 0; -} +#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b)) /**begin repeat * - * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, + * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE, * DATETIME, TIMEDELTA# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble, + * #type = npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# - * #isfloat = 0*10, 1*7, 0*2# - * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2# - * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8# - * #iscomplex = 
0*14, 1*3, 0*2# - * #incr = ip++*14, ip+=2*3, ip++*2# - * #isdatetime = 0*17, 1*2# + * #isfloat = 1*4, 0*2# + * #isnan = npy_half_isnan, npy_isnan*3, nop*2# + * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5# + * #iscomplex = 0, 1*3, 0*2# + * #incr = ip++, ip+=2*3, ip++*2# + * #isdatetime = 0*4, 1*2# */ static int @fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind, @@ -3337,22 +3275,16 @@ BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind, /**begin repeat * - * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, + * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE, * DATETIME, TIMEDELTA# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble, + * #type = npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# - * #isfloat = 0*10, 1*7, 0*2# - * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2# - * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8# - * #iscomplex = 0*14, 1*3, 0*2# - * #incr = ip++*14, ip+=2*3, ip++*2# - * #isdatetime = 0*17, 1*2# + * #isfloat = 1*4, 0*2# + * #isnan = npy_half_isnan, npy_isnan*3, nop*2# + * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5# + * #iscomplex = 0, 1*3, 0*2# + * #incr = ip++, ip+=2*3, ip++*2# + * #isdatetime = 0*4, 1*2# */ static int @fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind, @@ -3409,7 +3341,7 @@ static int *min_ind = i; break; } -#endif +#endif if (!@le@(mp, *ip)) { /* negated, for correct nan handling */ mp = *ip; *min_ind = i; @@ -4494,6 +4426,27 @@ set_typeinfo(PyObject *dict) PyArray_Descr *dtype; PyObject *cobj, *key; + // SIMD runtime dispatching + #ifndef NPY_DISABLE_OPTIMIZATION + #include "argfunc.dispatch.h" + #endif + /**begin repeat + * #FROM = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #NAME = Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong, + * Float, Double, LongDouble# + */ + /**begin repeat1 + * #func = argmax, argmin# + */ + NPY_CPU_DISPATCH_CALL_XB(_Py@NAME@_ArrFuncs.@func@ = (PyArray_ArgFunc*)@FROM@_@func@); + /**end repeat1**/ + /**end repeat**/ + NPY_CPU_DISPATCH_CALL_XB(_PyBool_ArrFuncs.argmax = (PyArray_ArgFunc*)BOOL_argmax); + /* * Override the base class for all types, eventually all of this logic * should be defined on the class and inherited to the scalar. 
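Note on the scalar loops above: the comparison is written negated, `!(@le@(mp, *ip))`, because an ordered comparison with a NaN operand is always false, so a NaN both wins immediately and ends the search. A minimal plain-C sketch of the semantics every argmax kernel in this patch must preserve — the first occurrence of the extreme value wins, and the first NaN beats everything after it; `argmax_f64_ref` is a hypothetical reference name, not part of the patch:

    #include <stddef.h>
    #include <math.h>

    /* reference semantics only; the dispatched kernels must agree with this */
    static size_t
    argmax_f64_ref(const double *ip, size_t n)
    {
        double mp = ip[0];
        size_t idx = 0;
        if (isnan(mp)) {
            return 0;  /* NaN encountered; it's maximal */
        }
        for (size_t i = 1; i < n; ++i) {
            /* negated <=: taken when ip[i] > mp or when ip[i] is NaN */
            if (!(ip[i] <= mp)) {
                mp = ip[i];
                idx = i;
                if (isnan(mp)) {
                    break;  /* nothing after the first NaN can win */
                }
            }
        }
        return idx;
    }

The SIMD kernels arrive at the same result from the opposite direction: a NaN lane can never win a `cmpgt`/`cmplt` select, so the separate `npyv_notnan` scans in argfunc.dispatch.c.src are what restore the first-NaN rule.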
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h.src similarity index 56% rename from numpy/core/src/multiarray/arraytypes.h rename to numpy/core/src/multiarray/arraytypes.h.src index b3a13b297da1..4c7487189b5a 100644 --- a/numpy/core/src/multiarray/arraytypes.h +++ b/numpy/core/src/multiarray/arraytypes.h.src @@ -28,4 +28,25 @@ small_correlate(const char * d_, npy_intp dstride, npy_intp nk, enum NPY_TYPES ktype, char * out_, npy_intp ostride); +#ifndef NPY_DISABLE_OPTIMIZATION + #include "argfunc.dispatch.h" +#endif +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = byte, ubyte, short, ushort, int, uint, + * long, ulong, longlong, ulonglong, + * float, double, longdouble# + */ +/**begin repeat1 + * #func = argmax, argmin# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@, + (npy_@type@ *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)) +/**end repeat1**/ +/**end repeat**/ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax, + (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)) + #endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */ diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 4e23625470b1..137fb02ec838 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -4190,7 +4190,8 @@ class TestArgmaxArgminCommon: sizes = [(), (3,), (3, 2), (2, 3), (3, 3), (2, 3, 4), (4, 3, 2), (1, 2, 3, 4), (2, 3, 4, 1), - (3, 4, 1, 2), (4, 1, 2, 3)] + (3, 4, 1, 2), (4, 1, 2, 3), + (64,), (128,), (256,)] @pytest.mark.parametrize("size, axis", itertools.chain(*[[(size, axis) for axis in list(range(-len(size), len(size))) + [None]] @@ -4304,9 +4305,9 @@ def test_output_shape(self, method): @pytest.mark.parametrize('ndim', [0, 1]) @pytest.mark.parametrize('method', ['argmax', 'argmin']) def test_ret_is_out(self, ndim, method): - a = np.ones((4,) + (3,)*ndim) + a = np.ones((4,) + (256,)*ndim) arg_method = getattr(a, method) - out = np.empty((3,)*ndim, dtype=np.intp) + out = np.empty((256,)*ndim, dtype=np.intp) ret = arg_method(axis=0, out=out) assert ret is out @@ -4357,12 +4358,44 @@ def test_object_with_NULLs(self, method, vals): assert_equal(arg_method(), 1) class TestArgmax: - - nan_arr = [ - ([0, 1, 2, 3, np.nan], 4), - ([0, 1, 2, np.nan, 3], 3), - ([np.nan, 0, 1, 2, 3], 0), - ([np.nan, 0, np.nan, 2, 3], 0), + usg_data = [ + ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 0), + ([3, 3, 3, 3, 2, 2, 2, 2], 0), + ([0, 1, 2, 3, 4, 5, 6, 7], 7), + ([7, 6, 5, 4, 3, 2, 1, 0], 0) + ] + sg_data = usg_data + [ + ([1, 2, 3, 4, -4, -3, -2, -1], 3), + ([1, 2, 3, 4, -1, -2, -3, -4], 3) + ] + darr = [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(usg_data, ( + np.uint8, np.uint16, np.uint32, np.uint64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(sg_data, ( + np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(( + ([0, 1, 2, 3, np.nan], 4), + ([0, 1, 2, np.nan, 3], 3), + ([np.nan, 0, 1, 2, 3], 0), + ([np.nan, 0, np.nan, 2, 3], 0), + # To hit the tail of SIMD multi-level(x4, x1) inner loops + # on varient SIMD widthes + ([1] * (2*5-1) + [np.nan], 2*5-1), + ([1] * (4*5-1) + [np.nan], 4*5-1), + ([1] * (8*5-1) + [np.nan], 8*5-1), + ([1] * (16*5-1) + [np.nan], 16*5-1), + ([1] * (32*5-1) + [np.nan], 32*5-1) 
+ ), ( + np.float32, np.float64 + )) + )] + nan_arr = darr + [ ([0, 1, 2, 3, complex(0, np.nan)], 4), ([0, 1, 2, 3, complex(np.nan, 0)], 4), ([0, 1, 2, complex(np.nan, 0), 3], 3), @@ -4432,28 +4465,80 @@ def test_combinations(self, data): assert_equal(np.argmax(arr), pos, err_msg="%r" % arr) assert_equal(arr[np.argmax(arr)], val, err_msg="%r" % arr) + # add padding to test SIMD loops + rarr = np.repeat(arr, 129) + rpos = pos * 129 + assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr) + + padd = np.repeat(np.min(arr), 513) + rarr = np.concatenate((arr, padd)) + rpos = pos + assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr) + + def test_maximum_signed_integers(self): a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**15 - 1, -2**15], dtype=np.int16) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**31 - 1, -2**31], dtype=np.int32) assert_equal(np.argmax(a), 1) + a.repeat(129) + assert_equal(np.argmax(a), 1) a = np.array([1, 2**63 - 1, -2**63], dtype=np.int64) assert_equal(np.argmax(a), 1) - + a.repeat(129) + assert_equal(np.argmax(a), 1) class TestArgmin: - - nan_arr = [ - ([0, 1, 2, 3, np.nan], 4), - ([0, 1, 2, np.nan, 3], 3), - ([np.nan, 0, 1, 2, 3], 0), - ([np.nan, 0, np.nan, 2, 3], 0), + usg_data = [ + ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 8), + ([3, 3, 3, 3, 2, 2, 2, 2], 4), + ([0, 1, 2, 3, 4, 5, 6, 7], 0), + ([7, 6, 5, 4, 3, 2, 1, 0], 7) + ] + sg_data = usg_data + [ + ([1, 2, 3, 4, -4, -3, -2, -1], 4), + ([1, 2, 3, 4, -1, -2, -3, -4], 7) + ] + darr = [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(usg_data, ( + np.uint8, np.uint16, np.uint32, np.uint64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(sg_data, ( + np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 + )) + )] + darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in ( + itertools.product(( + ([0, 1, 2, 3, np.nan], 4), + ([0, 1, 2, np.nan, 3], 3), + ([np.nan, 0, 1, 2, 3], 0), + ([np.nan, 0, np.nan, 2, 3], 0), + # To hit the tail of SIMD multi-level(x4, x1) inner loops + # on varient SIMD widthes + ([1] * (2*5-1) + [np.nan], 2*5-1), + ([1] * (4*5-1) + [np.nan], 4*5-1), + ([1] * (8*5-1) + [np.nan], 8*5-1), + ([1] * (16*5-1) + [np.nan], 16*5-1), + ([1] * (32*5-1) + [np.nan], 32*5-1) + ), ( + np.float32, np.float64 + )) + )] + nan_arr = darr + [ ([0, 1, 2, 3, complex(0, np.nan)], 4), ([0, 1, 2, 3, complex(np.nan, 0)], 4), ([0, 1, 2, complex(np.nan, 0), 3], 3), @@ -4512,30 +4597,50 @@ class TestArgmin: ([False, True, False, True, True], 0), ] - def test_combinations(self): - for arr, pos in self.nan_arr: - with suppress_warnings() as sup: - sup.filter(RuntimeWarning, - "invalid value encountered in reduce") - min_val = np.min(arr) + @pytest.mark.parametrize('data', nan_arr) + def test_combinations(self, data): + arr, pos = data + with suppress_warnings() as sup: + sup.filter(RuntimeWarning, + "invalid value encountered in reduce") + min_val = np.min(arr) + + assert_equal(np.argmin(arr), pos, err_msg="%r" % arr) + assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr) - assert_equal(np.argmin(arr), pos, err_msg="%r" % arr) - assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr) + # add padding to test SIMD loops + rarr = np.repeat(arr, 129) + rpos 
= pos * 129 + assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr) + + padd = np.repeat(np.max(arr), 513) + rarr = np.concatenate((arr, padd)) + rpos = pos + assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr) + assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr) def test_minimum_signed_integers(self): a = np.array([1, -2**7, -2**7 + 1, 2**7 - 1], dtype=np.int8) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**15, -2**15 + 1, 2**15 - 1], dtype=np.int16) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**31, -2**31 + 1, 2**31 - 1], dtype=np.int32) assert_equal(np.argmin(a), 1) + a.repeat(129) + assert_equal(np.argmin(a), 1) a = np.array([1, -2**63, -2**63 + 1, 2**63 - 1], dtype=np.int64) assert_equal(np.argmin(a), 1) - + a.repeat(129) + assert_equal(np.argmin(a), 1) class TestMinMax: From 52787cc75a462466cd3e41ec12c4776eff4c98de Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 18 Jan 2022 06:53:30 +0200 Subject: [PATCH 2/2] add npyv_cleanup() to avoid the AVX-SSE transition penalty. --- numpy/core/src/multiarray/argfunc.dispatch.c.src | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index 39222bc9a3ad..cbfaebdb4796 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -323,6 +323,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) #endif #ifdef TO_SIMD_SFX *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); + npyv_cleanup(); #else @type@ mp = *ip; *mindx = 0; @@ -380,6 +381,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) break; } } + npyv_cleanup(); #endif // NPY_SIMD for (; i < len; ++i) { if (ip[i]) {
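Note on PATCH 2/2: on x86, 256-bit AVX instructions leave the upper halves of the YMM registers live, and legacy-SSE encodings executed afterwards pay a state-transition penalty until those bits are cleared; that matters here because the dispatched kernels return into code that may be compiled for baseline SSE. A rough sketch of what `npyv_cleanup()` boils down to on an AVX build — an illustrative assumption with a hypothetical name, not the literal NumPy definition (on pre-AVX and non-x86 targets the call is effectively a no-op):

    #include <immintrin.h>

    static inline void
    simd_cleanup_sketch(void)
    {
    #ifdef __AVX__
        /* zero bits 255:128 of every YMM register so that later
         * legacy-SSE instructions avoid the AVX->SSE transition stall */
        _mm256_zeroupper();
    #endif
    }

Placing the call at each dispatched entry point's exit, as the two hunks above do, costs one cheap instruction per call and spares an SSE-compiled caller the stall.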