From 50752aa920be32b74c1a7d0e4242e84b15ffa73c Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 8 Mar 2021 14:17:19 +0530 Subject: [PATCH 1/9] ENH, SIMD: Added integer dispatch --- .../src/umath/loops_arithmetic.dispatch.c.src | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 numpy/core/src/umath/loops_arithmetic.dispatch.c.src diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src new file mode 100644 index 000000000000..0e68f1b7b26e --- /dev/null +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -0,0 +1,131 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 avx2 avx512_skx + ** vsx2 + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +#include +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +//############################################################################### +//## Unsigned Integers +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#ifdef NPY_SIMD +/**begin repeat + * #sfx = u8, u16, u32, u64# + */ + +static void simd_divide_by_scalar_contig_contig_@sfx@ +(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst, + int len) +{ + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); + npyv_store_@sfx@(dst, c); + } + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst 
= a / scalar; + } + npyv_cleanup(); +} + +/**end repeat**/ +#endif + + + +// XXX Need to see what can be done for 64 bits +/**begin repeat + * Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ +#if NPY_BITSOF_@SIGNED_TYPE@ <= 8 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8 +#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16 +#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32 +#else + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64 +#endif +static NPY_INLINE int +run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + BINARY_DEFS + + if (n == 0) { + return 1; + } + + const @type@ in2 = *(@type@ *)ip2; + if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + BINARY_LOOP_SLIDING { + *((@type@ *)op1) = 0; + } + return 1; + } +#if defined NPY_SIMD + #ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + #elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + #else + const npy_intp vector_size_bytes = 16; + #endif + // XXX Implement other loops + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) { + simd_divide_by_scalar_@type@(ip1, in2, op1, n); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + +/**begin repeat + * Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(@type@) { + io1 /= *(@type@ *)ip2; + } + *((@type@ *)iop1) = io1; + } + else if 
(!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + } + *((@type@ *)op1) = in1 / in2; + } + } +} +/**end repeat**/ From 6b2fb9e6a567e24a8940d0c8d78410a310c531a1 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 8 Mar 2021 14:18:23 +0530 Subject: [PATCH 2/9] ENH, SIMD: Use integer dispatch --- numpy/core/src/umath/loops.c.src | 16 ---------------- numpy/core/src/umath/loops.h.src | 13 ++++++++++++- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 68e209fe9312..04665dc5296e 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1014,22 +1014,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); } -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1)= in1/in2; - } - } -} - NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index b3a19be12d62..0301aa5ed7b8 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -53,6 +53,17 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + */ + 
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /**begin repeat * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ @@ -141,7 +152,7 @@ NPY_NO_EXPORT void @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); From f2cb33bcf60e72924b46dd652af64d0af8da2508 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 8 Mar 2021 14:19:18 +0530 Subject: [PATCH 3/9] ENH, SIMD: Add dispatch to build process --- numpy/core/code_generators/generate_umath.py | 2 +- numpy/core/setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index b5305fbfce98..2e5548b6924a 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -325,7 +325,7 @@ def english_upper(s): Ufunc(2, 1, None, # One is only a unit to the right, not the left docstrings.get('numpy.core.umath.floor_divide'), 'PyUFunc_DivisionTypeResolver', - TD(intfltcmplx), + TD(intfltcmplx, cfunc_alias='divide', dispatch=[('loops_arithmetic', 'BHILQ')]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), TypeDescription('m', FullTypeDescr, 'mm', 'q'), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 8c34a3286d72..df405bcaf487 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -931,6 +931,7 @@ def generate_umath_c(ext, build_dir): join('src', 'umath', 'loops.c.src'), join('src', 
'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), From 453043c10a1d343a0ecc16c9a88bcfd0dfdfd4ce Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 8 Mar 2021 14:19:40 +0530 Subject: [PATCH 4/9] MAINT, SIMD: Add loops_arithmetic.dispatch.c.src --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 05df19335be2..736597b6b7af 100644 --- a/.gitignore +++ b/.gitignore @@ -215,5 +215,6 @@ numpy/core/src/_simd/_simd_inc.h # umath module numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c +numpy/core/src/umath/loops_arithmetic.dispatch.c numpy/core/src/umath/loops_trigonometric.dispatch.c numpy/core/src/umath/loops_exponent_log.dispatch.c From 71e84dcd2ec1a59b6426f05b9095a3a2fd51c01d Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 10 Mar 2021 20:05:34 +0530 Subject: [PATCH 5/9] MAINT: Fixed dispatch in generate_umath --- numpy/core/code_generators/generate_umath.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 2e5548b6924a..57c811ff3306 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -245,6 +245,8 @@ def english_upper(s): O = 'O' P = 'P' ints = 'bBhHiIlLqQ' +sints = 'bhilq' +uints = 'BHILQ' times = 'Mm' timedeltaonly = 'm' intsO = ints + O @@ -325,7 +327,9 @@ def english_upper(s): Ufunc(2, 1, None, # One is only a unit to the right, not the left docstrings.get('numpy.core.umath.floor_divide'), 'PyUFunc_DivisionTypeResolver', - TD(intfltcmplx, cfunc_alias='divide', dispatch=[('loops_arithmetic', 'BHILQ')]), + TD(uints, 
cfunc_alias='divide', + dispatch=[('loops_arithmetic', 'BHILQ')]), + TD(sints + flts + cmplx), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), TypeDescription('m', FullTypeDescr, 'mm', 'q'), From bbb143646cbaad2866ed401ca3c795f083285f78 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sat, 20 Mar 2021 16:22:06 +0530 Subject: [PATCH 6/9] SIMD, MAINT: Refined kernel and inner ufunc functions --- .../src/umath/loops_arithmetic.dispatch.c.src | 109 +++++++----------- 1 file changed, 43 insertions(+), 66 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 0e68f1b7b26e..a012d50dd72c 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** sse2 sse41 avx2 avx512_skx + ** sse2 sse41 avx2 avx512f avx512_skx ** vsx2 ** neon **/ @@ -12,26 +12,26 @@ #include "loops_utils.h" #include "loops.h" #include "lowlevel_strided_loops.h" -#include // Provides the various *_LOOP macros #include "fast_loop_macros.h" //############################################################################### -//## Unsigned Integers +//## Division //############################################################################### /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ -#ifdef NPY_SIMD +#if NPY_SIMD /**begin repeat * #sfx = u8, u16, u32, u64# */ - -static void simd_divide_by_scalar_contig_contig_@sfx@ -(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst, - int len) +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { - const int vstep = npyv_nlanes_@sfx@; + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + 
npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { @@ -39,92 +39,69 @@ static void simd_divide_by_scalar_contig_contig_@sfx@ npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); npyv_store_@sfx@(dst, c); } + for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; *dst = a / scalar; } + npyv_cleanup(); } - /**end repeat**/ #endif +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ - -// XXX Need to see what can be done for 64 bits /**begin repeat * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ -#if NPY_BITSOF_@SIGNED_TYPE@ <= 8 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32 -#else - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64 -#endif -static NPY_INLINE int -run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - if (n == 0) { - return 1; - } - - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - BINARY_LOOP_SLIDING { - *((@type@ *)op1) = 0; - } - return 1; - } -#if defined 
NPY_SIMD - #ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - #elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - #else - const npy_intp vector_size_bytes = 16; - #endif - // XXX Implement other loops - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) { - simd_divide_by_scalar_@type@(ip1, in2, op1, n); - return 1; - } +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_u@len@ +/**end repeat1**/ #endif - return 0; -} -/**end repeat**/ -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(@type@) { - io1 /= *(@type@ *)ip2; + const @type@ d = *(@type@ *)ip2; + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 /= d; + } } *((@type@ *)iop1) = io1; } - else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) { +#if NPY_SIMD && defined(TO_SIMD_SFX) + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); + } +#endif + else { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); *((@type@ *)op1) = 0; + } else{ + *((@type@ *)op1) = in1 / in2; } - *((@type@ *)op1) = in1 / in2; } } } From c78d9a0bb1429f3c4d56d8687ae54cbbe7158838 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Sat, 20 Mar 2021 17:01:43 +0530 Subject: [PATCH 7/9] TST: Division tests for unsigned ints --- 
numpy/core/tests/test_umath.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 2249c866caf5..b31b84d0cc2b 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -250,13 +250,22 @@ def test_division_int(self): assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80]) @pytest.mark.parametrize("input_dtype", - [np.int8, np.int16, np.int32, np.int64]) + np.sctypes['int'] + np.sctypes['uint']) def test_division_int_boundary(self, input_dtype): iinfo = np.iinfo(input_dtype) + # Unsigned: + # Create list with 0, 25th, 50th, 75th percentile and max + if iinfo.min == 0: + lst = [0, iinfo.max//4, iinfo.max//2, + int(iinfo.max/1.33), iinfo.max] + divisors = [iinfo.max//4, iinfo.max//2, + int(iinfo.max/1.33), iinfo.max] + # Signed: # Create list with min, 25th percentile, 0, 75th percentile, max - lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max] - divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] + else: + lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max] + divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] a = np.array(lst, dtype=input_dtype) for divisor in divisors: @@ -926,7 +935,7 @@ def test_log_values(self): assert_raises(FloatingPointError, np.log, np.float32(-np.inf)) assert_raises(FloatingPointError, np.log, np.float32(-1.0)) - # See https://github.com/numpy/numpy/issues/18005 + # See https://github.com/numpy/numpy/issues/18005 with assert_no_warnings(): a = np.array(1e9, dtype='float32') np.log(a) From a2c5af9c4f170cd452645a5d938d93ed24f246fa Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Mon, 22 Mar 2021 20:57:54 +0530 Subject: [PATCH 8/9] BENCH: Benchmarks for unsigned ints (#18075) --- benchmarks/benchmarks/bench_ufunc.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py 
index 13b7382a1708..b036581e1aae 100644 --- a/benchmarks/benchmarks/bench_ufunc.py +++ b/benchmarks/benchmarks/bench_ufunc.py @@ -135,18 +135,19 @@ def time_less_than_scalar2(self, dtype): class CustomScalarFloorDivideInt(Benchmark): - params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0]) + params = (np.sctypes['int'] + np.sctypes['uint'], [8, -8, 43, -43]) param_names = ['dtype', 'divisors'] - max_value = 10**7 - min_value = -10**7 def setup(self, dtype, divisor): + if dtype in np.sctypes['uint'] and divisor < 0: + raise NotImplementedError( + "Skipping test for negative divisor with unsigned type") + iinfo = np.iinfo(dtype) - self.x = np.arange( - max(iinfo.min, self.min_value), - min(iinfo.max, self.max_value), dtype=dtype) + self.x = np.random.randint( + iinfo.min, iinfo.max, size=10000, dtype=dtype) - def time_floor_divide_int(self, dtpye, divisor): + def time_floor_divide_int(self, dtype, divisor): self.x // divisor From 4d2e4847823d3d3c9b7380f8ee7bc1799bd070f9 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 6 Apr 2021 09:04:53 +0530 Subject: [PATCH 9/9] SIMD: Use scalar division for Armv7, Aarch64, and IBM/Power Co-authored-by: Sayed Adel --- numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index a012d50dd72c..7e9f464636c5 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -69,7 +69,17 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) #define TO_SIMD_SFX(X) X##_u@len@ /**end repeat1**/ #endif - +/* + * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division + * because emulating multiply-high on these architectures is expensive compared + * to the native scalar dividers. 
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. + * Power10 (VSX4) is an exception here since it has native support for integer vector division; + * note that neither the infrastructure nor NPYV supports VSX4 yet. + */ +#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) + #undef TO_SIMD_SFX +#endif NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {