diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 2ce2fdb55428..5dd9798c8332 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -48,8 +48,11 @@ class TypeDescription:
     simd: list
         Available SIMD ufunc loops, dispatched at runtime in specified order
         Currently only supported for simples types (see make_arrays)
+    dispatch: list
+        Dispatch-able source files providing the loops, dispatched at runtime
+        Currently only supported for simple types (see make_arrays)
     """
-    def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
+    def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
         self.type = type
         self.func_data = f
         if astype is None:
@@ -62,6 +65,15 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
             out = out.replace('P', type)
         self.out = out
         self.simd = simd
+        """
+        TODO:
+        - remove 'SIMD' after integrating the rest of the simd loops
+        - support astype; this can be done by covering each func with NPYV intrinsics.
+          Also, the SIMD kernels of PyUFunc_*_As_* must handle the remaining scalars
+          as vectors, since func_data only takes one ptr for each type.
+          A lot of work, I know, but it will leave a good memory.
+        """
+        self.dispatch = dispatch

     def finish_signature(self, nin, nout):
         if self.in_ is None:
@@ -86,7 +98,7 @@ def build_func_data(types, f):
     func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
     return func_data

-def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -115,7 +127,14 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
             simdt = [k for k, v in simd if t in v]
         else:
             simdt = []
-        tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
+        # [(dispatch file name without extension '.dispatch.c*', list of types)]
+        if dispatch:
+            dispt = [k for k, v in dispatch if t in v]
+        else:
+            dispt = []
+        tds.append(TypeDescription(
+            t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
+        ))
     return tds

 class Ufunc:
@@ -269,7 +288,8 @@ def english_upper(s):
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec)],
+             dispatch=[('loops_fast', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -280,7 +300,8 @@ def english_upper(s):
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx512f', cmplxvec)],
+             dispatch=[('loops_fast', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -291,7 +312,8 @@ def english_upper(s):
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec)],
+             dispatch=[('loops_fast', ints)]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
           TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -326,7 +348,8 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
+          TD(ints+flts+cmplx, simd=[('avx512f', cmplxvec)],
+             dispatch=[('loops_fast', ints)]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -341,14 +364,16 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+          TD(ints+inexact, simd=[('fma', 'fd'), ('avx512f', 'FDfd')],
+             dispatch=[('loops_fast', ints)]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+          TD(ints+inexact, simd=[('fma', 'fd'), ('avx512f','fd')],
+             dispatch=[('loops_fast', ints)]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -392,7 +417,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
+          TD(ints+flts+timedeltaonly, dispatch=[('loops_fast', ints)]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -414,7 +439,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -422,7 +447,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -430,7 +455,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -438,7 +463,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -446,7 +471,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -454,7 +479,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
@@ -462,7 +487,7 @@ def english_upper(s):
     Ufunc(2, 1, True_,
           docstrings.get('numpy.core.umath.logical_and'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalAnd'),
           TD(O, f='npy_ObjectLogicalAnd', out='?'),
           ),
@@ -470,7 +495,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.logical_not'),
           None,
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalNot'),
           TD(O, f='npy_ObjectLogicalNot', out='?'),
           ),
@@ -478,7 +503,7 @@ def english_upper(s):
     Ufunc(2, 1, False_,
           docstrings.get('numpy.core.umath.logical_or'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalOr'),
           TD(O, f='npy_ObjectLogicalOr', out='?'),
           ),
@@ -540,42 +565,42 @@ def english_upper(s):
     Ufunc(2, 1, AllOnes,
           docstrings.get('numpy.core.umath.bitwise_and'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_And'),
           ),
 'bitwise_or':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_or'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Or'),
           ),
 'bitwise_xor':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_xor'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Xor'),
           ),
 'invert':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.invert'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Invert'),
           ),
 'left_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.left_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Lshift'),
           ),
 'right_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.right_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Rshift'),
           ),
 'heaviside':
@@ -1024,6 +1049,16 @@ def make_arrays(funcdict):
                             ISA=vt.upper(), isa=vt,
                             fname=name, type=tname, idx=k
                         ))
+            if t.dispatch is not None:
+                for dname in t.dispatch:
+                    code2list.append(textwrap.dedent("""\
+                    #ifndef NPY_DISABLE_OPTIMIZATION
+                    #include "{dname}.dispatch.h"
+                    #endif
+                    NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name})
+                    """).format(
+                        dname=dname, name=name, tname=tname, k=k
+                    ))
             else:
                 funclist.append('NULL')
         try:
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5498601794b8..81f9fabc333b 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -906,6 +906,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'scalarmath.c.src'),
             join('src', 'umath', 'ufunc_type_resolution.c'),
             join('src', 'umath', 'override.c'),
+            join('src', 'umath', 'loops_fast.dispatch.c.src'),
             ]

 umath_deps = [
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0cfa1cea7f11..b505b7b97aa3 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -567,160 +567,18 @@ NPY_NO_EXPORT void
     UNARY_LOOP_FAST(@type@, @type@, *out = +in);
 }

-/**begin repeat1
- * #isa = , _avx2#
- * #ISA = , AVX2#
- * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = -in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
- * Arithmetic
- * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
- * #OP = +, -, *, &, |, ^#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            io1 @OP@= *(@type@ *)ip2;
-        }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
-    }
-}
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- *       which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                       void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
-    // For some reason, our macOS CI sets an "invalid" flag here, but only
-    // for some types.
-    npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                        void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-
-
-/**begin repeat2
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- *         logical_and, logical_or#
- * #OP = ==, !=, >, >=, <, <=, &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*
-     * gcc vectorization of this is not good (PR60575) but manual integer
-     * vectorization is too tedious to be worthwhile
-     */
-    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+#ifdef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+NPY_NO_EXPORT void
+@TYPE@_right_shift_noopt(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
     BINARY_LOOP {
-        const int t1 = !!*(@type@ *)ip1;
-        const int t2 = !!*(@type@ *)ip2;
-        *((npy_bool *)op1) = (t1 != t2);
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *((@type@ *)op1) = npy_rshift@c@(in1, in2);
     }
 }
 #endif

-/**end repeat1**/
-
 /**begin repeat1
  * #kind = maximum, minimum#
  * #OP =  >, <#
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5dd49c465b0e..e68f739ef17b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -69,52 +69,25 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

+#ifndef NPY_DISABLE_OPTIMIZATION
+    // autogenerated, need it for NPY_CPU_DISPATCH_DECLARE
+    #include "loops_fast.dispatch.h"
+#endif
 /**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
- * Arithmetic
- * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
- *         left_shift, right_shift#
- * #OP = +, -,*, &, |, ^, <<, >>#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- *         logical_and, logical_or#
- * #OP = ==, !=, >, >=, <, <=, &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
+ * #dispatch = square, reciprocal, conjugate, negative, invert,
+ *             add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ *             left_shift, right_shift,
+ *             equal, not_equal, greater, greater_equal, less, less_equal,
+ *             logical_and, logical_or, logical_xor, logical_not#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @S@@TYPE@_@dispatch@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat2**/

+#ifdef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
 NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
+@S@@TYPE@_right_shift_noopt(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif

 /**begin repeat2
  * #kind = maximum, minimum#
diff --git a/numpy/core/src/umath/loops_fast.dispatch.c.src b/numpy/core/src/umath/loops_fast.dispatch.c.src
new file mode 100644
index 000000000000..868f8f1a3b23
--- /dev/null
+++ b/numpy/core/src/umath/loops_fast.dispatch.c.src
@@ -0,0 +1,177 @@
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ */
+/*@targets
+ ** $autovec baseline
+ ** sse2 avx2 avx512_skx
+ ** vsx vsx2
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "loops.h"
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently an extremely large value as it is only used for memory
+ * overlap checks
+ */
+#ifndef NPY_MAX_SIMD_SIZE
+#define NPY_MAX_SIMD_SIZE 1024
+#endif
+
+/** Provides the various *_LOOP macros */
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ **                             INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_negative)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = +, -, *, &, |, ^#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 @OP@= *(@type@ *)ip2;
+        }
+        *((@type@ *)iop1) = io1;
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+    }
+}
+/**end repeat1**/
+
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ *       which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#undef INT_left_shift_needs_clear_floatstatus
+#undef UINT_left_shift_needs_clear_floatstatus
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+/**
+ * Dispatch-able sources built under the '$maxopt' or '$autovec' policy get
+ * the maximum safe optimization flags the compiler can offer (e.g. '-O3' on
+ * GCC), so @TYPE@_right_shift_noopt is defined in loops.c instead.
+ */
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+    @TYPE@_right_shift_noopt(args, dimensions, steps);
+#endif
+}
+
+/**begin repeat1
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
+ *         logical_and, logical_or#
+ * #OP = ==, !=, >, >=, <, <=, &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const int t1 = !!*(@type@ *)ip1;
+        const int t2 = !!*(@type@ *)ip2;
+        *((npy_bool *)op1) = (t1 != t2);
+    }
+}
+
+/**end repeat**/
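Note (not part of the patch): for readers unfamiliar with the dispatcher, here is a rough sketch of the glue that make_arrays() emits into the generated umath module from the textwrap template above. The ufunc name, type prefix, and slot index are example values chosen only for illustration, not taken from the generated file:

```c
/* Illustrative rendering of the generated snippet for a hypothetical
 * integer 'add' loop (type 'BYTE', slot index 2 are example values). */
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_fast.dispatch.h"
#endif
/* At module initialization, select the best compiled target for the
 * running CPU and store its loop in the ufunc's inner-loop table. */
NPY_CPU_DISPATCH_CALL_XB(add_functions[2] = BYTE_add)
```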
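Behaviorally (again only a sketch; the real expansion is generated into loops_fast.dispatch.h by the CPU dispatcher, and the helper name, target-suffixed symbols, and slot index below are illustrative assumptions), that dispatch call amounts to a runtime feature test over the targets listed in the new source's @targets block:

```c
/* Behavioral sketch only -- not the actual macro expansion.
 * cpu_has() is a hypothetical stand-in for NumPy's runtime feature check,
 * and the _AVX512_SKX / _AVX2 suffixed names are illustrative. */
typedef void (*inner_loop)(char **args, long const *dimensions,
                           long const *steps, void *func);

extern int cpu_has(const char *feature);                               /* hypothetical */
extern void BYTE_add_AVX512_SKX(char **, long const *, long const *, void *);
extern void BYTE_add_AVX2(char **, long const *, long const *, void *);
extern inner_loop add_functions[];

static void register_byte_add_loop(void)
{
    /* Highest compiled-and-supported target wins; if none matches, the slot
     * keeps the loop that was registered statically. */
    if (cpu_has("avx512_skx")) {
        add_functions[2] = BYTE_add_AVX512_SKX;
    }
    else if (cpu_has("avx2")) {
        add_functions[2] = BYTE_add_AVX2;
    }
}
```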