From 1950fb55ce21f2e125672979872dd4c7781d18ce Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Mon, 18 Oct 2021 14:04:17 -0700
Subject: [PATCH 1/8] BUG: NEON min/max is slow (#17989)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes numpy/numpy#17989 by adding ARM NEON implementations of
min/max and fmin/fmax.

Before: Rosetta was `1.2x - 8.6x` faster than native arm64.
After:  native arm64 is `1.6x - 6.7x` faster than Rosetta
(a `2.8x - 15.5x` improvement).

**Benchmarks**
```
       before           after         ratio
     [b0e1a445]       [8301ffd7]
+      32.6±0.04μs      37.5±0.08μs     1.15  bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'd')
+      32.6±0.06μs      37.5±0.04μs     1.15  bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'd')
+      37.8±0.09μs      43.2±0.09μs     1.14  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'f')
+      37.7±0.09μs       42.9±0.1μs     1.14  bench_ufunc_strides.Unary.time_ufunc(, 2, 2, 'd')
+       37.9±0.2μs      43.0±0.02μs     1.14  bench_ufunc_strides.Unary.time_ufunc(, 2, 2, 'd')
+      37.7±0.01μs        42.3±1μs      1.12  bench_ufunc_strides.Unary.time_ufunc(, 2, 2, 'd')
+      34.2±0.07μs      38.1±0.05μs     1.12  bench_ufunc_strides.Unary.time_ufunc(, 4, 2, 'f')
+      32.6±0.03μs      35.8±0.04μs     1.10  bench_ufunc_strides.Unary.time_ufunc(, 4, 1, 'f')
+       37.1±0.1μs       40.3±0.1μs     1.09  bench_ufunc_strides.Unary.time_ufunc(, 1, 2, 'd')
+       37.2±0.1μs      40.3±0.04μs     1.08  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f')
+      37.1±0.09μs      40.3±0.07μs     1.08  bench_ufunc_strides.Unary.time_ufunc(, 1, 2, 'd')
+       68.6±0.5μs       74.2±0.3μs     1.08  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'd')
+       37.1±0.2μs       40.0±0.1μs     1.08  bench_ufunc_strides.Unary.time_ufunc(, 1, 2, 'd')
+        2.42±0μs      2.61±0.05μs      1.08  bench_core.CountNonzero.time_count_nonzero_axis(3, 100, )
+       69.1±0.7μs       73.5±0.7μs     1.06  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'd')
+       54.7±0.3μs       58.0±0.2μs     1.06  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'd')
+       54.5±0.2μs       57.8±0.2μs     1.06  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'd')
+     3.78±0.04μs      4.00±0.02μs      1.06  bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 100, )
+       54.8±0.2μs       57.9±0.3μs     1.06  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'd')
+     3.68±0.01μs      3.87±0.02μs      1.05  bench_core.CountNonzero.time_count_nonzero_multi_axis(1, 100, )
+       69.6±0.2μs       73.1±0.2μs     1.05  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'd')
+         229±2μs        241±0.2μs      1.05  bench_random.Bounded.time_bounded('PCG64', [, 1535])
-       73.0±0.8μs       69.5±0.2μs     0.95  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'd')
-       37.6±0.1μs       35.7±0.3μs     0.95  bench_ufunc_strides.Unary.time_ufunc(, 1, 4, 'f')
-      88.7±0.04μs       84.2±0.7μs     0.95  bench_lib.Pad.time_pad((256, 128, 1), 1, 'wrap')
-       57.9±0.2μs       54.8±0.2μs     0.95  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'd')
-       39.9±0.2μs      37.2±0.04μs     0.93  bench_ufunc_strides.Unary.time_ufunc(, 1, 2, 'd')
-     2.66±0.01μs      2.47±0.01μs      0.93  bench_lib.Nan.time_nanmin(200, 0)
-     2.65±0.02μs      2.46±0.04μs      0.93  bench_lib.Nan.time_nanmin(200, 50.0)
-     2.64±0.01μs      2.45±0.01μs      0.93  bench_lib.Nan.time_nanmax(200, 90.0)
-        2.64±0μs      2.44±0.02μs      0.92  bench_lib.Nan.time_nanmax(200, 0)
-     2.68±0.02μs         2.48±0μs      0.92  bench_lib.Nan.time_nanmax(200, 2.0)
-      40.2±0.01μs       37.1±0.1μs     0.92  bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f')
-        2.69±0μs         2.47±0μs      0.92  bench_lib.Nan.time_nanmin(200, 2.0)
-     2.70±0.02μs      2.48±0.02μs      0.92  bench_lib.Nan.time_nanmax(200, 0.1)
-        2.70±0μs         2.47±0μs      0.91  bench_lib.Nan.time_nanmin(200, 90.0)
-        2.70±0μs         2.46±0μs      0.91  bench_lib.Nan.time_nanmin(200, 0.1)
-        2.70±0μs      2.42±0.01μs      0.90  bench_lib.Nan.time_nanmax(200, 50.0)
-       11.8±0.6ms       10.6±0.6ms     0.89  bench_core.CountNonzero.time_count_nonzero_axis(2, 1000000, )
-       42.7±0.1μs      37.8±0.02μs     0.88  bench_ufunc_strides.Unary.time_ufunc(, 2, 2, 'd')
-      42.8±0.03μs       37.8±0.2μs     0.88  bench_ufunc_strides.Unary.time_ufunc(, 2, 2, 'd')
-       43.1±0.2μs      37.7±0.09μs     0.87  bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'f')
-      37.5±0.07μs      32.6±0.06μs     0.87  bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'd')
-      41.7±0.03μs      36.3±0.07μs     0.87  bench_ufunc_strides.Unary.time_ufunc(, 1, 4, 'd')
-        166±0.8μs         144±1μs      0.87  bench_ufunc.UFunc.time_ufunc_types('fmin')
-       11.6±0.8ms      10.0±0.01ms     0.87  bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 1000000, )
-        167±0.9μs         144±2μs      0.86  bench_ufunc.UFunc.time_ufunc_types('minimum')
-          168±4μs       143±0.5μs      0.85  bench_ufunc.UFunc.time_ufunc_types('fmax')
-          167±1μs       142±0.8μs      0.85  bench_ufunc.UFunc.time_ufunc_types('maximum')
-        7.10±0μs      4.97±0.01μs      0.70  bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 2)
-     7.11±0.07μs      4.96±0.01μs      0.70  bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 2)
-     7.05±0.07μs         4.68±0μs      0.66  bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 4)
-        7.13±0μs      4.68±0.01μs      0.66  bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 4)
-        461±0.2μs         297±7μs      0.64  bench_app.MaxesOfDots.time_it
-     7.04±0.07μs         3.95±0μs      0.56  bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 2)
-     7.06±0.06μs      3.95±0.01μs      0.56  bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 2)
-     7.09±0.06μs         3.24±0μs      0.46  bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 1)
-     7.12±0.07μs      3.25±0.02μs      0.46  bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 1)
-      14.5±0.02μs         3.98±0μs     0.27  bench_reduce.MinMax.time_max()
-       14.6±0.1μs      4.00±0.01μs     0.27  bench_reduce.MinMax.time_min()
-     6.88±0.06μs         1.34±0μs      0.19  bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 1)
-        7.00±0μs         1.33±0μs      0.19  bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 1)
-      39.4±0.01μs      3.95±0.01μs     0.10  bench_reduce.MinMax.time_min()
-      39.4±0.01μs      3.95±0.02μs     0.10  bench_reduce.MinMax.time_max()
-       254±0.02μs       22.8±0.2μs     0.09  bench_lib.Nan.time_nanmax(200000, 50.0)
-        253±0.1μs       22.7±0.1μs     0.09  bench_lib.Nan.time_nanmin(200000, 0)
-       254±0.06μs      22.7±0.09μs     0.09  bench_lib.Nan.time_nanmin(200000, 2.0)
-       254±0.01μs      22.7±0.03μs     0.09  bench_lib.Nan.time_nanmin(200000, 0.1)
-       254±0.04μs      22.7±0.02μs     0.09  bench_lib.Nan.time_nanmin(200000, 50.0)
-        253±0.1μs      22.7±0.04μs     0.09  bench_lib.Nan.time_nanmax(200000, 0.1)
-       253±0.03μs      22.7±0.04μs     0.09  bench_lib.Nan.time_nanmin(200000, 90.0)
-       253±0.02μs      22.7±0.07μs     0.09  bench_lib.Nan.time_nanmax(200000, 0)
-       254±0.03μs      22.7±0.02μs     0.09  bench_lib.Nan.time_nanmax(200000, 90.0)
-       254±0.09μs      22.7±0.04μs     0.09  bench_lib.Nan.time_nanmax(200000, 2.0)
-      39.2±0.01μs      2.51±0.01μs     0.06  bench_reduce.MinMax.time_max()
-      39.2±0.01μs      2.50±0.01μs     0.06  bench_reduce.MinMax.time_min()
```

Size change of _multiarray_umath.cpython-39-darwin.so:
Before: 3,890,723
After:  3,924,035
Change: +33,312 (~ +0.856 %)
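As a quick local sanity check without a full asv run, the `bench_reduce.MinMax`
cases can be approximated with `timeit`. This is only a rough stand-in; the
array size and dtypes below are illustrative, not the asv parameterization:

```
import timeit
import numpy as np

# Contiguous min/max reductions exercise the new NEON reduce paths on arm64.
for dtype in (np.float32, np.float64, np.int64):
    a = np.ones(100_000, dtype=dtype)  # illustrative size
    per_call = timeit.timeit(a.max, number=1_000) / 1_000
    print(f"{np.dtype(dtype).name:>8} max: {per_call * 1e6:.2f} us")
```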
---
 .gitignore                                    |   1 +
 numpy/core/code_generators/generate_umath.py  |   4 +
 numpy/core/setup.py                           |   1 +
 numpy/core/src/common/simd/neon/math.h        |  66 +
 numpy/core/src/umath/loops.c.src              |   9 +
 numpy/core/src/umath/loops.h.src              |  50 +
 .../src/umath/loops_minmax.dispatch.c.src     | 671 +++++++++++
 7 files changed, 802 insertions(+)
 create mode 100644 numpy/core/src/umath/loops_minmax.dispatch.c.src

diff --git a/.gitignore b/.gitignore
index d856762492df..63815abc37c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -218,5 +218,6 @@ numpy/core/src/_simd/_simd_inc.h
 numpy/core/src/umath/loops_unary_fp.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
 numpy/core/src/umath/loops_arithmetic.dispatch.c
+numpy/core/src/umath/loops_minmax.dispatch.c
 numpy/core/src/umath/loops_trigonometric.dispatch.c
 numpy/core/src/umath/loops_exponent_log.dispatch.c
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 3a27a34cdd51..c20468a8f58b 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -516,6 +516,7 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]),
           TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMax')
           ),
@@ -523,6 +524,7 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]),
           TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMin')
           ),
@@ -537,6 +539,7 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]),
           TD(noobj),
           TD(O, f='npy_ObjectMax')
           ),
@@ -544,6 +547,7 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmin'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]),
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index f2524ae13c26..2b0e33244534 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -943,6 +943,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+            join('src', 'umath', 'loops_minmax.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
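Note the asymmetry in the `generate_umath.py` hunks: `maximum`/`minimum` gain
dispatch entries for `ints+inexactvec`, while `fmax`/`fmin` only get
`inexactvec`. For integers, `fmax`/`fmin` are plain aliases of
`maximum`/`minimum` (via `#define` in loops.h.src), since there is no NaN to
handle; the two pairs differ only in float NaN semantics:

```
import numpy as np

a = np.array([1.0, np.nan, 3.0])
b = np.array([2.0, 2.0, np.nan])

print(np.maximum(a, b))  # [ 2. nan nan] -- maximum propagates NaN
print(np.fmax(a, b))     # [ 2.  2.  3.] -- fmax ignores NaN unless both are NaN
```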
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 19ea6f22fab0..32b5bc89ec8b 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -153,4 +153,70 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// Maximum reductions across all vector lanes
+#define npyv_reduce_max_u8  vmaxvq_u8
+#define npyv_reduce_max_s8  vmaxvq_s8
+#define npyv_reduce_max_u16 vmaxvq_u16
+#define npyv_reduce_max_s16 vmaxvq_s16
+#define npyv_reduce_max_u32 vmaxvq_u32
+#define npyv_reduce_max_s32 vmaxvq_s32
+NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v)
+{
+    // NEON has no 64-bit horizontal max; compare the two lanes in scalar code
+    npyv_lanetype_u64 a = vgetq_lane_u64(v, 0);
+    npyv_lanetype_u64 b = vgetq_lane_u64(v, 1);
+    npyv_lanetype_u64 result = (a > b) ? a : b;
+    return result;
+}
+NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v)
+{
+    npyv_lanetype_s64 a = vgetq_lane_s64(v, 0);
+    npyv_lanetype_s64 b = vgetq_lane_s64(v, 1);
+    npyv_lanetype_s64 result = (a > b) ? a : b;
+    return result;
+}
+#define npyv_reduce_max_f32 vmaxvq_f32
+#if NPY_SIMD_F64
+    #define npyv_reduce_max_f64 vmaxvq_f64
+#endif // NPY_SIMD_F64
+
+// Maximum reduction, supports IEEE floating-point arithmetic (IEC 60559):
+// - NaN lanes are ignored; NaN is returned only if all lanes are NaN.
+#define npyv_reduce_maxp_f32 vmaxnmvq_f32
+#if NPY_SIMD_F64
+    #define npyv_reduce_maxp_f64 vmaxnmvq_f64
+#endif // NPY_SIMD_F64
+
+// Minimum reductions across all vector lanes
+#define npyv_reduce_min_u8  vminvq_u8
+#define npyv_reduce_min_s8  vminvq_s8
+#define npyv_reduce_min_u16 vminvq_u16
+#define npyv_reduce_min_s16 vminvq_s16
+#define npyv_reduce_min_u32 vminvq_u32
+#define npyv_reduce_min_s32 vminvq_s32
+NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v)
+{
+    // NEON has no 64-bit horizontal min; compare the two lanes in scalar code
+    npyv_lanetype_u64 a = vgetq_lane_u64(v, 0);
+    npyv_lanetype_u64 b = vgetq_lane_u64(v, 1);
+    npyv_lanetype_u64 result = (a < b) ? a : b;
+    return result;
+}
+NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v)
+{
+    npyv_lanetype_s64 a = vgetq_lane_s64(v, 0);
+    npyv_lanetype_s64 b = vgetq_lane_s64(v, 1);
+    npyv_lanetype_s64 result = (a < b) ? a : b;
+    return result;
+}
+#define npyv_reduce_min_f32 vminvq_f32
+#if NPY_SIMD_F64
+    #define npyv_reduce_min_f64 vminvq_f64
+#endif // NPY_SIMD_F64
+
+// Minimum reduction, supports IEEE floating-point arithmetic (IEC 60559):
+// - NaN lanes are ignored; NaN is returned only if all lanes are NaN.
+#define npyv_reduce_minp_f32 vminnmvq_f32
+#if NPY_SIMD_F64
+    #define npyv_reduce_minp_f64 vminnmvq_f64
+#endif // NPY_SIMD_F64
+
 #endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 7c0710819d7c..950ea011ef4e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
 #include "lowlevel_strided_loops.h"
+#include "loops.h"
 #include "npy_pycompat.h"
 
@@ -550,6 +551,7 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  *          npy_double, npy_double, npy_double, npy_double#
  * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
  * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2#
  */
 
 #define @TYPE@_floor_divide @TYPE@_divide
@@ -724,6 +726,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 
 /**end repeat1**/
 
+#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 /**begin repeat1
  * #kind = maximum, minimum#
  * #OP =  >, <#
@@ -749,6 +752,7 @@ NPY_NO_EXPORT void
 }
 /**end repeat1**/
+#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 
 NPY_NO_EXPORT void
 @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -1593,6 +1597,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, , l#
  * #C = F, , L#
+ * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE#
  */
 /**begin repeat1
  * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
@@ -1716,6 +1721,7 @@ NPY_NO_EXPORT void
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
 
+#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1741,8 +1747,10 @@ NPY_NO_EXPORT void
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
+#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 /**end repeat1**/
 
+#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 /**begin repeat1
  * #kind = fmax, fmin#
  * #OP =  >=, <=#
@@ -1770,6 +1778,7 @@ NPY_NO_EXPORT void
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
+#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@
 
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
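The `maxp`/`minp` reductions above (`vmaxnmvq`/`vminnmvq`) are what back the
NaN-skipping `fmax`/`fmin` reduction loops, and `np.nanmax`/`np.nanmin` are
implemented on top of `np.fmax.reduce`/`np.fmin.reduce`; that is where the
roughly 10x `bench_lib.Nan` improvements in the table above come from. A
behavioral check in plain NumPy:

```
import numpy as np

a = np.array([1.0, np.nan, 3.0], dtype=np.float32)

print(np.maximum.reduce(a))  # nan -- maximum propagates NaN
print(np.fmax.reduce(a))     # 3.0 -- fmax skips NaN unless all elements are NaN
print(np.nanmax(a))          # 3.0 -- nanmax builds on fmax.reduce
```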
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 0938cd050f58..aab1b5bbb8fc 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -22,6 +22,13 @@
 #define BOOL_fmax BOOL_maximum
 #define BOOL_fmin BOOL_minimum
 
+/*
+ * The ARM NEON implementations of min/max for longdouble, longlong, and
+ * ulonglong assume that all three types are 64 bits wide. If that is not
+ * the case, the scalar implementations are used instead.
+ */
+#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64)
+#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64)
 
 /*
  *****************************************************************************
@@ -658,6 +665,49 @@ OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 NPY_NO_EXPORT void
 PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);
 
+/*
+ *****************************************************************************
+ **                            MIN/MAX LOOPS                                **
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+#endif
+
+//---------- Integers ----------
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2#
+ */
+#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
+/**begin repeat1
+ * #kind = maximum, minimum#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
+/**end repeat**/
+
+//---------- Float ----------
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE#
+ */
+#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
+/**begin repeat1
+ * #kind = maximum, minimum, fmax, fmin#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                             END LOOPS                                   **
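To confirm at runtime which SIMD features a given NumPy build compiled in and
which it dispatches on, the umath module exposes baseline and dispatch lists
(attribute names as in NumPy 1.20-era builds; treat this as a sketch if your
version differs):

```
import numpy.core._multiarray_umath as umath

# Features compiled into the baseline vs. targets built for runtime dispatch.
print("baseline:", umath.__cpu_baseline__)  # e.g. includes 'NEON' on arm64
print("dispatch:", umath.__cpu_dispatch__)
```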
***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, >># + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void 
+@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 
void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void 
*NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + +NPY_NO_EXPORT void +OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src new file mode 100644 index 000000000000..06aed1bb736a --- /dev/null +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -0,0 +1,671 @@ +/*@targets + ** $maxopt baseline + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + + +//############################################################################## +//## Maximum, Minimum +//############################################################################## + +#ifdef NPY_HAVE_NEON +//****************************************************************************** +//** NEON Min / Max 
+//****************************************************************************** + + +//------------------------------------------------------------------------------ +//-- Integer +//------------------------------------------------------------------------------ + + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #sfx = s8, u8, s16, u16, s32, u32, + * s64, u64, s64, u64# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ + +// Implementation below assumes longlong and ulonglong are 64-bit. +#if @HAVE_NEON_IMPL@ + +/**begin repeat1 +* Arithmetic +* # kind = maximum, minimum# +* # OP = >, <# +* # vop = max, min# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ + return ((a @OP@ b) ? a : b); +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + #define NPYV_CAST (const npyv_lanetype_@sfx@ *) + npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + #undef NPYV_CAST + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + 
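The eight parallel accumulators (`m0`..`m7`) above exist to break the serial dependency chain of a naive reduction: each accumulator folds every eighth block independently, and the partial results are combined in a pairwise tree at the end. A scalar stand-in for the same shape (illustrative sketch; `n` assumed to be a multiple of 8 and at least 8):

```
#include <stddef.h>

/* Minimum of n ints using 8 independent accumulators folded pairwise,
 * mirroring the structure of simd_reduce above. */
static int min_unrolled8(const int *a, size_t n)
{
    int m0 = a[0], m1 = a[1], m2 = a[2], m3 = a[3];
    int m4 = a[4], m5 = a[5], m6 = a[6], m7 = a[7];
    for (size_t i = 8; i + 8 <= n; i += 8) {
        if (a[i+0] < m0) m0 = a[i+0];
        if (a[i+1] < m1) m1 = a[i+1];
        if (a[i+2] < m2) m2 = a[i+2];
        if (a[i+3] < m3) m3 = a[i+3];
        if (a[i+4] < m4) m4 = a[i+4];
        if (a[i+5] < m5) m5 = a[i+5];
        if (a[i+6] < m6) m6 = a[i+6];
        if (a[i+7] < m7) m7 = a[i+7];
    }
    /* pairwise tree combine, as with the m0..m7 folds above */
    m0 = m1 < m0 ? m1 : m0;  m2 = m3 < m2 ? m3 : m2;
    m4 = m5 < m4 ? m5 : m4;  m6 = m7 < m6 ? m7 : m6;
    m0 = m2 < m0 ? m2 : m0;  m4 = m6 < m4 ? m6 : m4;
    return m4 < m0 ? m4 : m0;
}
```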
@type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i=, <=, >=, <=# +* # vop = max, min, maxp, minp# +* # asmInstr = fmax, fmin, fmaxnm, fminnm# +* # PROPAGATE_NAN = 1, 1, 0, 0# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ +#ifdef __aarch64__ + @type@ result = 0; + __asm( + "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +#else + +#if @PROPAGATE_NAN@ + return ((a @OP@ b || npy_isnan(a)) ? a : b); +#else + return ((a @OP@ b || npy_isnan(b)) ? 
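The `PROPAGATE_NAN` branches mirror the ufunc contract: `maximum`/`minimum` return NaN if either operand is NaN, while `fmax`/`fmin` prefer the non-NaN operand. A minimal standalone demonstration of the two fallback expressions (sketch, not the patch's code):

```
#include <math.h>
#include <stdio.h>

/* maximum-style: a NaN operand propagates (the isnan(a) check). */
static double max_propagate(double a, double b)
{ return (a >= b || isnan(a)) ? a : b; }

/* fmax-style: a NaN operand is discarded when possible (the isnan(b) check). */
static double max_ignore(double a, double b)
{ return (a >= b || isnan(b)) ? a : b; }

int main(void)
{
    printf("maximum(NaN, 1.0) -> %f\n", max_propagate(NAN, 1.0)); /* nan */
    printf("fmax(NaN, 1.0)    -> %f\n", max_ignore(NAN, 1.0));    /* 1.000000 */
    return 0;
}
```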
a : b); +#endif +#endif // __aarch64__ +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = 
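As a worked instance of the `elemPerVector`/`elemPerLoop` arithmetic above, assuming 128-bit NEON vectors (`NPY_SIMD_WIDTH == 16`): one pass consumes 8 vectors, i.e. 32 floats or 16 doubles. The same arithmetic as compile-time checks (`SIMD_WIDTH` is a hypothetical stand-in for `NPY_SIMD_WIDTH`):

```
#include <assert.h>

#define SIMD_WIDTH 16  /* hypothetical stand-in for NPY_SIMD_WIDTH on NEON */

static_assert(8 * (SIMD_WIDTH / sizeof(float))  == 32, "32 floats per pass");
static_assert(8 * (SIMD_WIDTH / sizeof(double)) == 16, "16 doubles per pass");
```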
SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i Date: Mon, 18 Oct 2021 14:43:09 -0700 Subject: [PATCH 2/8] Remove extraneous .orig files --- numpy/core/setup.py.orig | 1063 ------------------------- numpy/core/src/umath/loops.h.src.orig | 667 ---------------- 2 files changed, 1730 deletions(-) delete mode 100644 numpy/core/setup.py.orig delete mode 100644 numpy/core/src/umath/loops.h.src.orig diff --git a/numpy/core/setup.py.orig b/numpy/core/setup.py.orig deleted file mode 100644 index f2524ae13c26..000000000000 --- a/numpy/core/setup.py.orig +++ /dev/null @@ -1,1063 +0,0 @@ -import os -import sys -import pickle -import copy -import warnings -import platform -import textwrap -import glob -from os.path import join - -from numpy.distutils import log -from distutils.dep_util import newer -from sysconfig import get_config_var -from numpy.compat import npy_load_module -from setup_common import * # noqa: F403 - -# Set to True to enable relaxed strides checking. This (mostly) means -# that `strides[dim]` is ignored if `shape[dim] == 1` when setting flags. -NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0") - -# Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a -# bogus value for affected strides in order to help smoke out bad stride usage -# when relaxed stride checking is enabled. -NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0") -NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING - -# XXX: ugly, we use a class to avoid calling twice some expensive functions in -# config.h/numpyconfig.h. I don't see a better way because distutils force -# config.h generation inside an Extension class, and as such sharing -# configuration information between extensions is not easy. -# Using a pickled-based memoize does not work because config_cmd is an instance -# method, which cPickle does not like. -# -# Use pickle in all cases, as cPickle is gone in python3 and the difference -# in time is only in build. 
-- Charles Harris, 2013-03-30 - -class CallOnceOnly: - def __init__(self): - self._check_types = None - self._check_ieee_macros = None - self._check_complex = None - - def check_types(self, *a, **kw): - if self._check_types is None: - out = check_types(*a, **kw) - self._check_types = pickle.dumps(out) - else: - out = copy.deepcopy(pickle.loads(self._check_types)) - return out - - def check_ieee_macros(self, *a, **kw): - if self._check_ieee_macros is None: - out = check_ieee_macros(*a, **kw) - self._check_ieee_macros = pickle.dumps(out) - else: - out = copy.deepcopy(pickle.loads(self._check_ieee_macros)) - return out - - def check_complex(self, *a, **kw): - if self._check_complex is None: - out = check_complex(*a, **kw) - self._check_complex = pickle.dumps(out) - else: - out = copy.deepcopy(pickle.loads(self._check_complex)) - return out - -def can_link_svml(): - """SVML library is supported only on x86_64 architecture and currently - only on linux - """ - machine = platform.machine() - system = platform.system() - return "x86_64" in machine and system == "Linux" - -def check_svml_submodule(svmlpath): - if not os.path.exists(svmlpath + "/README.md"): - raise RuntimeError("Missing `SVML` submodule! Run `git submodule " - "update --init` to fix this.") - return True - -def pythonlib_dir(): - """return path where libpython* is.""" - if sys.platform == 'win32': - return os.path.join(sys.prefix, "libs") - else: - return get_config_var('LIBDIR') - -def is_npy_no_signal(): - """Return True if the NPY_NO_SIGNAL symbol must be defined in configuration - header.""" - return sys.platform == 'win32' - -def is_npy_no_smp(): - """Return True if the NPY_NO_SMP symbol must be defined in public - header (when SMP support cannot be reliably enabled).""" - # Perhaps a fancier check is in order here. - # so that threads are only enabled if there - # are actually multiple CPUS? -- but - # threaded code can be nice even on a single - # CPU so that long-calculating code doesn't - # block. - return 'NPY_NOSMP' in os.environ - -def win32_checks(deflist): - from numpy.distutils.misc_util import get_build_architecture - a = get_build_architecture() - - # Distutils hack on AMD64 on windows - print('BUILD_ARCHITECTURE: %r, os.name=%r, sys.platform=%r' % - (a, os.name, sys.platform)) - if a == 'AMD64': - deflist.append('DISTUTILS_USE_SDK') - - # On win32, force long double format string to be 'g', not - # 'Lg', since the MS runtime does not support long double whose - # size is > sizeof(double) - if a == "Intel" or a == "AMD64": - deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING') - -def check_math_capabilities(config, ext, moredefs, mathlibs): - def check_func(func_name): - return config.check_func(func_name, libraries=mathlibs, - decl=True, call=True) - - def check_funcs_once(funcs_name): - decl = dict([(f, True) for f in funcs_name]) - st = config.check_funcs_once(funcs_name, libraries=mathlibs, - decl=decl, call=decl) - if st: - moredefs.extend([(fname2def(f), 1) for f in funcs_name]) - return st - - def check_funcs(funcs_name): - # Use check_funcs_once first, and if it does not work, test func per - # func. 
Return success only if all the functions are available - if not check_funcs_once(funcs_name): - # Global check failed, check func per func - for f in funcs_name: - if check_func(f): - moredefs.append((fname2def(f), 1)) - return 0 - else: - return 1 - - #use_msvc = config.check_decl("_MSC_VER") - - if not check_funcs_once(MANDATORY_FUNCS): - raise SystemError("One of the required function to build numpy is not" - " available (the list is %s)." % str(MANDATORY_FUNCS)) - - # Standard functions which may not be available and for which we have a - # replacement implementation. Note that some of these are C99 functions. - - # XXX: hack to circumvent cpp pollution from python: python put its - # config.h in the public namespace, so we have a clash for the common - # functions we test. We remove every function tested by python's - # autoconf, hoping their own test are correct - for f in OPTIONAL_STDFUNCS_MAYBE: - if config.check_decl(fname2def(f), - headers=["Python.h", "math.h"]): - OPTIONAL_STDFUNCS.remove(f) - - check_funcs(OPTIONAL_STDFUNCS) - - for h in OPTIONAL_HEADERS: - if config.check_func("", decl=False, call=False, headers=[h]): - h = h.replace(".", "_").replace(os.path.sep, "_") - moredefs.append((fname2def(h), 1)) - - for tup in OPTIONAL_INTRINSICS: - headers = None - if len(tup) == 2: - f, args, m = tup[0], tup[1], fname2def(tup[0]) - elif len(tup) == 3: - f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[0]) - else: - f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[3]) - if config.check_func(f, decl=False, call=True, call_args=args, - headers=headers): - moredefs.append((m, 1)) - - for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES: - if config.check_gcc_function_attribute(dec, fn): - moredefs.append((fname2def(fn), 1)) - if fn == 'attribute_target_avx512f': - # GH-14787: Work around GCC<8.4 bug when compiling with AVX512 - # support on Windows-based platforms - if (sys.platform in ('win32', 'cygwin') and - config.check_compiler_gcc() and - not config.check_gcc_version_at_least(8, 4)): - ext.extra_compile_args.extend( - ['-ffixed-xmm%s' % n for n in range(16, 32)]) - - for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS: - if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code, - header): - moredefs.append((fname2def(fn), 1)) - - for fn in OPTIONAL_VARIABLE_ATTRIBUTES: - if config.check_gcc_variable_attribute(fn): - m = fn.replace("(", "_").replace(")", "_") - moredefs.append((fname2def(m), 1)) - - # C99 functions: float and long double versions - check_funcs(C99_FUNCS_SINGLE) - check_funcs(C99_FUNCS_EXTENDED) - -def check_complex(config, mathlibs): - priv = [] - pub = [] - - try: - if os.uname()[0] == "Interix": - warnings.warn("Disabling broken complex support. See #1365", stacklevel=2) - return priv, pub - except Exception: - # os.uname not available on all platforms. 
blanket except ugly but safe - pass - - # Check for complex support - st = config.check_header('complex.h') - if st: - priv.append(('HAVE_COMPLEX_H', 1)) - pub.append(('NPY_USE_C99_COMPLEX', 1)) - - for t in C99_COMPLEX_TYPES: - st = config.check_type(t, headers=["complex.h"]) - if st: - pub.append(('NPY_HAVE_%s' % type2def(t), 1)) - - def check_prec(prec): - flist = [f + prec for f in C99_COMPLEX_FUNCS] - decl = dict([(f, True) for f in flist]) - if not config.check_funcs_once(flist, call=decl, decl=decl, - libraries=mathlibs): - for f in flist: - if config.check_func(f, call=True, decl=True, - libraries=mathlibs): - priv.append((fname2def(f), 1)) - else: - priv.extend([(fname2def(f), 1) for f in flist]) - - check_prec('') - check_prec('f') - check_prec('l') - - return priv, pub - -def check_ieee_macros(config): - priv = [] - pub = [] - - macros = [] - - def _add_decl(f): - priv.append(fname2def("decl_%s" % f)) - pub.append('NPY_%s' % fname2def("decl_%s" % f)) - - # XXX: hack to circumvent cpp pollution from python: python put its - # config.h in the public namespace, so we have a clash for the common - # functions we test. We remove every function tested by python's - # autoconf, hoping their own test are correct - _macros = ["isnan", "isinf", "signbit", "isfinite"] - for f in _macros: - py_symbol = fname2def("decl_%s" % f) - already_declared = config.check_decl(py_symbol, - headers=["Python.h", "math.h"]) - if already_declared: - if config.check_macro_true(py_symbol, - headers=["Python.h", "math.h"]): - pub.append('NPY_%s' % fname2def("decl_%s" % f)) - else: - macros.append(f) - # Normally, isnan and isinf are macro (C99), but some platforms only have - # func, or both func and macro version. Check for macro only, and define - # replacement ones if not found. - # Note: including Python.h is necessary because it modifies some math.h - # definitions - for f in macros: - st = config.check_decl(f, headers=["Python.h", "math.h"]) - if st: - _add_decl(f) - - return priv, pub - -def check_types(config_cmd, ext, build_dir): - private_defines = [] - public_defines = [] - - # Expected size (in number of bytes) for each type. This is an - # optimization: those are only hints, and an exhaustive search for the size - # is done if the hints are wrong. - expected = {'short': [2], 'int': [4], 'long': [8, 4], - 'float': [4], 'double': [8], 'long double': [16, 12, 8], - 'Py_intptr_t': [8, 4], 'PY_LONG_LONG': [8], 'long long': [8], - 'off_t': [8, 4]} - - # Check we have the python header (-dev* packages on Linux) - result = config_cmd.check_header('Python.h') - if not result: - python = 'python' - if '__pypy__' in sys.builtin_module_names: - python = 'pypy' - raise SystemError( - "Cannot compile 'Python.h'. 
Perhaps you need to " - "install {0}-dev|{0}-devel.".format(python)) - res = config_cmd.check_header("endian.h") - if res: - private_defines.append(('HAVE_ENDIAN_H', 1)) - public_defines.append(('NPY_HAVE_ENDIAN_H', 1)) - res = config_cmd.check_header("sys/endian.h") - if res: - private_defines.append(('HAVE_SYS_ENDIAN_H', 1)) - public_defines.append(('NPY_HAVE_SYS_ENDIAN_H', 1)) - - # Check basic types sizes - for type in ('short', 'int', 'long'): - res = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), headers=["Python.h"]) - if res: - public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), "SIZEOF_%s" % sym2def(type))) - else: - res = config_cmd.check_type_size(type, expected=expected[type]) - if res >= 0: - public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" % type) - - for type in ('float', 'double', 'long double'): - already_declared = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), - headers=["Python.h"]) - res = config_cmd.check_type_size(type, expected=expected[type]) - if res >= 0: - public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) - if not already_declared and not type == 'long double': - private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" % type) - - # Compute size of corresponding complex type: used to check that our - # definition is binary compatible with C99 complex type (check done at - # build time in npy_common.h) - complex_def = "struct {%s __x; %s __y;}" % (type, type) - res = config_cmd.check_type_size(complex_def, - expected=[2 * x for x in expected[type]]) - if res >= 0: - public_defines.append(('NPY_SIZEOF_COMPLEX_%s' % sym2def(type), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" % complex_def) - - for type in ('Py_intptr_t', 'off_t'): - res = config_cmd.check_type_size(type, headers=["Python.h"], - library_dirs=[pythonlib_dir()], - expected=expected[type]) - - if res >= 0: - private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) - public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" % type) - - # We check declaration AND type because that's how distutils does it. - if config_cmd.check_decl('PY_LONG_LONG', headers=['Python.h']): - res = config_cmd.check_type_size('PY_LONG_LONG', headers=['Python.h'], - library_dirs=[pythonlib_dir()], - expected=expected['PY_LONG_LONG']) - if res >= 0: - private_defines.append(('SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) - public_defines.append(('NPY_SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" % 'PY_LONG_LONG') - - res = config_cmd.check_type_size('long long', - expected=expected['long long']) - if res >= 0: - #private_defines.append(('SIZEOF_%s' % sym2def('long long'), '%d' % res)) - public_defines.append(('NPY_SIZEOF_%s' % sym2def('long long'), '%d' % res)) - else: - raise SystemError("Checking sizeof (%s) failed !" 
% 'long long') - - if not config_cmd.check_decl('CHAR_BIT', headers=['Python.h']): - raise RuntimeError( - "Config wo CHAR_BIT is not supported" - ", please contact the maintainers") - - return private_defines, public_defines - -def check_mathlib(config_cmd): - # Testing the C math library - mathlibs = [] - mathlibs_choices = [[], ['m'], ['cpml']] - mathlib = os.environ.get('MATHLIB') - if mathlib: - mathlibs_choices.insert(0, mathlib.split(',')) - for libs in mathlibs_choices: - if config_cmd.check_func("exp", libraries=libs, decl=True, call=True): - mathlibs = libs - break - else: - raise RuntimeError( - "math library missing; rerun setup.py after setting the " - "MATHLIB env variable") - return mathlibs - -def visibility_define(config): - """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty - string).""" - hide = '__attribute__((visibility("hidden")))' - if config.check_gcc_function_attribute(hide, 'hideme'): - return hide - else: - return '' - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration, dot_join - from numpy.distutils.system_info import (get_info, blas_opt_info, - lapack_opt_info) - - config = Configuration('core', parent_package, top_path) - local_dir = config.local_path - codegen_dir = join(local_dir, 'code_generators') - - if is_released(config): - warnings.simplefilter('error', MismatchCAPIWarning) - - # Check whether we have a mismatch between the set C API VERSION and the - # actual C API VERSION - check_api_version(C_API_VERSION, codegen_dir) - - generate_umath_py = join(codegen_dir, 'generate_umath.py') - n = dot_join(config.name, 'generate_umath') - generate_umath = npy_load_module('_'.join(n.split('.')), - generate_umath_py, ('.py', 'U', 1)) - - header_dir = 'include/numpy' # this is relative to config.path_in_package - - cocache = CallOnceOnly() - - def generate_config_h(ext, build_dir): - target = join(build_dir, header_dir, 'config.h') - d = os.path.dirname(target) - if not os.path.exists(d): - os.makedirs(d) - - if newer(__file__, target): - config_cmd = config.get_config_cmd() - log.info('Generating %s', target) - - # Check sizeof - moredefs, ignored = cocache.check_types(config_cmd, ext, build_dir) - - # Check math library and C99 math funcs availability - mathlibs = check_mathlib(config_cmd) - moredefs.append(('MATHLIB', ','.join(mathlibs))) - - check_math_capabilities(config_cmd, ext, moredefs, mathlibs) - moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) - moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0]) - - # Signal check - if is_npy_no_signal(): - moredefs.append('__NPY_PRIVATE_NO_SIGNAL') - - # Windows checks - if sys.platform == 'win32' or os.name == 'nt': - win32_checks(moredefs) - - # C99 restrict keyword - moredefs.append(('NPY_RESTRICT', config_cmd.check_restrict())) - - # Inline check - inline = config_cmd.check_inline() - - if can_link_svml(): - moredefs.append(('NPY_CAN_LINK_SVML', 1)) - - # Use relaxed stride checking - if NPY_RELAXED_STRIDES_CHECKING: - moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) - else: - moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 0)) - - # Use bogus stride debug aid when relaxed strides are enabled - if NPY_RELAXED_STRIDES_DEBUG: - moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) - else: - moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 0)) - - # Get long double representation - rep = check_long_double_representation(config_cmd) - moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) - - if 
check_for_right_shift_internal_compiler_error(config_cmd): - moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift') - moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift') - moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift') - moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift') - - # Generate the config.h file from moredefs - with open(target, 'w') as target_f: - for d in moredefs: - if isinstance(d, str): - target_f.write('#define %s\n' % (d)) - else: - target_f.write('#define %s %s\n' % (d[0], d[1])) - - # define inline to our keyword, or nothing - target_f.write('#ifndef __cplusplus\n') - if inline == 'inline': - target_f.write('/* #undef inline */\n') - else: - target_f.write('#define inline %s\n' % inline) - target_f.write('#endif\n') - - # add the guard to make sure config.h is never included directly, - # but always through npy_config.h - target_f.write(textwrap.dedent(""" - #ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ - #error config.h should never be included directly, include npy_config.h instead - #endif - """)) - - log.info('File: %s' % target) - with open(target) as target_f: - log.info(target_f.read()) - log.info('EOF') - else: - mathlibs = [] - with open(target) as target_f: - for line in target_f: - s = '#define MATHLIB' - if line.startswith(s): - value = line[len(s):].strip() - if value: - mathlibs.extend(value.split(',')) - - # Ugly: this can be called within a library and not an extension, - # in which case there is no libraries attributes (and none is - # needed). - if hasattr(ext, 'libraries'): - ext.libraries.extend(mathlibs) - - incl_dir = os.path.dirname(target) - if incl_dir not in config.numpy_include_dirs: - config.numpy_include_dirs.append(incl_dir) - - return target - - def generate_numpyconfig_h(ext, build_dir): - """Depends on config.h: generate_config_h has to be called before !""" - # put common include directory in build_dir on search path - # allows using code generation in headers - config.add_include_dirs(join(build_dir, "src", "common")) - config.add_include_dirs(join(build_dir, "src", "npymath")) - - target = join(build_dir, header_dir, '_numpyconfig.h') - d = os.path.dirname(target) - if not os.path.exists(d): - os.makedirs(d) - if newer(__file__, target): - config_cmd = config.get_config_cmd() - log.info('Generating %s', target) - - # Check sizeof - ignored, moredefs = cocache.check_types(config_cmd, ext, build_dir) - - if is_npy_no_signal(): - moredefs.append(('NPY_NO_SIGNAL', 1)) - - if is_npy_no_smp(): - moredefs.append(('NPY_NO_SMP', 1)) - else: - moredefs.append(('NPY_NO_SMP', 0)) - - mathlibs = check_mathlib(config_cmd) - moredefs.extend(cocache.check_ieee_macros(config_cmd)[1]) - moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[1]) - - if NPY_RELAXED_STRIDES_CHECKING: - moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) - - if NPY_RELAXED_STRIDES_DEBUG: - moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) - - # Check whether we can use inttypes (C99) formats - if config_cmd.check_decl('PRIdPTR', headers=['inttypes.h']): - moredefs.append(('NPY_USE_C99_FORMATS', 1)) - - # visibility check - hidden_visibility = visibility_define(config_cmd) - moredefs.append(('NPY_VISIBILITY_HIDDEN', hidden_visibility)) - - # Add the C API/ABI versions - moredefs.append(('NPY_ABI_VERSION', '0x%.8X' % C_ABI_VERSION)) - moredefs.append(('NPY_API_VERSION', '0x%.8X' % C_API_VERSION)) - - # Add moredefs to header - with open(target, 'w') as target_f: - for d in moredefs: - if isinstance(d, str): - target_f.write('#define %s\n' % (d)) 
- else: - target_f.write('#define %s %s\n' % (d[0], d[1])) - - # Define __STDC_FORMAT_MACROS - target_f.write(textwrap.dedent(""" - #ifndef __STDC_FORMAT_MACROS - #define __STDC_FORMAT_MACROS 1 - #endif - """)) - - # Dump the numpyconfig.h header to stdout - log.info('File: %s' % target) - with open(target) as target_f: - log.info(target_f.read()) - log.info('EOF') - config.add_data_files((header_dir, target)) - return target - - def generate_api_func(module_name): - def generate_api(ext, build_dir): - script = join(codegen_dir, module_name + '.py') - sys.path.insert(0, codegen_dir) - try: - m = __import__(module_name) - log.info('executing %s', script) - h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir)) - finally: - del sys.path[0] - config.add_data_files((header_dir, h_file), - (header_dir, doc_file)) - return (h_file,) - return generate_api - - generate_numpy_api = generate_api_func('generate_numpy_api') - generate_ufunc_api = generate_api_func('generate_ufunc_api') - - config.add_include_dirs(join(local_dir, "src", "common")) - config.add_include_dirs(join(local_dir, "src")) - config.add_include_dirs(join(local_dir)) - - config.add_data_dir('include/numpy') - config.add_include_dirs(join('src', 'npymath')) - config.add_include_dirs(join('src', 'multiarray')) - config.add_include_dirs(join('src', 'umath')) - config.add_include_dirs(join('src', 'npysort')) - config.add_include_dirs(join('src', '_simd')) - - config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process - config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) - if sys.platform[:3] == "aix": - config.add_define_macros([("_LARGE_FILES", None)]) - else: - config.add_define_macros([("_FILE_OFFSET_BITS", "64")]) - config.add_define_macros([('_LARGEFILE_SOURCE', '1')]) - config.add_define_macros([('_LARGEFILE64_SOURCE', '1')]) - - config.numpy_include_dirs.extend(config.paths('include')) - - deps = [join('src', 'npymath', '_signbit.c'), - join('include', 'numpy', '*object.h'), - join(codegen_dir, 'genapi.py'), - ] - - ####################################################################### - # npymath library # - ####################################################################### - - subst_dict = dict([("sep", os.path.sep), ("pkgname", "numpy.core")]) - - def get_mathlib_info(*args): - # Another ugly hack: the mathlib info is known once build_src is run, - # but we cannot use add_installed_pkg_config here either, so we only - # update the substitution dictionary during npymath build - config_cmd = config.get_config_cmd() - - # Check that the toolchain works, to fail early if it doesn't - # (avoid late errors with MATHLIB which are confusing if the - # compiler does not work). 
- st = config_cmd.try_link('int main(void) { return 0;}') - if not st: - # rerun the failing command in verbose mode - config_cmd.compiler.verbose = True - config_cmd.try_link('int main(void) { return 0;}') - raise RuntimeError("Broken toolchain: cannot link a simple C program") - mlibs = check_mathlib(config_cmd) - - posix_mlib = ' '.join(['-l%s' % l for l in mlibs]) - msvc_mlib = ' '.join(['%s.lib' % l for l in mlibs]) - subst_dict["posix_mathlib"] = posix_mlib - subst_dict["msvc_mathlib"] = msvc_mlib - - npymath_sources = [join('src', 'npymath', 'npy_math_internal.h.src'), - join('src', 'npymath', 'npy_math.c'), - join('src', 'npymath', 'ieee754.c.src'), - join('src', 'npymath', 'npy_math_complex.c.src'), - join('src', 'npymath', 'halffloat.c') - ] - - # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977. - # Intel and Clang also don't seem happy with /GL - is_msvc = (platform.platform().startswith('Windows') and - platform.python_compiler().startswith('MS')) - config.add_installed_library('npymath', - sources=npymath_sources + [get_mathlib_info], - install_dir='lib', - build_info={ - 'include_dirs' : [], # empty list required for creating npy_math_internal.h - 'extra_compiler_args' : (['/GL-'] if is_msvc else []), - }) - config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config", - subst_dict) - config.add_npy_pkg_config("mlib.ini.in", "lib/npy-pkg-config", - subst_dict) - - ####################################################################### - # multiarray_tests module # - ####################################################################### - - config.add_extension('_multiarray_tests', - sources=[join('src', 'multiarray', '_multiarray_tests.c.src'), - join('src', 'common', 'mem_overlap.c'), - join('src', 'common', 'npy_argparse.c'), - join('src', 'common', 'npy_hashtable.c')], - depends=[join('src', 'common', 'mem_overlap.h'), - join('src', 'common', 'npy_argparse.h'), - join('src', 'common', 'npy_hashtable.h'), - join('src', 'common', 'npy_extint128.h')], - libraries=['npymath']) - - ####################################################################### - # _multiarray_umath module - common part # - ####################################################################### - - common_deps = [ - join('src', 'common', 'array_assign.h'), - join('src', 'common', 'binop_override.h'), - join('src', 'common', 'cblasfuncs.h'), - join('src', 'common', 'lowlevel_strided_loops.h'), - join('src', 'common', 'mem_overlap.h'), - join('src', 'common', 'npy_argparse.h'), - join('src', 'common', 'npy_cblas.h'), - join('src', 'common', 'npy_config.h'), - join('src', 'common', 'npy_ctypes.h'), - join('src', 'common', 'npy_extint128.h'), - join('src', 'common', 'npy_import.h'), - join('src', 'common', 'npy_hashtable.h'), - join('src', 'common', 'npy_longdouble.h'), - join('src', 'common', 'npy_svml.h'), - join('src', 'common', 'templ_common.h.src'), - join('src', 'common', 'ucsnarrow.h'), - join('src', 'common', 'ufunc_override.h'), - join('src', 'common', 'umathmodule.h'), - join('src', 'common', 'numpyos.h'), - join('src', 'common', 'npy_cpu_dispatch.h'), - join('src', 'common', 'simd', 'simd.h'), - ] - - common_src = [ - join('src', 'common', 'array_assign.c'), - join('src', 'common', 'mem_overlap.c'), - join('src', 'common', 'npy_argparse.c'), - join('src', 'common', 'npy_hashtable.c'), - join('src', 'common', 'npy_longdouble.c'), - join('src', 'common', 'templ_common.h.src'), - join('src', 'common', 'ucsnarrow.c'), - join('src', 'common', 'ufunc_override.c'), - join('src', 
'common', 'numpyos.c'), - join('src', 'common', 'npy_cpu_features.c.src'), - ] - - if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0": - blas_info = get_info('blas_ilp64_opt', 2) - else: - blas_info = get_info('blas_opt', 0) - - have_blas = blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []) - - if have_blas: - extra_info = blas_info - # These files are also in MANIFEST.in so that they are always in - # the source distribution independently of HAVE_CBLAS. - common_src.extend([join('src', 'common', 'cblasfuncs.c'), - join('src', 'common', 'python_xerbla.c'), - ]) - else: - extra_info = {} - - ####################################################################### - # _multiarray_umath module - multiarray part # - ####################################################################### - - multiarray_deps = [ - join('src', 'multiarray', 'abstractdtypes.h'), - join('src', 'multiarray', 'arrayobject.h'), - join('src', 'multiarray', 'arraytypes.h'), - join('src', 'multiarray', 'arrayfunction_override.h'), - join('src', 'multiarray', 'array_coercion.h'), - join('src', 'multiarray', 'array_method.h'), - join('src', 'multiarray', 'npy_buffer.h'), - join('src', 'multiarray', 'calculation.h'), - join('src', 'multiarray', 'common.h'), - join('src', 'multiarray', 'common_dtype.h'), - join('src', 'multiarray', 'convert_datatype.h'), - join('src', 'multiarray', 'convert.h'), - join('src', 'multiarray', 'conversion_utils.h'), - join('src', 'multiarray', 'ctors.h'), - join('src', 'multiarray', 'descriptor.h'), - join('src', 'multiarray', 'dtypemeta.h'), - join('src', 'multiarray', 'dtype_transfer.h'), - join('src', 'multiarray', 'dragon4.h'), - join('src', 'multiarray', 'einsum_debug.h'), - join('src', 'multiarray', 'einsum_sumprod.h'), - join('src', 'multiarray', 'experimental_public_dtype_api.h'), - join('src', 'multiarray', 'getset.h'), - join('src', 'multiarray', 'hashdescr.h'), - join('src', 'multiarray', 'iterators.h'), - join('src', 'multiarray', 'legacy_dtype_implementation.h'), - join('src', 'multiarray', 'mapping.h'), - join('src', 'multiarray', 'methods.h'), - join('src', 'multiarray', 'multiarraymodule.h'), - join('src', 'multiarray', 'nditer_impl.h'), - join('src', 'multiarray', 'number.h'), - join('src', 'multiarray', 'refcount.h'), - join('src', 'multiarray', 'scalartypes.h'), - join('src', 'multiarray', 'sequence.h'), - join('src', 'multiarray', 'shape.h'), - join('src', 'multiarray', 'strfuncs.h'), - join('src', 'multiarray', 'typeinfo.h'), - join('src', 'multiarray', 'usertypes.h'), - join('src', 'multiarray', 'vdot.h'), - join('include', 'numpy', 'arrayobject.h'), - join('include', 'numpy', '_neighborhood_iterator_imp.h'), - join('include', 'numpy', 'npy_endian.h'), - join('include', 'numpy', 'arrayscalars.h'), - join('include', 'numpy', 'noprefix.h'), - join('include', 'numpy', 'npy_interrupt.h'), - join('include', 'numpy', 'npy_3kcompat.h'), - join('include', 'numpy', 'npy_math.h'), - join('include', 'numpy', 'halffloat.h'), - join('include', 'numpy', 'npy_common.h'), - join('include', 'numpy', 'npy_os.h'), - join('include', 'numpy', 'utils.h'), - join('include', 'numpy', 'ndarrayobject.h'), - join('include', 'numpy', 'npy_cpu.h'), - join('include', 'numpy', 'numpyconfig.h'), - join('include', 'numpy', 'ndarraytypes.h'), - join('include', 'numpy', 'npy_1_7_deprecated_api.h'), - # add library sources as distuils does not consider libraries - # dependencies - ] + npymath_sources - - multiarray_src = [ - join('src', 'multiarray', 'abstractdtypes.c'), - join('src', 
'multiarray', 'alloc.c'), - join('src', 'multiarray', 'arrayobject.c'), - join('src', 'multiarray', 'arraytypes.c.src'), - join('src', 'multiarray', 'array_coercion.c'), - join('src', 'multiarray', 'array_method.c'), - join('src', 'multiarray', 'array_assign_scalar.c'), - join('src', 'multiarray', 'array_assign_array.c'), - join('src', 'multiarray', 'arrayfunction_override.c'), - join('src', 'multiarray', 'buffer.c'), - join('src', 'multiarray', 'calculation.c'), - join('src', 'multiarray', 'compiled_base.c'), - join('src', 'multiarray', 'common.c'), - join('src', 'multiarray', 'common_dtype.c'), - join('src', 'multiarray', 'convert.c'), - join('src', 'multiarray', 'convert_datatype.c'), - join('src', 'multiarray', 'conversion_utils.c'), - join('src', 'multiarray', 'ctors.c'), - join('src', 'multiarray', 'datetime.c'), - join('src', 'multiarray', 'datetime_strings.c'), - join('src', 'multiarray', 'datetime_busday.c'), - join('src', 'multiarray', 'datetime_busdaycal.c'), - join('src', 'multiarray', 'descriptor.c'), - join('src', 'multiarray', 'dtypemeta.c'), - join('src', 'multiarray', 'dragon4.c'), - join('src', 'multiarray', 'dtype_transfer.c'), - join('src', 'multiarray', 'einsum.c.src'), - join('src', 'multiarray', 'einsum_sumprod.c.src'), - join('src', 'multiarray', 'experimental_public_dtype_api.c'), - join('src', 'multiarray', 'flagsobject.c'), - join('src', 'multiarray', 'getset.c'), - join('src', 'multiarray', 'hashdescr.c'), - join('src', 'multiarray', 'item_selection.c'), - join('src', 'multiarray', 'iterators.c'), - join('src', 'multiarray', 'legacy_dtype_implementation.c'), - join('src', 'multiarray', 'lowlevel_strided_loops.c.src'), - join('src', 'multiarray', 'mapping.c'), - join('src', 'multiarray', 'methods.c'), - join('src', 'multiarray', 'multiarraymodule.c'), - join('src', 'multiarray', 'nditer_templ.c.src'), - join('src', 'multiarray', 'nditer_api.c'), - join('src', 'multiarray', 'nditer_constr.c'), - join('src', 'multiarray', 'nditer_pywrap.c'), - join('src', 'multiarray', 'number.c'), - join('src', 'multiarray', 'refcount.c'), - join('src', 'multiarray', 'sequence.c'), - join('src', 'multiarray', 'shape.c'), - join('src', 'multiarray', 'scalarapi.c'), - join('src', 'multiarray', 'scalartypes.c.src'), - join('src', 'multiarray', 'strfuncs.c'), - join('src', 'multiarray', 'temp_elide.c'), - join('src', 'multiarray', 'typeinfo.c'), - join('src', 'multiarray', 'usertypes.c'), - join('src', 'multiarray', 'vdot.c'), - join('src', 'common', 'npy_sort.h.src'), - join('src', 'npysort', 'quicksort.c.src'), - join('src', 'npysort', 'mergesort.c.src'), - join('src', 'npysort', 'timsort.c.src'), - join('src', 'npysort', 'heapsort.c.src'), - join('src', 'npysort', 'radixsort.c.src'), - join('src', 'common', 'npy_partition.h.src'), - join('src', 'npysort', 'selection.c.src'), - join('src', 'common', 'npy_binsearch.h.src'), - join('src', 'npysort', 'binsearch.c.src'), - ] - - ####################################################################### - # _multiarray_umath module - umath part # - ####################################################################### - - def generate_umath_c(ext, build_dir): - target = join(build_dir, header_dir, '__umath_generated.c') - dir = os.path.dirname(target) - if not os.path.exists(dir): - os.makedirs(dir) - script = generate_umath_py - if newer(script, target): - with open(target, 'w') as f: - f.write(generate_umath.make_code(generate_umath.defdict, - generate_umath.__file__)) - return [] - - umath_src = [ - join('src', 'umath', 
'umathmodule.c'), - join('src', 'umath', 'reduction.c'), - join('src', 'umath', 'funcs.inc.src'), - join('src', 'umath', 'simd.inc.src'), - join('src', 'umath', 'loops.h.src'), - join('src', 'umath', 'loops_utils.h.src'), - join('src', 'umath', 'loops.c.src'), - join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), - join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), - join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), - join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), - join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), - join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), - join('src', 'umath', 'matmul.h.src'), - join('src', 'umath', 'matmul.c.src'), - join('src', 'umath', 'clip.h.src'), - join('src', 'umath', 'clip.c.src'), - join('src', 'umath', 'dispatching.c'), - join('src', 'umath', 'legacy_array_method.c'), - join('src', 'umath', 'ufunc_object.c'), - join('src', 'umath', 'extobj.c'), - join('src', 'umath', 'scalarmath.c.src'), - join('src', 'umath', 'ufunc_type_resolution.c'), - join('src', 'umath', 'override.c'), - # For testing. Eventually, should use public API and be separate: - join('src', 'umath', '_scaled_float_dtype.c'), - ] - - umath_deps = [ - generate_umath_py, - join('include', 'numpy', 'npy_math.h'), - join('include', 'numpy', 'halffloat.h'), - join('src', 'multiarray', 'common.h'), - join('src', 'multiarray', 'number.h'), - join('src', 'common', 'templ_common.h.src'), - join('src', 'umath', 'simd.inc.src'), - join('src', 'umath', 'override.h'), - join(codegen_dir, 'generate_ufunc_api.py'), - ] - - svml_path = join('numpy', 'core', 'src', 'umath', 'svml') - svml_objs = [] - if can_link_svml() and check_svml_submodule(svml_path): - svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) - - config.add_extension('_multiarray_umath', - sources=multiarray_src + umath_src + - common_src + - [generate_config_h, - generate_numpyconfig_h, - generate_numpy_api, - join(codegen_dir, 'generate_numpy_api.py'), - join('*.py'), - generate_umath_c, - generate_ufunc_api, - ], - depends=deps + multiarray_deps + umath_deps + - common_deps, - libraries=['npymath'], - extra_objects=svml_objs, - extra_info=extra_info) - - ####################################################################### - # umath_tests module # - ####################################################################### - - config.add_extension('_umath_tests', sources=[ - join('src', 'umath', '_umath_tests.c.src'), - join('src', 'umath', '_umath_tests.dispatch.c'), - join('src', 'common', 'npy_cpu_features.c.src'), - ]) - - ####################################################################### - # custom rational dtype module # - ####################################################################### - - config.add_extension('_rational_tests', - sources=[join('src', 'umath', '_rational_tests.c.src')]) - - ####################################################################### - # struct_ufunc_test module # - ####################################################################### - - config.add_extension('_struct_ufunc_tests', - sources=[join('src', 'umath', '_struct_ufunc_tests.c.src')]) - - - ####################################################################### - # operand_flag_tests module # - ####################################################################### - - config.add_extension('_operand_flag_tests', - sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) - - ####################################################################### - # SIMD module # - 
####################################################################### - - config.add_extension('_simd', sources=[ - join('src', 'common', 'npy_cpu_features.c.src'), - join('src', '_simd', '_simd.c'), - join('src', '_simd', '_simd_inc.h.src'), - join('src', '_simd', '_simd_data.inc.src'), - join('src', '_simd', '_simd.dispatch.c.src'), - ], depends=[ - join('src', 'common', 'npy_cpu_dispatch.h'), - join('src', 'common', 'simd', 'simd.h'), - join('src', '_simd', '_simd.h'), - join('src', '_simd', '_simd_inc.h.src'), - join('src', '_simd', '_simd_data.inc.src'), - join('src', '_simd', '_simd_arg.inc'), - join('src', '_simd', '_simd_convert.inc'), - join('src', '_simd', '_simd_easyintrin.inc'), - join('src', '_simd', '_simd_vector.inc'), - ]) - - config.add_subpackage('tests') - config.add_data_dir('tests/data') - config.add_data_dir('tests/examples') - config.add_data_files('*.pyi') - - config.make_svn_version_py() - - return config - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(configuration=configuration) diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig deleted file mode 100644 index 0938cd050f58..000000000000 --- a/numpy/core/src/umath/loops.h.src.orig +++ /dev/null @@ -1,667 +0,0 @@ -/* -*- c -*- */ -/* - * vim:syntax=c - */ - -#ifndef _NPY_UMATH_LOOPS_H_ -#define _NPY_UMATH_LOOPS_H_ - -#ifndef NPY_NO_EXPORT - #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#define BOOL_invert BOOL_logical_not -#define BOOL_add BOOL_logical_or -#define BOOL_bitwise_and BOOL_logical_and -#define BOOL_bitwise_or BOOL_logical_or -#define BOOL_logical_xor BOOL_not_equal -#define BOOL_bitwise_xor BOOL_logical_xor -#define BOOL_multiply BOOL_logical_and -#define BOOL_maximum BOOL_logical_or -#define BOOL_minimum BOOL_logical_and -#define BOOL_fmax BOOL_maximum -#define BOOL_fmin BOOL_minimum - - -/* - ***************************************************************************** - ** BOOLEAN LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - -NPY_NO_EXPORT void -BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - -/* - ***************************************************************************** - ** INTEGER LOOPS - ***************************************************************************** - */ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithmetic.dispatch.h" -#endif - -/**begin repeat - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - BYTE, SHORT, INT, LONG, LONGLONG# - */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) -/**end repeat**/ - -/**begin repeat - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ - -/**begin repeat1 - * both signed and unsigned integer types - * #s = , u# - * #S = , U# - */ - -#define @S@@TYPE@_floor_divide @S@@TYPE@_divide -#define @S@@TYPE@_fmax @S@@TYPE@_maximum -#define @S@@TYPE@_fmin @S@@TYPE@_minimum 
- -NPY_NO_EXPORT void -@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 - * Arithmetic - * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, - * left_shift, right_shift# - * #OP = +, -,*, &, |, ^, <<, >># - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -/**begin repeat3 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -/**begin repeat2 - * #kind = maximum, minimum# - * #OP = >, <# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -NPY_NO_EXPORT void -@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -/**end repeat1**/ - -/**end repeat**/ - -/* - ***************************************************************************** - ** FLOAT LOOPS ** - 
***************************************************************************** - */ -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_unary_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #kind = sqrt, absolute, square, reciprocal# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithm_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_umath_fp.dispatch.h" -#endif - -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #func = sin, cos# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat**/ - -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = maximum, minimum# - */ -NPY_NO_EXPORT void -@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_trigonometric.dispatch.h" -#endif -/**begin repeat - * #func = sin, cos# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( - char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) -)) -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_exponent_log.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * # kind = exp, log, frexp, ldexp# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( - char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) -)) -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #func = rint, ceil, floor, trunc# - */ - -/**begin repeat1 -* #TYPE = FLOAT, DOUBLE# -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat2 - * #isa = avx512f, fma# - */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Float types - * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# - * #c = f, f, , l# - * #C = F, F, , L# - */ - - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, 
logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# - * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# - **/ - -/**begin repeat2 - * #ISA = , _avx512_skx# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ -/**end repeat1**/ - -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - - -/* - ***************************************************************************** - ** COMPLEX LOOPS ** - ***************************************************************************** - */ -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithm_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = CFLOAT, 
CDOUBLE# - */ -/**begin repeat1 - * #kind = add, subtract, multiply# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) -/**end repeat1**/ -/**end repeat**/ - -#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); -#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); -#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); -#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); -#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); -#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); - -/**begin repeat - * complex types - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #c = f, , l# - * #C = F, , L# - */ - -/**begin repeat1 - * arithmetic - * #kind = add, subtract, multiply# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind= greater, greater_equal, less, less_equal, equal, not_equal# - * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - #kind = logical_and, logical_or# - #OP1 = ||, ||# - #OP2 = &&, ||# -*/ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**begin repeat1 - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #OP = ||, ||, &&# - **/ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * #isa = , _avx512f# - */ - -NPY_NO_EXPORT void -C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = CGE, CLE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = CGE, CLE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**end 
repeat**/ - -#undef CGE -#undef CLE -#undef CGT -#undef CLT -#undef CEQ -#undef CNE - -/* - ***************************************************************************** - ** DATETIME LOOPS ** - ***************************************************************************** - */ - -NPY_NO_EXPORT void -TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat - * #TYPE = DATETIME, TIMEDELTA# - */ - -NPY_NO_EXPORT void -@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -#define @TYPE@_isnan @TYPE@_isnat - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = ==, !=, >, >=, <, <=# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = maximum, minimum, fmin, fmax# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**end repeat**/ - -NPY_NO_EXPORT void -DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void 
-TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/* Special case equivalents to above functions */ -#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide -#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide -/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ - -/* - ***************************************************************************** - ** OBJECT LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = EQ, NE, GT, GE, LT, LE# - */ -/**begin repeat1 - * #suffix = , _OO_O# - */ -NPY_NO_EXPORT void -OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ -/**end repeat**/ - -NPY_NO_EXPORT void -OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); - -/* - ***************************************************************************** - ** END LOOPS ** - ***************************************************************************** - */ - -#endif From 2ff7ab64d4e7d5928e96ca95b85350aa9caa2b63 Mon Sep 17 00:00:00 2001 From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> Date: Thu, 18 Nov 2021 12:51:00 -0800 Subject: [PATCH 3/8] Reorganize NEON min/max implementation to be more generic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you @seiko2plus for the excellent example. We reorganized the code so that it can be reused for other architectures. Core implementations and unroll factors should be the same as before for ARM NEON. Beyond reorganizing, we've added default implementations using universal intrinsics for architectures without ARM NEON; a sketch of this default path follows below.
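To make the default path concrete, here is a minimal sketch of a contiguous float32 minimum loop written on top of NumPy's npyv_* universal intrinsics. This is an illustration, not the actual body of loops_minmax.dispatch.c.src: the function name sketch_float32_minimum is hypothetical, the include paths assume NumPy's internal source layout, and the vector body deliberately glosses over NaN propagation, which the real loops handle.

```
#include "numpy/npy_common.h"  // npy_intp, NPY_INLINE
#include "numpy/npy_math.h"    // npy_isnan
#include "simd/simd.h"         // npyv_* universal intrinsics (NPY_SIMD, npyv_f32, ...)

// Illustrative sketch only -- not the actual dispatch source.
static NPY_INLINE void
sketch_float32_minimum(const float *a, const float *b, float *out, npy_intp len)
{
    npy_intp i = 0;
#if NPY_SIMD
    // Vector body: whatever SIMD extension npyv_* maps to at build time
    // (NEON, SSE2, AVX2, VSX, ...). NaN handling is glossed over here.
    const int vstep = npyv_nlanes_f32;  // lanes per vector register
    for (; i + vstep <= len; i += vstep) {
        npyv_f32 va = npyv_load_f32(a + i);
        npyv_f32 vb = npyv_load_f32(b + i);
        npyv_store_f32(out + i, npyv_min_f32(va, vb));
    }
#endif
    // Scalar tail: the explicit isnan check returns a NaN in `a` directly,
    // and a NaN in `b` falls out of the failed comparison, so NaN in either
    // input propagates -- matching the C loops this patch replaces.
    for (; i < len; ++i) {
        out[i] = (a[i] < b[i] || npy_isnan(a[i])) ? a[i] : b[i];
    }
}
```

The actual dispatch source generalizes this pattern across the integer and floating-point types, keeps the unroll factors from the NEON version, and adds the NaN-aware maximum/fmax/fmin and reduction variants.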
Additionally, we've moved most min, max, fmin, fmax implementations to a new dispatchable source file: numpy/core/src/umath/loops_minmax.dispatch.c.src **Testing** - Apple silicon M1 native (arm64 / aarch64) -- No test failures - Apple silicon M1 Rosetta (x86_64) -- No new test failures - iMacPro1,1 (AVX512F) -- No test failures **Benchmarks** - Apple silicon M1 native (arm64 / aarch64) - Similar improvements as before reorg (comparison below) - x86_64 (both Apple silicon M1 Rosetta and iMacPro1,1 AVX512F) - Some x86_64 benchmarks are better, some are worse Apple silicon M1 native (arm64 / aarch64) comparison to original implementation / before reorg: ``` before after ratio [559ddede] [a3463b09] + 6.45±0.04μs 7.07±0.09μs 1.10 bench_lib.Nan.time_nanargmin(200, 0.1) + 32.1±0.3μs 35.2±0.2μs 1.10 bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'd') + 29.1±0.02μs 31.8±0.05μs 1.10 bench_core.Core.time_array_int_l1000 + 69.0±0.2μs 75.3±3μs 1.09 bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f') + 92.0±1μs 99.5±0.5μs 1.08 bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'd') + 9.29±0.1μs 9.99±0.5μs 1.08 bench_ma.UFunc.time_1d(True, True, 10) + 338±0.6μs 362±10μs 1.07 bench_function_base.Sort.time_sort('quick', 'int16', ('random',)) + 4.21±0.03μs 4.48±0.2μs 1.07 bench_core.CountNonzero.time_count_nonzero_multi_axis(3, 100, ) + 12.3±0.06μs 13.1±0.7μs 1.06 bench_function_base.Median.time_even_small + 1.27±0μs 1.35±0.06μs 1.06 bench_itemselection.PutMask.time_dense(False, 'float16') + 139±1ns 147±6ns 1.06 bench_core.Core.time_array_1 + 33.7±0.01μs 35.5±2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f') + 69.4±0.1μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'f') + 225±0.09μs 237±9μs 1.05 bench_random.Bounded.time_bounded('PCG64', [, 2047]) - 15.7±0.5μs 14.9±0.03μs 0.95 bench_core.CountNonzero.time_count_nonzero_axis(2, 10000, ) - 34.2±2μs 32.0±0.03μs 0.94 bench_ufunc_strides.Unary.time_ufunc(, 4, 2, 'f') - 1.03±0.05ms 955±3μs 0.92 bench_lib.Nan.time_nanargmax(200000, 50.0) - 6.97±0.08μs 6.43±0.02μs 0.92 bench_ma.UFunc.time_scalar(True, False, 10) - 5.41±0μs 4.98±0.01μs 0.92 bench_ufunc_strides.AVX_cmplx_arithmetic.time_ufunc('subtract', 2, 'F') - 22.4±0.01μs 20.6±0.02μs 0.92 bench_core.Core.time_array_float64_l1000 - 1.51±0.01ms 1.38±0ms 0.92 bench_core.CorrConv.time_correlate(1000, 10000, 'same') - 10.1±0.2μs 9.27±0.09μs 0.92 bench_ufunc.UFunc.time_ufunc_types('invert') - 8.50±0.02μs 7.80±0.09μs 0.92 bench_indexing.ScalarIndexing.time_assign_cast(1) - 29.5±0.2μs 26.6±0.03μs 0.90 bench_ma.Concatenate.time_it('masked', 100) - 2.09±0.02ms 1.87±0ms 0.90 bench_ma.UFunc.time_2d(True, True, 1000) - 298±10μs 267±0.3μs 0.89 bench_app.MaxesOfDots.time_it - 10.7±0.2μs 9.60±0.02μs 0.89 bench_ma.UFunc.time_1d(True, True, 100) - 567±3μs 505±2μs 0.89 bench_lib.Nan.time_nanargmax(200000, 90.0) - 342±0.9μs 282±5μs 0.83 bench_lib.Nan.time_nanargmax(200000, 2.0) - 307±0.7μs 244±0.8μs 0.80 bench_lib.Nan.time_nanargmax(200000, 0.1) - 309±1μs 241±0.1μs 0.78 bench_lib.Nan.time_nanargmax(200000, 0) ``` --- numpy/core/code_generators/generate_umath.py | 10 +- numpy/core/src/common/simd/neon/math.h | 66 -- numpy/core/src/umath/loops.c.src | 89 -- numpy/core/src/umath/loops.h.src | 14 - numpy/core/src/umath/loops.h.src.orig | 717 +++++++++++++ .../src/umath/loops_minmax.dispatch.c.src | 967 +++++++----------- 6 files changed, 1105 insertions(+), 758 deletions(-) create mode 100644 numpy/core/src/umath/loops.h.src.orig diff --git a/numpy/core/code_generators/generate_umath.py 
b/numpy/core/code_generators/generate_umath.py index c20468a8f58b..9fa87a11e0dc 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,16 +516,14 @@ def english_upper(s): Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -539,7 +537,7 @@ def english_upper(s): Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -547,7 +545,7 @@ def english_upper(s): Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 32b5bc89ec8b..19ea6f22fab0 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,70 +153,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } -#define npyv_reduce_max_u8 vmaxvq_u8 -#define npyv_reduce_max_s8 vmaxvq_s8 -#define npyv_reduce_max_u16 vmaxvq_u16 -#define npyv_reduce_max_s16 vmaxvq_s16 -#define npyv_reduce_max_u32 vmaxvq_u32 -#define npyv_reduce_max_s32 vmaxvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a > b) ? a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a > b) ? a : b; - return result; -} -#define npyv_reduce_max_f32 vmaxvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_max_f64 vmaxvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_maxp_f32 vmaxnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_maxp_f64 vmaxnmvq_f64 -#endif // NPY_SIMD_F64 - -#define npyv_reduce_min_u8 vminvq_u8 -#define npyv_reduce_min_s8 vminvq_s8 -#define npyv_reduce_min_u16 vminvq_u16 -#define npyv_reduce_min_s16 vminvq_s16 -#define npyv_reduce_min_u32 vminvq_u32 -#define npyv_reduce_min_s32 vminvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a < b) ? 
a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a < b) ? a : b; - return result; -} -#define npyv_reduce_min_f32 vminvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_min_f64 vminvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_minp_f32 vminnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_minp_f64 vminnmvq_f64 -#endif // NPY_SIMD_F64 - #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 950ea011ef4e..fa784401473e 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,7 +13,6 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" -#include "loops.h" #include "npy_pycompat.h" @@ -551,7 +550,6 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -726,34 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >, <# - **/ - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - io1 = (io1 @OP@ in2) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2; - } - } -} - -/**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ - NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1597,7 +1567,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1720,65 +1689,7 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? 
in1 : in2; - *((@type@ *)op1) = in1; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**end repeat1**/ - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} /**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index aab1b5bbb8fc..90115006fc1c 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -22,14 +22,6 @@ #define BOOL_fmax BOOL_maximum #define BOOL_fmin BOOL_minimum -/* - * The ARM NEON implementation of min/max for longdouble, longlong, and - * ulonglong assume that they're all 64-bits. If that's not true, then we'll - * use the scalar implementations instead. - */ -#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) -#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) - /* ***************************************************************************** ** BOOLEAN LOOPS ** @@ -680,32 +672,26 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /**begin repeat * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ -#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ //---------- Float ---------- /**begin repeat * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# */ - #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum, fmax, fmin# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ /* diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000000..aab1b5bbb8fc --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,717 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define BOOL_logical_xor BOOL_not_equal +#define 
BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. + */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, >># + */ +NPY_NO_EXPORT void 
+@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + 
+/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void 
+@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void +@TYPE@__ones_like(char 
**args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + +NPY_NO_EXPORT void +OBJECT_sign(char 
**args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 06aed1bb736a..c2eddcee6972 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** neon + ** neon asimd **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -13,659 +13,460 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************* + ** Scalar intrinsics + ******************************************************************************/ +// signed/unsigned int +#define scalar_max_i(A, B) ((A > B) ? A : B) +#define scalar_min_i(A, B) ((A < B) ? A : B) +// fp, propagates NaNs +#if !defined(__aarch64__) +#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) +#define scalar_max_d scalar_max_f +#define scalar_max_l scalar_max_f +#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? 
A : B)
+#define scalar_min_d scalar_min_f
+#define scalar_min_l scalar_min_f
+// fp, ignores NaNs
+#define scalar_maxp_f fmaxf
+#define scalar_maxp_d fmax
+#define scalar_maxp_l fmaxl
+#define scalar_minp_f fminf
+#define scalar_minp_d fmin
+#define scalar_minp_l fminl
+#elif defined(__aarch64__)
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #scalar_sfx = f, d, l#
+ * #asm_sfx = s, d, d#
+ */
+/**begin repeat1
+ * #op = max, min, maxp, minp#
+ * #asm_instr = fmax, fmin, fmaxnm, fminnm#
+ */
+static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
+    @type@ result = 0;
+    __asm(
+        "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+/**end repeat1**/
+/**end repeat**/
+#endif // !defined(__aarch64__)

-//##############################################################################
-//## Maximum, Minimum
-//##############################################################################
-
-#ifdef NPY_HAVE_NEON
-//******************************************************************************
-//** NEON Min / Max
-//******************************************************************************
-
-
-//------------------------------------------------------------------------------
-//-- Integer
-//------------------------------------------------------------------------------

+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+#if NPY_SIMD
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #is_64 = 0*6, 1*2#
+ */
+#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__)
+    #if !@is_64@
+        #define npyv_reduce_min_@sfx@ vminvq_@sfx@
+        #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@
+    #else
+        NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v)
+        {
+            npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
+            npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
+            npyv_lanetype_@sfx@ result = (a < b) ? a : b;
+            return result;
+        }
+        NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v)
+        {
+            npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
+            npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
+            npyv_lanetype_@sfx@ result = (a > b) ? a : b;
+            return result;
+        }
+    #endif // !@is_64@
+#else
+    /**begin repeat1
+     * #intrin = min, max#
+     */
+    NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
+    {
+        npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+        npyv_storea_@sfx@(s, v);
+        npyv_lanetype_@sfx@ result = s[0];
+        for(int i=1; i<npyv_nlanes_@sfx@; ++i){
+            result = scalar_@intrin@_i(result, s[i]);
+        }
+        return result;
+    }
+    /**end repeat1**/
+#endif
+/**end repeat**/
+#endif // NPY_SIMD

-/**begin repeat1
-* # kind = maximum, minimum#
-* # OP = >, <#
-* # vop = max, min#
-*/
-
-#define SCALAR_OP @TYPE@_scalar_@vop@
-static inline @type@ SCALAR_OP(@type@ a, @type@ b){
-    return ((a @OP@ b) ? a : b);
-}
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1, 1#
+ * #scalar_sfx = i*8, f, d#
+ */
+/**begin repeat1
+ * # intrin = max, min, maxp, minp#
+ * # fp_only = 0, 0, 1, 1#
+ */
+#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
+#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
+#if @is_fp@ && !@fp_only@
+    #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
+#else
+    #define V_INTRIN npyv_@intrin@_@sfx@
+    #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@
+#endif

-static inline npy_intp
-simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
+// contiguous input.
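+// An 8x unrolled reduction: eight vector accumulators are updated
+// independently per iteration and merged pairwise at the end, so the hot
+// loop carries no serial dependency from one element to the next.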
+static inline void +simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len) { - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - + // Note, 8x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + npy_intp i = 0; + npyv_lanetype_@sfx@ io1 = op1[0]; + // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - #define NPYV_CAST (const npyv_lanetype_@sfx@ *) - npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + if((i+elemPerLoop) <= len){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + m0 = V_INTRIN(m0, v0); + m1 = V_INTRIN(m1, v1); + m2 = V_INTRIN(m2, v2); + m3 = V_INTRIN(m3, v3); + m4 = V_INTRIN(m4, v4); + m5 = V_INTRIN(m5, v5); + m6 = V_INTRIN(m6, v6); + m7 = V_INTRIN(m7, v7); } - #undef 
NPYV_CAST
+        m0 = V_INTRIN(m0, m1);
+        m2 = V_INTRIN(m2, m3);
+        m4 = V_INTRIN(m4, m5);
+        m6 = V_INTRIN(m6, m7);

-        m0 = npyv_@vop@_@sfx@(m0, m1);
-        m2 = npyv_@vop@_@sfx@(m2, m3);
-        m4 = npyv_@vop@_@sfx@(m4, m5);
-        m6 = npyv_@vop@_@sfx@(m6, m7);
+        m0 = V_INTRIN(m0, m2);
+        m4 = V_INTRIN(m4, m6);

-        m0 = npyv_@vop@_@sfx@(m0, m2);
-        m4 = npyv_@vop@_@sfx@(m4, m6);
+        m0 = V_INTRIN(m0, m4);

-        m0 = npyv_@vop@_@sfx@(m0, m4);
-
-        @type@ r = npyv_reduce_@vop@_@sfx@(m0);
+        npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0);

         io1 = SCALAR_OP(io1, r);
     }
-    *((@type@ *)iop1) = io1;
-
-    return i;
-}
-
-static inline npy_intp
-scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
-{
-    @type@ *iop1 = (@type@ *)args[0];
-    @type@ io1 = iop1[0];
-    char *ip2 = args[1];
-    npy_intp is2 = steps[1];
-    npy_intp n = dimensions[0];
-
-    // 8x scalar
-    npy_intp elemPerLoop = 8;
-    if((i+elemPerLoop) <= n){
-        @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
-        @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
-        @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
-        @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
-        @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
-        @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
-        @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
-        @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));
-
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
-            @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
-            @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
-            @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
-            @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
-            @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
-            @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
-            @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
-            @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
-
-            m0 = SCALAR_OP(m0, v0);
-            m1 = SCALAR_OP(m1, v1);
-            m2 = SCALAR_OP(m2, v2);
-            m3 = SCALAR_OP(m3, v3);
-            m4 = SCALAR_OP(m4, v4);
-            m5 = SCALAR_OP(m5, v5);
-            m6 = SCALAR_OP(m6, v6);
-            m7 = SCALAR_OP(m7, v7);
-        }
-
-        m0 = SCALAR_OP(m0, m1);
-        m2 = SCALAR_OP(m2, m3);
-        m4 = SCALAR_OP(m4, m5);
-        m6 = SCALAR_OP(m6, m7);
-
-        m0 = SCALAR_OP(m0, m2);
-        m4 = SCALAR_OP(m4, m6);
-
-        m0 = SCALAR_OP(m0, m4);
-
-        io1 = SCALAR_OP(io1, m0);
-    }
-
-    *((@type@ *)iop1) = io1;
-
-    return i;
-}
-
-static inline void
-run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    @type@ *iop1 = (@type@ *)args[0];
-    @type@ io1 = iop1[0];
-    char *ip2 = args[1];
-    npy_intp is2 = steps[1];
-    npy_intp n = dimensions[0];
-    npy_intp i;
-
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    i = 0;
-
-    if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){
-        // SIMD - do as many iterations as we can with vectors
-        i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i);
-    } else{
-        // Unrolled scalar - do as many iterations as we can with unrolled loops
-        i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i);
-    }
-    // Scalar - finish up any remaining iterations
-    io1 = iop1[0];
-    ip2 += i * is2;
-    for(; i<n; ++i, ip2 += is2){
-        io1 = SCALAR_OP(io1, *(@type@ *)ip2);
-    }
-    *((@type@ *)iop1) = io1;
-}
-
-/**begin repeat1
-* # kind = maximum, minimum, fmax, fmin#
-* # OP = >=, <=, >=, <=#
-* # vop = max, min, maxp, minp#
-* # asmInstr = fmax, fmin, fmaxnm, fminnm#
-* # PROPAGATE_NAN = 1, 1, 0, 0#
-*/
-
-#define SCALAR_OP @TYPE@_scalar_@vop@
-static inline @type@ SCALAR_OP(@type@ a, @type@ b){
-#ifdef __aarch64__
-    @type@ result = 0;
-    __asm(
-        "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]"
-        : [result] "=w" (result)
-        : [a] "w" (a), [b] "w" (b)
-    );
-    return result;
-#else
-
-#if @PROPAGATE_NAN@ - return ((a @OP@ b || npy_isnan(a)) ? a : b); -#else - return ((a @OP@ b || npy_isnan(b)) ? a : b); + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigend@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ #endif -#endif // __aarch64__ -} -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); +/**begin repeat1 + * # kind = maximum, minimum, fmax, fmin# + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#if !@fp_only@ || (@is_fp@ && @fp_only@) +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], + len = dimensions[0]; + npy_intp i = 0; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (IS_BINARY_REDUCE) { + // reduce and contiguous + if (is2 == sizeof(@type@)) { + TO_SIMD_SFX(simd_reduce_c_@intrin@)( + (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); - - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); - - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); - - io1 = SCALAR_OP(io1, r); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp 
-scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); + else if (!is_mem_overlap(ip1, is1, op1, os1, len) && + !is_mem_overlap(ip2, is2, op1, os1, len) + ) { + // no overlap and operands are binary contiguous + if (IS_BINARY_CONT(@type@, @type@)) { + TO_SIMD_SFX(simd_binary_ccc_@intrin@)( + (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); +#endif // TO_SIMD_SFX +#ifndef NPY_DISABLE_OPTIMIZATION + // scalar unrolls + if (IS_BINARY_REDUCE) { + // Note, 8x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= len){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i 
+ 5) * is2));
+            @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
+            @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
+
+            m0 = SCALAR_OP(m0, v0);
+            m1 = SCALAR_OP(m1, v1);
+            m2 = SCALAR_OP(m2, v2);
+            m3 = SCALAR_OP(m3, v3);
+            m4 = SCALAR_OP(m4, v4);
+            m5 = SCALAR_OP(m5, v5);
+            m6 = SCALAR_OP(m6, v6);
+            m7 = SCALAR_OP(m7, v7);
+        }
+
+        m0 = SCALAR_OP(m0, m1);
+        m2 = SCALAR_OP(m2, m3);
+        m4 = SCALAR_OP(m4, m5);
+        m6 = SCALAR_OP(m6, m7);
+
+        m0 = SCALAR_OP(m0, m2);
+        m4 = SCALAR_OP(m4, m6);
+
+        m0 = SCALAR_OP(m0, m4);
+
+        *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0);
+    }
 }
 else{
-    // Unrolled scalar - do as many iterations as we can with unrolled loops
-    i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i);
-}
-
-// Scalar - finish up any remaining iterations
-io1 = iop1[0];
-ip2 += i * is2;
-for(; i<n; ++i, ip2 += is2){
-    io1 = SCALAR_OP(io1, *(@type@ *)ip2);
-}
-*((@type@ *)iop1) = io1;
-}

From: Developer-Ecosystem-Engineering
 <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Thu, 18 Nov 2021 14:31:18 -0800
Subject: [PATCH 4/8] Delete .orig file, unnecessary

---
 numpy/core/src/umath/loops.h.src.orig | 717 --------------------------
 1 file changed, 717 deletions(-)
 delete mode 100644 numpy/core/src/umath/loops.h.src.orig

diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig
deleted file mode 100644
index aab1b5bbb8fc..000000000000
--- a/numpy/core/src/umath/loops.h.src.orig
+++ /dev/null
@@ -1,717 +0,0 @@
-/* -*- c -*- */
-/*
- * vim:syntax=c
- */
-
-#ifndef _NPY_UMATH_LOOPS_H_
-#define _NPY_UMATH_LOOPS_H_
-
-#ifndef NPY_NO_EXPORT
-    #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
-#endif
-
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
-/*
- * The ARM NEON implementation of min/max for longdouble, longlong, and
- * ulonglong assumes that they're all 64-bit. If that's not true, then we'll
- * use the scalar implementations instead.
- */ -#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) -#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) - -/* - ***************************************************************************** - ** BOOLEAN LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - -NPY_NO_EXPORT void -BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - -/* - ***************************************************************************** - ** INTEGER LOOPS - ***************************************************************************** - */ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithmetic.dispatch.h" -#endif - -/**begin repeat - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - BYTE, SHORT, INT, LONG, LONGLONG# - */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) -/**end repeat**/ - -/**begin repeat - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ - -/**begin repeat1 - * both signed and unsigned integer types - * #s = , u# - * #S = , U# - */ - -#define @S@@TYPE@_floor_divide @S@@TYPE@_divide -#define @S@@TYPE@_fmax @S@@TYPE@_maximum -#define @S@@TYPE@_fmin @S@@TYPE@_minimum - -NPY_NO_EXPORT void -@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 - * Arithmetic - * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, - * left_shift, right_shift# - * #OP = +, -,*, &, |, ^, <<, >># - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -/**begin repeat3 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)); - -/**end repeat3**/ - -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -/**begin repeat2 - * #kind = maximum, minimum# - * #OP = >, <# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -NPY_NO_EXPORT void -@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ - -/**end repeat1**/ - -/**end repeat**/ - -/* - ***************************************************************************** - ** FLOAT LOOPS ** - ***************************************************************************** - */ -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_unary_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #kind = sqrt, absolute, square, reciprocal# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithm_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_umath_fp.dispatch.h" -#endif - -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #func = sin, cos# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat**/ - -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = maximum, minimum# - */ -NPY_NO_EXPORT void -@TYPE@_@func@_avx512f(char 
**args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat1**/ -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_trigonometric.dispatch.h" -#endif -/**begin repeat - * #func = sin, cos# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( - char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) -)) -/**end repeat**/ - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_exponent_log.dispatch.h" -#endif -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * # kind = exp, log, frexp, ldexp# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( - char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) -)) -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #func = rint, ceil, floor, trunc# - */ - -/**begin repeat1 -* #TYPE = FLOAT, DOUBLE# -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat2 - * #isa = avx512f, fma# - */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Float types - * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# - * #c = f, f, , l# - * #C = F, F, , L# - */ - - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# - * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# - **/ - -/**begin repeat2 - * #ISA = , _avx512_skx# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ -/**end repeat1**/ - -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_square(char **args, 
npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - - -/* - ***************************************************************************** - ** COMPLEX LOOPS ** - ***************************************************************************** - */ -#ifndef NPY_DISABLE_OPTIMIZATION - #include "loops_arithm_fp.dispatch.h" -#endif -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - */ -/**begin repeat1 - * #kind = add, subtract, multiply# - */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) -/**end repeat1**/ -/**end repeat**/ - -#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); -#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); -#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); -#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); -#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); -#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); - -/**begin repeat - * complex types - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #c = f, , l# - * #C = F, , L# - */ - -/**begin repeat1 - * arithmetic - * #kind = add, subtract, multiply# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind= greater, greater_equal, less, less_equal, equal, not_equal# - * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - #kind = logical_and, logical_or# - #OP1 = ||, ||# - #OP2 = &&, ||# -*/ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT 
void -C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**begin repeat1 - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #OP = ||, ||, &&# - **/ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * #isa = , _avx512f# - */ - -NPY_NO_EXPORT void -C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -/**end repeat1**/ - -NPY_NO_EXPORT void -C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = CGE, CLE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = CGE, CLE# - */ -NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**end repeat**/ - -#undef CGE -#undef CLE -#undef CGT -#undef CLT -#undef CEQ -#undef CNE - -/* - ***************************************************************************** - ** DATETIME LOOPS ** - ***************************************************************************** - */ - -NPY_NO_EXPORT void -TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat - * #TYPE = DATETIME, TIMEDELTA# - */ - -NPY_NO_EXPORT void -@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -#define @TYPE@_isnan @TYPE@_isnat - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = ==, !=, >, >=, <, <=# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * #kind = maximum, minimum, fmin, fmax# - **/ 
-NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**end repeat**/ - -NPY_NO_EXPORT void -DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/* Special case equivalents to above functions */ -#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide -#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide -/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ - -/* - ***************************************************************************** - ** OBJECT LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = EQ, NE, GT, GE, LT, LE# - */ -/**begin repeat1 - * #suffix = , _OO_O# - */ -NPY_NO_EXPORT void -OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ -/**end repeat**/ - -NPY_NO_EXPORT void -OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); - -/* - ***************************************************************************** - ** MIN/MAX LOOPS ** - ***************************************************************************** - */ - -#ifndef 
NPY_DISABLE_OPTIMIZATION
-    #include "loops_minmax.dispatch.h"
-#endif
-
-//---------- Integers ----------
-
-/**begin repeat
- * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG#
- * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2#
- */
-#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
-/**begin repeat1
- * #kind = maximum, minimum#
- */
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
-    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
-/**end repeat1**/
-#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
-/**end repeat**/
-
-//---------- Float ----------
-
- /**begin repeat
-  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
-  * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE#
-  */
- #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
-/**begin repeat1
- * #kind = maximum, minimum, fmax, fmin#
- */
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
-    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
-/**end repeat1**/
-#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@
-/**end repeat**/
-
-/*
- *****************************************************************************
- **                          END LOOPS                                      **
- *****************************************************************************
- */
-
-#endif

From 0d171288ecd9ccfee739b15faa58de8243ae4a53 Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering
 <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Mon, 13 Dec 2021 12:55:07 -0800
Subject: [PATCH 5/8] Integrate requested changes, improve scalar operations,
 address Linux aarch64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We've incorporated the changes requested in review for the scalar operations.

**Testing**
- Apple silicon M1 native (arm64 / aarch64) -- No test failures
- Apple silicon M1 Rosetta (x86_64) -- No new test failures
- iMacPro1,1 (AVX512F) -- No test failures
- Ubuntu VM (aarch64) -- No test failures

**Benchmarks**

Again, Apple silicon M1 native (arm64 / aarch64) looks similar to the original
patch (comparison below). x86_64 (both Apple silicon M1 Rosetta and iMacPro1,1
AVX512F) shows mixed results: some benchmarks are better, some worse. Compared
to the previous re-org, however, we see improvements.
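The scalar rework is mostly about NaN handling, so as a reminder of the
contract these kernels preserve (an illustrative sketch only, not code from
this patch): `np.maximum`/`np.minimum` propagate NaNs, while `np.fmax`/`np.fmin`
follow C99 `fmax`/`fmin` and ignore them. The hypothetical `naive_maximum`
below mirrors the `scalar_max_f` macro in the diff:

```c
#include <math.h>
#include <stdio.h>

/* Same shape as the scalar_max_f macro: a NaN on either side wins. */
static double naive_maximum(double a, double b) {
    return (a >= b || isnan(a)) ? a : b;
}

int main(void) {
    printf("maximum(NaN, 1.0) = %g\n", naive_maximum(NAN, 1.0)); /* nan */
    printf("fmax(NaN, 1.0)    = %g\n", fmax(NAN, 1.0));          /* 1   */
    return 0;
}
```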
Apple silicon M1 native (arm64 / aarch64) comparison to previous commit: ``` before after ratio [8b01e839] [18565b27] + 176±0.2μs 196±1μs 1.11 bench_function_base.Sort.time_sort('heap', 'int16', ('ordered',)) + 234±0.2μs 261±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f') + 43.4±0.4μs 48.3±0.4μs 1.11 bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',)) + 22.5±0.1μs 25.1±0.3μs 1.11 bench_shape_base.Block2D.time_block2d((512, 512), 'uint8', (2, 2)) + 4.75±0.05μs 5.28±0.07μs 1.11 bench_ma.UFunc.time_scalar(True, True, 1000) + 224±0.2μs 248±0.9μs 1.11 bench_ufunc_strides.Unary.time_ufunc(, 1, 1, 'f') + 233±0.5μs 258±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(, 4, 2, 'f') + 8.81±0.02μs 9.72±0.1μs 1.10 bench_shape_base.Block2D.time_block2d((32, 32), 'uint16', (2, 2)) + 8.71±0.1μs 9.58±0.3μs 1.10 bench_indexing.ScalarIndexing.time_assign_cast(2) + 96.2±0.03μs 105±3μs 1.09 bench_ufunc_strides.Unary.time_ufunc(, 1, 1, 'd') + 20.2±0.1μs 22.0±0.5μs 1.09 bench_shape_base.Block.time_block_simple_row_wise(100) + 469±4μs 510±7μs 1.09 bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'd') + 43.9±0.02μs 46.4±2μs 1.06 bench_function_base.Median.time_odd_inplace + 4.75±0μs 5.02±0.2μs 1.06 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'int64') - 16.4±0.07μs 15.6±0.4μs 0.95 bench_ufunc.UFunc.time_ufunc_types('left_shift') - 127±6μs 120±0.1μs 0.94 bench_ufunc.UFunc.time_ufunc_types('deg2rad') - 10.9±0.5μs 10.3±0.01μs 0.94 bench_function_base.Sort.time_sort('merge', 'int64', ('reversed',)) - 115±5μs 108±0.2μs 0.94 bench_function_base.Bincount.time_bincount - 17.0±0.4μs 15.9±0.03μs 0.94 bench_ufunc.UFunc.time_ufunc_types('right_shift') - 797±30ns 743±0.5ns 0.93 bench_ufunc.ArgParsingReduce.time_add_reduce_arg_parsing((array([0., 1.]), axis=0)) - 18.4±1μs 17.2±0.04μs 0.93 bench_core.CountNonzero.time_count_nonzero_multi_axis(3, 10000, ) - 241±7μs 224±0.3μs 0.93 bench_ufunc_strides.Unary.time_ufunc(, 2, 1, 'f') - 105±1μs 96.7±0.02μs 0.92 bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'f') - 23.3±0.2μs 21.4±0.02μs 0.92 bench_lib.Pad.time_pad((1, 1, 1, 1, 1), 1, 'edge') - 833±20μs 766±2μs 0.92 bench_ufunc_strides.Unary.time_ufunc(, 1, 1, 'd') - 86.8±4μs 79.5±0.4μs 0.92 bench_ufunc.UFunc.time_ufunc_types('conjugate') - 2.58±0.1μs 2.36±0μs 0.91 bench_ufunc.CustomScalar.time_divide_scalar2() - 102±4μs 92.8±0.7μs 0.91 bench_ufunc.UFunc.time_ufunc_types('logical_not') - 46.6±0.4μs 42.1±0.07μs 0.90 bench_ufunc_strides.Unary.time_ufunc(, 4, 1, 'd') - 158±0.7μs 142±0.07μs 0.90 bench_lib.Pad.time_pad((4, 4, 4, 4), 1, 'linear_ramp') - 729±6μs 657±1μs 0.90 bench_ufunc_strides.Unary.time_ufunc(, 4, 4, 'f') - 63.6±0.9μs 56.2±1μs 0.88 bench_ufunc_strides.Unary.time_ufunc(, 2, 4, 'd') - 730±40μs 605±3μs 0.83 bench_lib.Pad.time_pad((1024, 1024), 1, 'reflect') SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY. PERFORMANCE DECREASED. ``` --- .../src/umath/loops_minmax.dispatch.c.src | 68 +++++++++++++++---- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index c2eddcee6972..891be445e437 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,8 @@ /*@targets ** $maxopt baseline ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -20,7 +22,6 @@ #define scalar_max_i(A, B) ((A > B) ? A : B) #define scalar_min_i(A, B) ((A < B) ? 
A : B)
 
 // fp, propagates NaNs
-#if !defined(__aarch64__)
 #define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B)
 #define scalar_max_d scalar_max_f
 #define scalar_max_l scalar_max_f
@@ -34,28 +35,61 @@
 #define scalar_minp_f fminf
 #define scalar_minp_d fmin
 #define scalar_minp_l fminl
-#elif defined(__aarch64__)
+
+// special optimization for fp scalars that propagates NaNs,
+// since C99 has no support for it
+#ifndef NPY_DISABLE_OPTIMIZATION
 /**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #scalar_sfx = f, d, l#
- * #asm_sfx = s, d, d#
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #c_sfx = f, d#
+ * #isa_sfx = s, d#
+ * #sse_type = __m128, __m128d#
  */
 /**begin repeat1
- * #op = max, min, maxp, minp#
- * #asm_instr = fmax, fmin, fmaxnm, fminnm#
+ * #op = max, min#
+ * #neon_instr = fmax, fmin#
  */
-static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
+#ifdef NPY_HAVE_SSE2
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+    @sse_type@ va = _mm_set_s@isa_sfx@(a);
+    @sse_type@ vb = _mm_set_s@isa_sfx@(b);
+    @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
+    // x86 min/max return the second operand when one input is NaN,
+    // so blend the result back with `a` to propagate its NaN
+    @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
+    #ifdef NPY_HAVE_SSE41
+        rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
+    #else
+        rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
+    #endif
+    return _mm_cvts@isa_sfx@_@sfx@(rv);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
     @type@ result = 0;
     __asm(
-        "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]"
+        "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
         : [result] "=w" (result)
         : [a] "w" (a), [b] "w" (b)
     );
     return result;
 }
+#endif // __aarch64__
 /**end repeat1**/
 /**end repeat**/
-#endif // !defined(__aarch64__)
+#endif // NPY_DISABLE_OPTIMIZATION
 
+// map long double ops to double when both types have the same size
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
 /**begin repeat
+ * #op = max, min, maxp, minp#
+ */
+    #undef scalar_@op@_l
+    #define scalar_@op@_l scalar_@op@_d
+/**end repeat**/
+#endif
 
 /*******************************************************************************
  ** extra SIMD intrinsics
@@ -92,7 +126,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
  */
 NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
 {
-    npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+    npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
     npyv_storea_@sfx@(s, v);
     npyv_lanetype_@sfx@ result = s[0];
     for(int i=1; i<npyv_nlanes_@sfx@; ++i) {

From: Sayed Adel
Date: Fri, 31 Dec 2021 04:15:14 +0200
Subject: [PATCH 6/8] BENCH: cover integer max/min and fmax/fmin tests

also covers variants of stride sizes

---
 benchmarks/benchmarks/bench_reduce.py        | 18 ++++++++--
 benchmarks/benchmarks/bench_ufunc_strides.py | 35 ++++++++++++++------
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
index 7b05f4fcce31..81316c492327 100644
--- a/benchmarks/benchmarks/bench_reduce.py
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -46,7 +46,8 @@ def time_any_slow(self):
 
 class MinMax(Benchmark):
-    params = [np.float32, np.float64, np.intp]
+    params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
+              np.int64, np.uint64, np.float32, np.float64, np.intp]
     param_names = ['dtype']
 
     def setup(self, dtype):
@@ -58,8 +59,21 @@ def time_min(self, dtype):
     def time_max(self, dtype):
         np.max(self.d)
 
+class
FMinMax(Benchmark): + params = [np.float32, np.float64] + param_names = ['dtype'] + + def setup(self, dtype): + self.d = np.ones(20000, dtype=dtype) + + def time_min(self, dtype): + np.fmin.reduce(self.d) + + def time_max(self, dtype): + np.fmax.reduce(self.d) + class ArgMax(Benchmark): - params = [np.float32, bool] + params = [np.float32, np.float64, bool] param_names = ['dtype'] def setup(self, dtype): diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py index 75aa510a6b81..b751e4804238 100644 --- a/benchmarks/benchmarks/bench_ufunc_strides.py +++ b/benchmarks/benchmarks/bench_ufunc_strides.py @@ -44,27 +44,40 @@ def setup(self, stride, dtype): def time_log(self, stride, dtype): np.log(self.arr[::stride]) -avx_bfuncs = ['maximum', - 'minimum'] -class AVX_BFunc(Benchmark): +binary_ufuncs = [ + 'maximum', 'minimum', 'fmax', 'fmin' +] +binary_dtype = ['f', 'd'] - params = [avx_bfuncs, dtype, stride] - param_names = ['avx_based_bfunc', 'dtype', 'stride'] +class Binary(Benchmark): + param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype'] + params = [binary_ufuncs, stride, stride, stride_out, binary_dtype] timeout = 10 - def setup(self, ufuncname, dtype, stride): + def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype): np.seterr(all='ignore') try: self.f = getattr(np, ufuncname) except AttributeError: raise NotImplementedError(f"No ufunc {ufuncname} found") from None - N = 10000 - self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype) - self.arr2 = np.array(np.random.rand(stride*N), dtype=dtype) + N = 100000 + self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype) + self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype) + self.arr_out = np.empty(stride_out*N, dtype) - def time_ufunc(self, ufuncname, dtype, stride): - self.f(self.arr1[::stride], self.arr2[::stride]) + def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype): + self.f(self.arr1[::stride_in0], self.arr2[::stride_in1], + self.arr_out[::stride_out]) + + +binary_int_ufuncs = ['maximum', 'minimum'] +binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q'] + +class BinaryInt(Binary): + + param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype'] + params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype] class AVX_ldexp(Benchmark): From 1b55815e90009beaa064856b57575c267f297699 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 31 Dec 2021 04:16:00 +0200 Subject: [PATCH 7/8] ENH: remove raw x86 SIMD of max/min --- numpy/core/src/umath/loops.c.src | 33 ------ numpy/core/src/umath/simd.inc.src | 188 +----------------------------- 2 files changed, 3 insertions(+), 218 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 6076e0b2d31d..5f054d0a9ac6 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1658,39 +1658,6 @@ NPY_NO_EXPORT void } } -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? 
io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2; - *((@type@ *)op1) = in1; - } - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -/**end repeat1**/ - NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 0e2c1ab8b31b..8b833ee56265 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -88,38 +88,6 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n ***************************************************************************** */ -/**begin repeat - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #EXISTS = 1, 1, 0# - */ - -/**begin repeat1 - * #func = maximum, minimum# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ -static NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps); -#endif - -static NPY_INLINE int -run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; -} -/**end repeat1**/ - -/**end repeat**/ - /**begin repeat * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# @@ -204,9 +172,9 @@ run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp */ /**begin repeat1 - * #func = negative, minimum, maximum# - * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE*2 # - * #name = unary, unary_reduce*2# + * #func = negative# + * #check = IS_BLOCKABLE_UNARY# + * #name = unary# */ #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS @@ -678,55 +646,6 @@ sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) } /**end repeat1**/ - -/**begin repeat1 - * #kind = maximum, minimum# - * #VOP = max, min# - * #OP = >=, <=# - **/ -/* arguments swapped as unary reduce has the swapped compared to unary */ -static void -sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) -{ - const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@); - LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) { - /* Order of operations important for MSVC 2015 */ - *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? 
*op : ip[i]; - } - assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)); - if (i + 3 * stride <= n) { - /* load the first elements */ - @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); - @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]); - i += 2 * stride; - - /* minps/minpd will set invalid flag if nan is encountered */ - npy_clear_floatstatus_barrier((char*)&c1); - LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) { - @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); - @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]); - c1 = @vpre@_@VOP@_@vsuf@(c1, v1); - c2 = @vpre@_@VOP@_@vsuf@(c2, v2); - } - c1 = @vpre@_@VOP@_@vsuf@(c1, c2); - - if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) { - *op = @nan@; - } - else { - @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1); - /* Order of operations important for MSVC 2015 */ - *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp; - } - } - LOOP_BLOCKED_END { - /* Order of operations important for MSVC 2015 */ - *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i]; - } - npy_clear_floatstatus_barrier((char*)op); -} -/**end repeat1**/ - /**end repeat**/ /* bunch of helper functions used in ISA_exp/log_FLOAT*/ @@ -1199,107 +1118,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co /**end repeat1**/ /**end repeat**/ -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #mask = __mmask16, __mmask8# - * #vtype1 = __m512, __m512d# - * #vtype2 = __m512i, __m256i# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexsize = 512, 256# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32# - * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32# - * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32# - * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32# - * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256# - */ -/**begin repeat1 - * #func = maximum, minimum# - * #vectorf = max, min# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); - const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@); - const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@); - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP - */ - - npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { - index_ip1[ii] = ii*stride_ip1; - index_ip2[ii] = ii*stride_ip2; - index_op[ii] = ii*stride_op; - } - @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); - @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]); - @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]); - @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); - - while 
(num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                num_remaining_elements, @num_lanes@);
-        }
-        @vtype1@ x1, x2;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
-        }
-        if (stride_ip2 == 1) {
-            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-        }
-        else {
-            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
-        }
-
-        /*
-         * when only one of the argument is a nan, the maxps/maxpd instruction
-         * returns the second argument. The additional blend instruction fixes
-         * this issue to conform with NumPy behaviour.
-         */
-        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
-        @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
-        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        }
-        else {
-            /* scatter! */
-            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
-        }
-
-        ip1 += @num_lanes@*stride_ip1;
-        ip2 += @num_lanes@*stride_ip2;
-        op += @num_lanes@*stride_op;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
 /**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512#

From 5fca2bfccf7ea55d5e6d1480fff61b1ca14770bd Mon Sep 17 00:00:00 2001
From: Sayed Adel
Date: Fri, 31 Dec 2021 04:36:35 +0200
Subject: [PATCH 8/8] ENH, SIMD: several improvements for max/min

- Avoid unrolling the vectorized max/min loops by x6/x8 when the SIMD
  width is > 128, to avoid a memory bandwidth bottleneck (see the sketch
  after this list)
- tune the max/min reductions
- vectorize non-contiguous max/min
- fix code style
- call npyv_cleanup() at the end of the inner loop
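An editorial sketch of the unrolling trade-off in the first bullet, in plain scalar C (illustrative only; the real kernels below use the `npyv_` universal intrinsics and also handle NaNs). Multiple independent accumulators hide latency by breaking the serial dependency chain, but once the loads saturate memory bandwidth, extra unrolling buys nothing:

```c
#include <stddef.h>
#include <stdio.h>

/* Max-reduction with four independent accumulator chains (assumes len >= 1).
 * Each chain depends only on itself, so the CPU can keep several compares
 * in flight instead of serializing on one running maximum. Past the point
 * where the loads saturate memory bandwidth, more accumulators stop
 * helping -- hence the smaller unroll factor for wider SIMD. */
static float max_reduce_unrolled(const float *ip, size_t len)
{
    float m0 = ip[0], m1 = ip[0], m2 = ip[0], m3 = ip[0];
    size_t i = 1;
    for (; i + 4 <= len; i += 4) {
        m0 = ip[i + 0] > m0 ? ip[i + 0] : m0;
        m1 = ip[i + 1] > m1 ? ip[i + 1] : m1;
        m2 = ip[i + 2] > m2 ? ip[i + 2] : m2;
        m3 = ip[i + 3] > m3 ? ip[i + 3] : m3;
    }
    /* fold the partial maxima, then finish the scalar tail */
    m0 = m1 > m0 ? m1 : m0;
    m2 = m3 > m2 ? m3 : m2;
    m0 = m2 > m0 ? m2 : m0;
    for (; i < len; ++i)
        m0 = ip[i] > m0 ? ip[i] : m0;
    return m0;
}

int main(void)
{
    float data[] = {3.f, 1.f, 4.f, 1.f, 5.f, 9.f, 2.f, 6.f, 5.f};
    printf("%f\n", max_reduce_unrolled(data, sizeof(data) / sizeof(data[0])));
    return 0; /* prints 9.000000 */
}
```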
---
 .../src/umath/loops_minmax.dispatch.c.src    | 201 +++++++++++-------
 1 file changed, 121 insertions(+), 80 deletions(-)

diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 891be445e437..dbd158db9926 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -192,7 +192,6 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 /*******************************************************************************
  ** Defining the SIMD kernels
  ******************************************************************************/
-
 /**begin repeat
  * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
  * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
@@ -205,6 +204,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
  */
 #define SCALAR_OP scalar_@intrin@_@scalar_sfx@
 #if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
+
 #if @is_fp@ && !@fp_only@
     #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
     #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
@@ -217,67 +217,42 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 static inline void
 simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
 {
-    // Note, 8x unroll was chosen for best results on Apple M1
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    npy_intp i = 0;
-    npyv_lanetype_@sfx@ io1 = op1[0];
-
-    // SIMD if possible
-    if((i+elemPerLoop) <= len){
-        npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-        npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-        npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-        npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-        npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-        npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-        npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-        npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-            npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-            npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-            npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-            npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-            npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-            npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-            npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-            npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-            m0 = V_INTRIN(m0, v0);
-            m1 = V_INTRIN(m1, v1);
-            m2 = V_INTRIN(m2, v2);
-            m3 = V_INTRIN(m3, v3);
-            m4 = V_INTRIN(m4, v4);
-            m5 = V_INTRIN(m5, v5);
-            m6 = V_INTRIN(m6, v6);
-            m7 = V_INTRIN(m7, v7);
-        }
-
-        m0 = V_INTRIN(m0, m1);
-        m2 = V_INTRIN(m2, m3);
-        m4 = V_INTRIN(m4, m5);
-        m6 = V_INTRIN(m6, m7);
-
-        m0 = V_INTRIN(m0, m2);
-        m4 = V_INTRIN(m4, m6);
-
-        m0 = V_INTRIN(m0, m4);
-
-        npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0);
-
-        io1 = SCALAR_OP(io1, r);
+    if (len < 1) {
+        return;
     }
-
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep*8;
+    npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);
+    #endif
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+        npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+
+        npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+        npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+        npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+
+        npyv_@sfx@ r01 = V_INTRIN(v0, v1);
+        npyv_@sfx@ r23 = V_INTRIN(v2, v3);
+        npyv_@sfx@ r45 = V_INTRIN(v4, v5);
+        npyv_@sfx@ r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
+    }
+    npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
     // Scalar - finish up any remaining iterations
-    for(; i<len; ++i){
-        const npyv_lanetype_@sfx@ in2 = ip[i];
-        io1 = SCALAR_OP(io1, in2);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_@sfx@ in2 = *ip;
+        r = SCALAR_OP(r, in2);
    }
-    op1[0] = io1;
+    op1[0] = r;
 }
 
 // contiguous inputs and output.
@@ -285,51 +260,102 @@ static inline void
 simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
                                      npyv_lanetype_@sfx@ *op1, npy_intp len)
 {
+#if NPY_SIMD_WIDTH == 128
     // Note, 6x unroll was chosen for best results on Apple M1
     const int vectorsPerLoop = 6;
-    const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_@sfx@;
     int elemPerLoop = vectorsPerLoop * elemPerVector;
 
     npy_intp i = 0;
 
-    for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
-        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
-        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
-        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
-        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
-        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
-
-        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
-        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
-        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
-        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
-        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
-        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
-
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
+        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
+        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
+        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
+        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
+        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
+    #endif
         npyv_@sfx@ m0 = V_INTRIN(v0, u0);
         npyv_@sfx@ m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
         npyv_@sfx@ m2 = V_INTRIN(v2, u2);
         npyv_@sfx@ m3 = V_INTRIN(v3, u3);
         npyv_@sfx@ m4 = V_INTRIN(v4, u4);
         npyv_@sfx@ m5 = V_INTRIN(v5, u5);
-
+    #endif
         npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
         npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
         npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
         npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
        npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
         npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
+        npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
+        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
+        npyv_store_@sfx@(op1 + i, m0);
     }
-    // Scalar - finish up any remaining iterations
-    for(; i<len; i++){
-        const npyv_lanetype_@sfx@ in1 = ip1[i];
-        const npyv_lanetype_@sfx@ in2 = ip2[i];
-        op1[i] = SCALAR_OP(in1, in2);
+    // Scalar - finish up any remaining iterations
+    for (; i < len; ++i) {
+        const npyv_lanetype_@sfx@ in1 = ip1[i];
+        const npyv_lanetype_@sfx@ in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
     }
 }
+
+// non-contiguous inputs and output
+#if @is_fp@
+static inline void
+simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
+                           const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
+                                 npyv_lanetype_@sfx@ *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_@sfx@ a, b;
+        if (sip1 == 1) {
+            a = npyv_load_@sfx@(ip1);
+        } else {
+            a = npyv_loadn_@sfx@(ip1, sip1);
+        }
+        if (sip2 == 1) {
+            b = npyv_load_@sfx@(ip2);
+        } else {
+            b = npyv_loadn_@sfx@(ip2, sip2);
+        }
+        npyv_@sfx@ r = V_INTRIN(a, b);
+        if (sop1 == 1) {
+            npyv_store_@sfx@(op1, r);
+        } else {
npyv_storen_@sfx@(op1, sop1, r); + } + } + for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { + const npyv_lanetype_@sfx@ a = *ip1; + const npyv_lanetype_@sfx@ b = *ip2; + *op1 = SCALAR_OP(a, b); + } +} +#endif #undef V_INTRIN #undef V_REDUCE_INTRIN @@ -415,6 +441,20 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) ); goto clear_fp; } + // unroll scalars faster than non-contiguous vector load/store on Arm + #if !defined(NPY_HAVE_NEON) && @is_fp@ + if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) && + TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) && + TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE)) + ) { + TO_SIMD_SFX(simd_binary_@intrin@)( + (STYPE*)ip1, is1/sizeof(STYPE), + (STYPE*)ip2, is2/sizeof(STYPE), + (STYPE*)op1, os1/sizeof(STYPE), len + ); + goto clear_fp; + } + #endif } #endif // TO_SIMD_SFX #ifndef NPY_DISABLE_OPTIMIZATION @@ -495,7 +535,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) *((@type@ *)op1) = SCALAR_OP(in1, in2); } #ifdef TO_SIMD_SFX -clear_fp:; +clear_fp: + npyv_cleanup(); #endif #if @is_fp@ npy_clear_floatstatus_barrier((char*)dimensions);
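A closing editorial note: the `npyv_cleanup()` call added at `clear_fp` is NumPy's universal-intrinsics epilogue (essentially a `vzeroupper`-style cleanup on AVX builds, a no-op elsewhere), and the `npy_clear_floatstatus_barrier()` that follows exists because min/max kernels can leave the floating-point "invalid" flag set after meeting NaNs. A minimal, hypothetical C sketch of that second point (not patch code; whether the flag is raised is toolchain-dependent):

```c
#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* A NaN flowing through an ordered comparison typically raises FE_INVALID,
 * which is why the inner loops above end by clearing the FP status: min/max
 * should not leave a spurious "invalid" warning behind. This usually prints
 * 1 on common x86-64 and aarch64 toolchains, but it is not guaranteed. */
int main(void)
{
    feclearexcept(FE_ALL_EXCEPT);
    volatile double a = NAN, b = 1.0;
    volatile double m = (a >= b || isnan(a)) ? a : b; /* NaN-propagating max */
    printf("max = %f, FE_INVALID raised = %d\n",
           (double)m, fetestexcept(FE_INVALID) != 0);
    feclearexcept(FE_INVALID); /* scalar analogue of the status barrier */
    return 0;
}
```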