diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst index 80ef2c63908d..0a40d3ff5547 100644 --- a/doc/source/reference/simd/build-options.rst +++ b/doc/source/reference/simd/build-options.rst @@ -165,6 +165,7 @@ Special Options ARMHF ``NONE`` ARM64 A.K. AARCH64 ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + IBM/ZSYSTEM(S390X) ``NONE`` ====================================== ======================================= - ``MAX``: Enables all supported CPU features by the compiler and platform. @@ -338,7 +339,7 @@ that includes several sections, and each section has several values, representin **Platform**: - :enabled:`Architecture`: The architecture name of target CPU. It should be one of - ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``. + ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64``, ``s390x`` or ``unknown``. - :enabled:`Compiler`: The compiler name. It should be one of gcc, clang, msvc, icc, iccw or unix-like. diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py index d74d54016a3d..9a38ef5c9ba2 100644 --- a/doc/source/reference/simd/gen_features.py +++ b/doc/source/reference/simd/gen_features.py @@ -158,6 +158,7 @@ def wrapper_tab(title, table, tab_size=4): pretty_names = { "PPC64": "IBM/POWER big-endian", "PPC64LE": "IBM/POWER little-endian", + "S390X": "IBM/ZSYSTEM(S390X)", "ARMHF": "ARMv7/A32", "AARCH64": "ARMv8/A64", "ICC": "Intel Compiler", @@ -170,7 +171,7 @@ def wrapper_tab(title, table, tab_size=4): with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd: fd.write(f'.. 
generated via {__file__}\n\n') for arch in ( - ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64") + ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X") ): title = "On " + pretty_names.get(arch, arch) table = Features(arch, 'gcc').table() @@ -183,7 +184,8 @@ def wrapper_tab(title, table, tab_size=4): ("PPC64", ("clang",)), ("PPC64LE", ("clang",)), ("ARMHF", ("clang",)), - ("AARCH64", ("clang",)) + ("AARCH64", ("clang",)), + ("S390X", ("clang",)) ): arch_pname = pretty_names.get(arch, arch) for cc in cc_names: diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc index a7eae5652eef..17d1b4951135 100644 --- a/doc/source/reference/simd/generated_tables/cpu_features.inc +++ b/doc/source/reference/simd/generated_tables/cpu_features.inc @@ -1,4 +1,4 @@ -.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py +.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py On x86 ~~~~~~ @@ -91,3 +91,16 @@ On ARMv8/A64 ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` ============== =========================================================== +On IBM/ZSYSTEM(S390X) +~~~~~~~~~~~~~~~~~~~~~ +.. 
table:: + :align: left + + ======== ============== + Name Implies + ======== ============== + ``VX`` + ``VXE`` ``VX`` + ``VXE2`` ``VX`` ``VXE`` + ======== ============== + diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src index a2383c45f61d..1385220f9dc3 100644 --- a/numpy/core/src/common/npy_cpu_features.c.src +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -62,6 +62,7 @@ npy_cpu_features_dict(void) * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG, * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL, * VSX, VSX2, VSX3, + * VX, VXE, VXE2, * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM# */ if (PyDict_SetItemString(dict, "@feature@", @@ -509,6 +510,42 @@ npy__cpu_init_features(void) #endif } +/***************** ZARCH ******************/ + +#elif defined(__s390x__) + +#include <sys/auxv.h> +#ifndef HWCAP_S390_VXE + #define HWCAP_S390_VXE 8192 +#endif + +#ifndef HWCAP_S390_VXRS_EXT2 + #define HWCAP_S390_VXRS_EXT2 32768 +#endif + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); + + unsigned int hwcap = getauxval(AT_HWCAP); + if ((hwcap & HWCAP_S390_VX) == 0) { + return; + } + + if (hwcap & HWCAP_S390_VXRS_EXT2) { + npy__cpu_have[NPY_CPU_FEATURE_VX] = + npy__cpu_have[NPY_CPU_FEATURE_VXE] = + npy__cpu_have[NPY_CPU_FEATURE_VXE2] = 1; + return; + } + + npy__cpu_have[NPY_CPU_FEATURE_VXE] = (hwcap & HWCAP_S390_VXE) != 0; + + npy__cpu_have[NPY_CPU_FEATURE_VX] = 1; +} + + /***************** ARM ******************/ #elif defined(__arm__) || defined(__aarch64__) diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index ce1fc822ac03..1f52a445dcad 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -82,6 +82,15 @@ enum npy_cpu_features // ARMv8.2 single&half-precision multiply NPY_CPU_FEATURE_ASIMDFHM = 307, + // 
IBM/ZARCH + NPY_CPU_FEATURE_VX = 350, + + // Vector-Enhancements Facility 1 + NPY_CPU_FEATURE_VXE = 351, + + // Vector-Enhancements Facility 2 + NPY_CPU_FEATURE_VXE2 = 352, + NPY_CPU_FEATURE_MAX }; @@ -138,6 +147,7 @@ npy_cpu_features_dict(void); * On aarch64: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD'] * On ppc64: [] * On ppc64le: ['VSX', 'VSX2'] + * On s390x: [] * On any other arch or if the optimization is disabled: [] */ NPY_VISIBILITY_HIDDEN PyObject * @@ -159,6 +169,7 @@ npy_cpu_baseline_list(void); * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] * On ppc64: ['VSX', 'VSX2', 'VSX3'] * On ppc64le: ['VSX3'] + * On s390x: ['VX', 'VXE', 'VXE2'] * On any other arch or if the optimization is disabled: [] */ NPY_VISIBILITY_HIDDEN PyObject * diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py index 2ccbff41ca63..706cf7a7e705 100644 --- a/numpy/core/tests/test_cpu_features.py +++ b/numpy/core/tests/test_cpu_features.py @@ -146,6 +146,17 @@ class Test_POWER_Features(AbstractTest): def load_flags(self): self.load_flags_auxv() + +is_zarch = re.match("^(s390x)", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_zarch, + reason="Only for Linux and IBM Z") +class Test_ZARCH_Features(AbstractTest): + features = ["VX", "VXE", "VXE2"] + + def load_flags(self): + self.load_flags_auxv() + + is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE) @pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM") class Test_ARM_Features(AbstractTest): diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index b38e47c13a94..e020d96ee553 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -228,6 +228,7 @@ class _Config: x64 = "SSE SSE2 SSE3", ppc64 = '', # play it safe ppc64le = "VSX VSX2", + s390x = '', armhf = '', # play it safe aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD" ) @@ -293,6 +294,13 @@ class _Config: VSX2 = dict(interest=2, 
implies="VSX", implies_detect=False), ## Power9/ISA 3.00 VSX3 = dict(interest=3, implies="VSX2", implies_detect=False), + # IBM/Z + ## VX(z13) support + VX = dict(interest=1, headers="vecintrin.h"), + ## Vector-Enhancements Facility + VXE = dict(interest=2, implies="VX", implies_detect=False), + ## Vector-Enhancements Facility 2 + VXE2 = dict(interest=3, implies="VXE", implies_detect=False), # ARM NEON = dict(interest=1, headers="arm_neon.h"), NEON_FP16 = dict(interest=2, implies="NEON"), @@ -472,6 +480,23 @@ class attribute `conf_features`, also its override return partial + on_zarch = self.cc_on_s390x + if on_zarch: + partial = dict( + VX = dict( + flags="-march=arch11 -mzvector" + ), + VXE = dict( + flags="-march=arch12", implies_detect=False + ), + VXE2 = dict( + flags="-march=arch13", implies_detect=False + ) + ) + + return partial + + if self.cc_on_aarch64 and is_unix: return dict( NEON = dict( implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True @@ -919,6 +944,7 @@ def __init__(self): ("cc_on_ppc64", ".*(powerpc|ppc)64.*"), ("cc_on_aarch64", ".*(aarch64|arm64).*"), ("cc_on_armhf", ".*arm.*"), + ("cc_on_s390x", ".*s390x.*"), # undefined platform ("cc_on_noarch", ""), ) @@ -983,7 +1009,8 @@ def __init__(self): self.cc_is_gcc = True self.cc_march = "unknown" - for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"): + for arch in ("x86", "x64", "ppc64", "ppc64le", + "armhf", "aarch64", "s390x"): if getattr(self, "cc_on_" + arch): self.cc_march = arch break diff --git a/numpy/distutils/checks/cpu_vx.c b/numpy/distutils/checks/cpu_vx.c new file mode 100644 index 000000000000..18fb7ef94a24 --- /dev/null +++ b/numpy/distutils/checks/cpu_vx.c @@ -0,0 +1,16 @@ +#if (__VEC__ < 10301) || (__ARCH__ < 11) + #error VX not supported +#endif + +#include <vecintrin.h> +int main(int argc, char **argv) +{ + __vector double x = vec_abs(vec_xl(argc, (double*)argv)); + __vector double y = vec_load_len((double*)argv, (unsigned int)argc); + + x = vec_round(vec_ceil(x) + vec_floor(y)); 
__vector bool long long m = vec_cmpge(x, y); + __vector long long i = vec_signed(vec_sel(x, y, m)); + + return (int)vec_extract(i, 0); +} diff --git a/numpy/distutils/checks/cpu_vxe.c b/numpy/distutils/checks/cpu_vxe.c new file mode 100644 index 000000000000..ca41f8434c2b --- /dev/null +++ b/numpy/distutils/checks/cpu_vxe.c @@ -0,0 +1,25 @@ +#if (__VEC__ < 10302) || (__ARCH__ < 12) + #error VXE not supported +#endif + +#include <vecintrin.h> +int main(int argc, char **argv) +{ + __vector float x = vec_nabs(vec_xl(argc, (float*)argv)); + __vector float y = vec_load_len((float*)argv, (unsigned int)argc); + + x = vec_round(vec_ceil(x) + vec_floor(y)); + __vector bool int m = vec_cmpge(x, y); + x = vec_sel(x, y, m); + + // need to test the existence of intrin "vflls" since vec_doublee + // maps to wrong intrin "vfll". + // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871 +#if defined(__GNUC__) && !defined(__clang__) + __vector long long i = vec_signed(__builtin_s390_vflls(x)); +#else + __vector long long i = vec_signed(vec_doublee(x)); +#endif + + return (int)vec_extract(i, 0); +} diff --git a/numpy/distutils/checks/cpu_vxe2.c b/numpy/distutils/checks/cpu_vxe2.c new file mode 100644 index 000000000000..f36d57129af6 --- /dev/null +++ b/numpy/distutils/checks/cpu_vxe2.c @@ -0,0 +1,21 @@ +#if (__VEC__ < 10303) || (__ARCH__ < 13) + #error VXE2 not supported +#endif + +#include <vecintrin.h> + +int main(int argc, char **argv) +{ + int val; + __vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' }; + __vector signed short search = { 'g', 'h', 'g', 'o' }; + __vector unsigned char len = { 0 }; + __vector unsigned char res = vec_search_string_cc(large, search, len, &val); + __vector float x = vec_xl(argc, (float*)argv); + __vector int i = vec_signed(x); + + i = vec_srdb(vec_sldb(i, i, 2), i, 3); + val += (int)vec_extract(res, 1); + val += vec_extract(i, 0); + return val; +} diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index 
a4fda537d5dc..dc1ab3b9bfa1 100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -47,7 +47,8 @@ def initialize_options(self): - not part of dispatch-able features(--cpu-dispatch) - not supported by compiler or platform """ - self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD" + self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F" \ + " AVX512_SKX VSX VSX2 VSX3 NEON ASIMD VX VXE VXE2" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/fcompiler/gnu.py b/numpy/distutils/fcompiler/gnu.py index 39178071d511..d8143328e051 100644 --- a/numpy/distutils/fcompiler/gnu.py +++ b/numpy/distutils/fcompiler/gnu.py @@ -324,7 +324,7 @@ def _universal_flags(self, cmd): c_archs[c_archs.index("i386")] = "i686" # check the arches the Fortran compiler supports, and compare with # arch flags from C compiler - for arch in ["ppc", "i686", "x86_64", "ppc64"]: + for arch in ["ppc", "i686", "x86_64", "ppc64", "s390x"]: if _can_target(cmd, arch) and arch in c_archs: arch_flags.extend(["-arch", arch]) return arch_flags diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py index 1b27ab07c393..6d42cc172ebe 100644 --- a/numpy/distutils/tests/test_ccompiler_opt.py +++ b/numpy/distutils/tests/test_ccompiler_opt.py @@ -32,6 +32,7 @@ def assert_(expr, msg=''): ppc64le = ("gcc", "clang"), armhf = ("gcc", "clang"), aarch64 = ("gcc", "clang"), + s390x = ("gcc", "clang"), noarch = ("gcc",) ) @@ -382,18 +383,19 @@ def test_args_options(self): if o == "native" and self.cc_name() == "msvc": continue self.expect(o, - trap_files=".*cpu_(sse|vsx|neon).c", - x86="", ppc64="", armhf="" + trap_files=".*cpu_(sse|vsx|neon|vx).c", + x86="", ppc64="", armhf="", s390x="" ) self.expect(o, - trap_files=".*cpu_(sse3|vsx2|neon_vfpv4).c", + trap_files=".*cpu_(sse3|vsx2|neon_vfpv4|vxe).c", x86="sse sse2", ppc64="vsx", armhf="neon 
neon_fp16", - aarch64="", ppc64le="" + aarch64="", ppc64le="", s390x="vx" ) self.expect(o, trap_files=".*cpu_(popcnt|vsx3).c", x86="sse .* sse41", ppc64="vsx vsx2", - armhf="neon neon_fp16 .* asimd .*" + armhf="neon neon_fp16 .* asimd .*", + s390x="vx vxe vxe2" ) self.expect(o, x86_gcc=".* xop fma4 .* avx512f .* avx512_knl avx512_knm avx512_skx .*", @@ -403,13 +405,14 @@ def test_args_options(self): # in msvc, avx512_knl avx512_knm aren't supported x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*", armhf=".* asimd asimdhp asimddp .*", - ppc64="vsx vsx2 vsx3.*" + ppc64="vsx vsx2 vsx3.*", + s390x="vx vxe vxe2.*" ) # min self.expect("min", x86="sse sse2", x64="sse sse2 sse3", armhf="", aarch64="neon neon_fp16 .* asimd", - ppc64="", ppc64le="vsx vsx2" + ppc64="", ppc64le="vsx vsx2", s390x="" ) self.expect( "min", trap_files=".*cpu_(sse2|vsx2).c", @@ -420,7 +423,7 @@ def test_args_options(self): try: self.expect("native", trap_flags=".*(-march=native|-xHost|/QxHost).*", - x86=".*", ppc64=".*", armhf=".*" + x86=".*", ppc64=".*", armhf=".*", s390x=".*" ) if self.march() != "unknown": raise AssertionError( @@ -432,14 +435,15 @@ def test_args_options(self): def test_flags(self): self.expect_flags( - "sse sse2 vsx vsx2 neon neon_fp16", + "sse sse2 vsx vsx2 neon neon_fp16 vx vxe", x86_gcc="-msse -msse2", x86_icc="-msse -msse2", x86_iccw="/arch:SSE2", x86_msvc="/arch:SSE2" if self.march() == "x86" else "", ppc64_gcc= "-mcpu=power8", ppc64_clang="-maltivec -mvsx -mpower8-vector", armhf_gcc="-mfpu=neon-fp16 -mfp16-format=ieee", - aarch64="" + aarch64="", + s390x="-mzvector -march=arch12" ) # testing normalize -march self.expect_flags( @@ -484,7 +488,7 @@ def test_targets_exceptions(self): try: self.expect_targets( targets, - x86="", armhf="", ppc64="" + x86="", armhf="", ppc64="", s390x="" ) if self.march() != "unknown": raise AssertionError( @@ -496,26 +500,26 @@ def test_targets_exceptions(self): def test_targets_syntax(self): for targets in ( - "/*@targets $keep_baseline sse vsx neon*/", - "/*@targets,$keep_baseline,sse,vsx,neon*/", - "/*@targets*$keep_baseline*sse*vsx*neon*/", + "/*@targets $keep_baseline sse 
vsx neon*/", - "/*@targets,$keep_baseline,sse,vsx,neon*/", - "/*@targets*$keep_baseline*sse*vsx*neon*/", + "/*@targets $keep_baseline sse vsx neon vx*/", + "/*@targets,$keep_baseline,sse,vsx,neon vx*/", + "/*@targets*$keep_baseline*sse*vsx*neon*vx*/", """ /* ** @targets - ** $keep_baseline, sse vsx,neon + ** $keep_baseline, sse vsx,neon, vx */ """, """ /* - ************@targets************* - ** $keep_baseline, sse vsx, neon - ********************************* + ************@targets**************** + ** $keep_baseline, sse vsx, neon, vx + ************************************ */ """, """ /* /////////////@targets///////////////// - //$keep_baseline//sse//vsx//neon + //$keep_baseline//sse//vsx//neon//vx ///////////////////////////////////// */ """, @@ -523,11 +527,11 @@ def test_targets_syntax(self): /* @targets $keep_baseline - SSE VSX NEON*/ + SSE VSX NEON VX*/ """ ) : self.expect_targets(targets, - x86="sse", ppc64="vsx", armhf="neon", unknown="" + x86="sse", ppc64="vsx", armhf="neon", s390x="vx", unknown="" ) def test_targets(self): @@ -538,10 +542,12 @@ def test_targets(self): sse sse2 sse41 avx avx2 avx512f vsx vsx2 vsx3 neon neon_fp16 asimdhp asimddp + vx vxe vxe2 */ """, - baseline="avx vsx2 asimd", - x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3" + baseline="avx vsx2 asimd vx vxe", + x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3", + s390x="vxe2" ) # test skipping non-dispatch features self.expect_targets( @@ -550,10 +556,11 @@ def test_targets(self): sse41 avx avx2 avx512f vsx2 vsx3 asimd asimdhp asimddp + vx vxe vxe2 */ """, - baseline="", dispatch="sse41 avx2 vsx2 asimd asimddp", - x86="avx2 sse41", armhf="asimddp asimd", ppc64="vsx2" + baseline="", dispatch="sse41 avx2 vsx2 asimd asimddp vxe2", + x86="avx2 sse41", armhf="asimddp asimd", ppc64="vsx2", s390x="vxe2" ) # test skipping features that not supported self.expect_targets( @@ -562,11 +569,13 @@ def test_targets(self): sse2 sse41 avx2 avx512f vsx2 vsx3 neon asimdhp asimddp + vx 
vxe vxe2 */ """, baseline="", - trap_files=".*(avx2|avx512f|vsx3|asimddp).c", - x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon" + trap_files=".*(avx2|avx512f|vsx3|asimddp|vxe2).c", + x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon", + s390x="vxe vx" ) # test skipping features that implies each other self.expect_targets( @@ -598,14 +607,16 @@ def test_targets_policies(self): sse2 sse42 avx2 avx512f vsx2 vsx3 neon neon_vfpv4 asimd asimddp + vx vxe vxe2 */ """, - baseline="sse41 avx2 vsx2 asimd vsx3", + baseline="sse41 avx2 vsx2 asimd vsx3 vxe", x86="avx512f avx2 sse42 sse2", ppc64="vsx3 vsx2", armhf="asimddp asimd neon_vfpv4 neon", # neon, neon_vfpv4, asimd implies each other - aarch64="asimddp asimd" + aarch64="asimddp asimd", + s390x="vxe2 vxe vx" ) # 'keep_sort', leave the sort as-is self.expect_targets( @@ -615,13 +626,15 @@ def test_targets_policies(self): avx512f sse42 avx2 sse2 vsx2 vsx3 asimd neon neon_vfpv4 asimddp + vxe vxe2 */ """, x86="avx512f sse42 avx2 sse2", ppc64="vsx2 vsx3", armhf="asimd neon neon_vfpv4 asimddp", # neon, neon_vfpv4, asimd implies each other - aarch64="asimd asimddp" + aarch64="asimd asimddp", + s390x="vxe vxe2" ) # 'autovec', skipping features that can't be # vectorized by the compiler @@ -736,11 +749,13 @@ def test_targets_multi(self): (sse41 avx sse42) (sse3 avx2 avx512f) (vsx vsx3 vsx2) (asimddp neon neon_vfpv4 asimd asimdhp) + (vx vxe vxe2) */ """, x86="avx avx512f", ppc64="vsx3", armhf=r"\(asimdhp asimddp\)", + s390x="vxe2" ) # test compiler variety and avoiding duplicating self.expect_targets(