numpy · mattip · Dec 14, 2021 · Dec 9, 2021 · Dec 9, 2021 · Dec 13, 2021
diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst
@@ -165,6 +165,7 @@ Special Options
        ARMHF                                  ``NONE``
        ARM64 A.K. AARCH64                     ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
                                               ``ASIMD``
+       IBM/ZSYSTEM(S390X)                     ``NONE``
       ======================================  =======================================
 
 - ``MAX``: Enables all supported CPU features by the compiler and platform.
@@ -338,7 +339,7 @@ that includes several sections, and each section has several values, representin
 **Platform**:
 
 - :enabled:`Architecture`: The architecture name of target CPU. It should be one of
-  ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``.
+  ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64``, ``s390x`` or ``unknown``.
 
 - :enabled:`Compiler`: The compiler name. It should be one of
   gcc, clang, msvc, icc, iccw or unix-like.

diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py
@@ -158,6 +158,7 @@ def wrapper_tab(title, table, tab_size=4):
     pretty_names = {
         "PPC64": "IBM/POWER big-endian",
         "PPC64LE": "IBM/POWER little-endian",
+        "S390X": "IBM/ZSYSTEM(S390X)",
         "ARMHF": "ARMv7/A32",
         "AARCH64": "ARMv8/A64",
         "ICC": "Intel Compiler",
@@ -170,7 +171,7 @@ def wrapper_tab(title, table, tab_size=4):
     with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
         fd.write(f'.. generated via {__file__}\n\n')
         for arch in (
-            ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64")
+            ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X")
         ):
             title = "On " + pretty_names.get(arch, arch)
             table = Features(arch, 'gcc').table()
@@ -183,7 +184,8 @@ def wrapper_tab(title, table, tab_size=4):
             ("PPC64", ("clang",)),
             ("PPC64LE", ("clang",)),
             ("ARMHF", ("clang",)),
-            ("AARCH64", ("clang",))
+            ("AARCH64", ("clang",)),
+            ("S390X", ("clang",))
         ):
             arch_pname = pretty_names.get(arch, arch)
             for cc in cc_names:

diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc
@@ -1,4 +1,4 @@
-.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py
 
 On x86
 ~~~~~~
@@ -91,3 +91,16 @@ On ARMv8/A64
     ``ASIMDFHM``   ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
     ============== ===========================================================
 
+On IBM/ZSYSTEM(S390X)
+~~~~~~~~~~~~~~~~~~~~~
+.. table::
+    :align: left
+
+    ======== ==============
+    Name     Implies       
+    ======== ==============
+    ``VX``                 
+    ``VXE``  ``VX``        
+    ``VXE2`` ``VX`` ``VXE``
+    ======== ==============
+
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
@@ -62,6 +62,7 @@ npy_cpu_features_dict(void)
      *            AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG,
      *            AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL,
      *            VSX, VSX2, VSX3,
+     *            VX, VXE, VXE2,
      *            NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM#
     */
         if (PyDict_SetItemString(dict, "@feature@",
@@ -509,6 +510,56 @@ npy__cpu_init_features(void)
 #endif
 }
 
+/***************** ZARCH ******************/
+
+#elif defined(__s390x__)
+
+#include <sys/auxv.h>
+
+/*
+ * Fallback values for the AT_HWCAP bits tested below; they are used only
+ * when the included headers do not already define these names.
+ */
+#ifndef HWCAP_S390_VX
+    #define HWCAP_S390_VX 2048
+#endif
+
+#ifndef HWCAP_S390_VXE
+    #define HWCAP_S390_VXE 8192
+#endif
+
+#ifndef HWCAP_S390_VXRS_EXT2
+    #define HWCAP_S390_VXRS_EXT2 32768
+#endif
+
+/*
+ * Fill npy__cpu_have with the s390x vector facilities (VX, VXE, VXE2)
+ * found in the AT_HWCAP entry of the auxiliary vector.
+ */
+static void
+npy__cpu_init_features(void)
+{
+    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+
+    /* getauxval() returns unsigned long; keep its full width. */
+    unsigned long hwcap = getauxval(AT_HWCAP);
+    if ((hwcap & HWCAP_S390_VX) == 0) {
+        /* Without the base VX bit no feature is marked as available. */
+        return;
+    }
+    npy__cpu_have[NPY_CPU_FEATURE_VX] = 1;
+
+    if (hwcap & HWCAP_S390_VXRS_EXT2) {
+        /* EXT2 bit set: mark both VXE and VXE2 as available. */
+        npy__cpu_have[NPY_CPU_FEATURE_VXE]  =
+        npy__cpu_have[NPY_CPU_FEATURE_VXE2] = 1;
+        return;
+    }
+
+    npy__cpu_have[NPY_CPU_FEATURE_VXE] = (hwcap & HWCAP_S390_VXE) != 0;
+}
+
+
 /***************** ARM ******************/
 
 #elif defined(__arm__) || defined(__aarch64__)

diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
@@ -82,6 +82,15 @@ enum npy_cpu_features
     // ARMv8.2 single&half-precision multiply
     NPY_CPU_FEATURE_ASIMDFHM          = 307,
 
+    // IBM/ZARCH
+    NPY_CPU_FEATURE_VX                = 350,
+
+    // Vector-Enhancements Facility 1
+    NPY_CPU_FEATURE_VXE               = 351,
+
+    // Vector-Enhancements Facility 2
+    NPY_CPU_FEATURE_VXE2              = 352,
+
     NPY_CPU_FEATURE_MAX
 };
 
@@ -138,6 +147,7 @@ npy_cpu_features_dict(void);
  * On aarch64: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD']
  * On ppc64: []
  * On ppc64le: ['VSX', 'VSX2']
+ * On s390x: []
  * On any other arch or if the optimization is disabled: []
  */
 NPY_VISIBILITY_HIDDEN PyObject *
@@ -159,6 +169,7 @@ npy_cpu_baseline_list(void);
  * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
  * On ppc64:  ['VSX', 'VSX2', 'VSX3']
  * On ppc64le: ['VSX3']
+ * On s390x: ['VX', 'VXE', 'VXE2']
  * On any other arch or if the optimization is disabled: []
  */
 NPY_VISIBILITY_HIDDEN PyObject *

diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py
@@ -146,6 +146,17 @@ class Test_POWER_Features(AbstractTest):
     def load_flags(self):
         self.load_flags_auxv()
 
+
+is_zarch = re.match("^(s390x)", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_zarch,
+                    reason="Only for Linux and IBM Z")
+class Test_ZARCH_Features(AbstractTest):
+    features = ["VX", "VXE", "VXE2"]
+
+    def load_flags(self):
+        self.load_flags_auxv()
+
+
 is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE)
 @pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM")
 class Test_ARM_Features(AbstractTest):

diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
@@ -228,6 +228,7 @@ class _Config:
         x64 = "SSE SSE2 SSE3",
         ppc64 = '', # play it safe
         ppc64le = "VSX VSX2",
+        s390x = '',
         armhf = '', # play it safe
         aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD"
     )
@@ -293,6 +294,13 @@ class _Config:
         VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
         ## Power9/ISA 3.00
         VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
+        # IBM/Z
+        ## VX(z13) support
+        VX = dict(interest=1, headers="vecintrin.h"),
+        ## Vector-Enhancements Facility
+        VXE = dict(interest=2, implies="VX", implies_detect=False),
+        ## Vector-Enhancements Facility 2
+        VXE2 = dict(interest=3, implies="VXE", implies_detect=False),
         # ARM
         NEON  = dict(interest=1, headers="arm_neon.h"),
         NEON_FP16 = dict(interest=2, implies="NEON"),
@@ -472,6 +480,23 @@ class attribute `conf_features`, also its override
 
             return partial
 
+        on_zarch = self.cc_on_s390x
+        if on_zarch:
+            partial = dict(
+                VX = dict(
+                    flags="-march=arch11 -mzvector"
+                ),
+                VXE = dict(
+                    flags="-march=arch12", implies_detect=False
+                ),
+                VXE2 = dict(
+                    flags="-march=arch13", implies_detect=False
+                )
+            )
+
+            return partial
+
+
         if self.cc_on_aarch64 and is_unix: return dict(
             NEON = dict(
                 implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True
@@ -919,6 +944,7 @@ def __init__(self):
             ("cc_on_ppc64",    ".*(powerpc|ppc)64.*"),
             ("cc_on_aarch64",  ".*(aarch64|arm64).*"),
             ("cc_on_armhf",    ".*arm.*"),
+            ("cc_on_s390x",    ".*s390x.*"),
             # undefined platform
             ("cc_on_noarch",    ""),
         )
@@ -983,7 +1009,8 @@ def __init__(self):
             self.cc_is_gcc = True
 
         self.cc_march = "unknown"
-        for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"):
+        for arch in ("x86", "x64", "ppc64", "ppc64le", 
+                     "armhf", "aarch64", "s390x"):
             if getattr(self, "cc_on_" + arch):
                 self.cc_march = arch
                 break

diff --git a/numpy/distutils/checks/cpu_vx.c b/numpy/distutils/checks/cpu_vx.c
@@ -0,0 +1,16 @@
+#if (__VEC__ < 10301) || (__ARCH__ < 11)
+    #error VX not supported
+#endif
+
+#include <vecintrin.h>
+int main(int argc, char **argv)
+{
+    __vector double x = vec_abs(vec_xl(argc, (double*)argv));
+    __vector double y = vec_load_len((double*)argv, (unsigned int)argc);
+
+    x = vec_round(vec_ceil(x) + vec_floor(y));
+    __vector bool long long m = vec_cmpge(x, y);
+    __vector long long i = vec_signed(vec_sel(x, y, m));
+
+    return (int)vec_extract(i, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vxe.c b/numpy/distutils/checks/cpu_vxe.c
@@ -0,0 +1,25 @@
+#if (__VEC__ < 10302) || (__ARCH__ < 12)
+    #error VXE not supported
+#endif
+
+#include <vecintrin.h>
+int main(int argc, char **argv)
+{
+    __vector float x = vec_nabs(vec_xl(argc, (float*)argv));
+    __vector float y = vec_load_len((float*)argv, (unsigned int)argc);
+
+    x = vec_round(vec_ceil(x) + vec_floor(y));
+    __vector bool int m = vec_cmpge(x, y);
+    x = vec_sel(x, y, m);
+
+    // need to test the existence of intrin "vflls" since vec_doublee
+    // maps to the wrong intrin "vfll".
+    // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
+#if defined(__GNUC__) && !defined(__clang__)
+    __vector long long i = vec_signed(__builtin_s390_vflls(x));
+#else
+    __vector long long i = vec_signed(vec_doublee(x));
+#endif
+
+    return (int)vec_extract(i, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vxe2.c b/numpy/distutils/checks/cpu_vxe2.c
@@ -0,0 +1,21 @@
+#if (__VEC__ < 10303) || (__ARCH__ < 13)
+    #error VXE2 not supported
+#endif
+
+#include <vecintrin.h>
+
+int main(int argc, char **argv)
+{
+    int val;
+    __vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' };
+    __vector signed short search = { 'g', 'h', 'g', 'o' };
+    __vector unsigned char len = { 0 };
+    __vector unsigned char res = vec_search_string_cc(large, search, len, &val);
+    __vector float x = vec_xl(argc, (float*)argv);
+    __vector int i = vec_signed(x);
+
+    i = vec_srdb(vec_sldb(i, i, 2), i, 3);
+    val += (int)vec_extract(res, 1);
+    val += vec_extract(i, 0);
+    return val;
+}
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
@@ -47,7 +47,8 @@ def initialize_options(self):
             - not part of dispatch-able features(--cpu-dispatch)
             - not supported by compiler or platform
         """
-        self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD"
+        self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F" \
+                         " AVX512_SKX VSX VSX2 VSX3 NEON ASIMD VX VXE VXE2"
 
     def finalize_options(self):
         build_scripts = self.build_scripts

diff --git a/numpy/distutils/fcompiler/gnu.py b/numpy/distutils/fcompiler/gnu.py
@@ -324,7 +324,7 @@ def _universal_flags(self, cmd):
             c_archs[c_archs.index("i386")] = "i686"
         # check the arches the Fortran compiler supports, and compare with
         # arch flags from C compiler
-        for arch in ["ppc", "i686", "x86_64", "ppc64"]:
+        for arch in ["ppc", "i686", "x86_64", "ppc64", "s390x"]:
             if _can_target(cmd, arch) and arch in c_archs:
                 arch_flags.extend(["-arch", arch])
         return arch_flags