Thanks to visit codestin.com
Credit goes to github.com

Skip to content

ENH: Extending CPU feature detection framework to support IBM Z SIMD #20552

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/reference/simd/build-options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ Special Options
ARMHF ``NONE``
ARM64 A.K. AARCH64 ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
``ASIMD``
IBM/ZSYSTEM(S390X) ``NONE``
====================================== =======================================

- ``MAX``: Enables all supported CPU features by the compiler and platform.
Expand Down Expand Up @@ -338,7 +339,7 @@ that includes several sections, and each section has several values, representin
**Platform**:

- :enabled:`Architecture`: The architecture name of target CPU. It should be one of
``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``.
``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64``, ``s390x`` or ``unknown``.

- :enabled:`Compiler`: The compiler name. It should be one of
gcc, clang, msvc, icc, iccw or unix-like.
Expand Down
6 changes: 4 additions & 2 deletions doc/source/reference/simd/gen_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def wrapper_tab(title, table, tab_size=4):
pretty_names = {
"PPC64": "IBM/POWER big-endian",
"PPC64LE": "IBM/POWER little-endian",
"S390X": "IBM/ZSYSTEM(S390X)",
"ARMHF": "ARMv7/A32",
"AARCH64": "ARMv8/A64",
"ICC": "Intel Compiler",
Expand All @@ -170,7 +171,7 @@ def wrapper_tab(title, table, tab_size=4):
with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
fd.write(f'.. generated via {__file__}\n\n')
for arch in (
("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64")
("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X")
):
title = "On " + pretty_names.get(arch, arch)
table = Features(arch, 'gcc').table()
Expand All @@ -183,7 +184,8 @@ def wrapper_tab(title, table, tab_size=4):
("PPC64", ("clang",)),
("PPC64LE", ("clang",)),
("ARMHF", ("clang",)),
("AARCH64", ("clang",))
("AARCH64", ("clang",)),
("S390X", ("clang",))
):
arch_pname = pretty_names.get(arch, arch)
for cc in cc_names:
Expand Down
15 changes: 14 additions & 1 deletion doc/source/reference/simd/generated_tables/cpu_features.inc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py

On x86
~~~~~~
Expand Down Expand Up @@ -91,3 +91,16 @@ On ARMv8/A64
``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
============== ===========================================================

On IBM/ZSYSTEM(S390X)
~~~~~~~~~~~~~~~~~~~~~
.. table::
:align: left

======== ==============
Name Implies
======== ==============
``VX``
``VXE`` ``VX``
``VXE2`` ``VX`` ``VXE``
======== ==============

37 changes: 37 additions & 0 deletions numpy/core/src/common/npy_cpu_features.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ npy_cpu_features_dict(void)
* AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG,
* AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL,
* VSX, VSX2, VSX3,
* VX, VXE, VXE2,
* NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM#
*/
if (PyDict_SetItemString(dict, "@feature@",
Expand Down Expand Up @@ -509,6 +510,42 @@ npy__cpu_init_features(void)
#endif
}

/***************** ZARCH ******************/

#elif defined(__s390x__)

#include <sys/auxv.h>
/*
 * AT_HWCAP bit masks for the IBM Z vector facilities.
 *
 * Every mask used below gets a fallback definition so the build does not
 * depend on which spelling of these names the C library's <sys/auxv.h>
 * happens to provide. The numeric values are the bit positions reported
 * by the Linux kernel in AT_HWCAP on s390x.
 */
#ifndef HWCAP_S390_VX
    #define HWCAP_S390_VX 2048
#endif

#ifndef HWCAP_S390_VXE
    #define HWCAP_S390_VXE 8192
#endif

#ifndef HWCAP_S390_VXRS_EXT2
    #define HWCAP_S390_VXRS_EXT2 32768
#endif

/*
 * Fill npy__cpu_have[] with the vector features reported through AT_HWCAP.
 *
 * All entries are cleared first. If the base vector facility (VX) is not
 * reported, nothing is enabled. Otherwise VX is enabled; VXE2 being reported
 * also enables VXE, and VXE alone is enabled when only its own bit is set.
 */
static void
npy__cpu_init_features(void)
{
    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);

    // getauxval() returns unsigned long; keep the full width of the value.
    unsigned long hwcap = getauxval(AT_HWCAP);
    if ((hwcap & HWCAP_S390_VX) == 0) {
        // No base vector facility: leave every feature disabled.
        return;
    }
    npy__cpu_have[NPY_CPU_FEATURE_VX] = 1;

    if (hwcap & HWCAP_S390_VXRS_EXT2) {
        // VXE2 is treated as implying VXE, regardless of the VXE bit.
        npy__cpu_have[NPY_CPU_FEATURE_VXE] = 1;
        npy__cpu_have[NPY_CPU_FEATURE_VXE2] = 1;
        return;
    }

    npy__cpu_have[NPY_CPU_FEATURE_VXE] = (hwcap & HWCAP_S390_VXE) != 0;
}



/***************** ARM ******************/

#elif defined(__arm__) || defined(__aarch64__)
Expand Down
11 changes: 11 additions & 0 deletions numpy/core/src/common/npy_cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ enum npy_cpu_features
// ARMv8.2 single&half-precision multiply
NPY_CPU_FEATURE_ASIMDFHM = 307,

// IBM/ZARCH
NPY_CPU_FEATURE_VX = 350,

// Vector-Enhancements Facility 1
NPY_CPU_FEATURE_VXE = 351,

// Vector-Enhancements Facility 2
NPY_CPU_FEATURE_VXE2 = 352,

NPY_CPU_FEATURE_MAX
};

Expand Down Expand Up @@ -138,6 +147,7 @@ npy_cpu_features_dict(void);
* On aarch64: ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD']
* On ppc64: []
* On ppc64le: ['VSX', 'VSX2']
* On s390x: []
* On any other arch or if the optimization is disabled: []
*/
NPY_VISIBILITY_HIDDEN PyObject *
Expand All @@ -159,6 +169,7 @@ npy_cpu_baseline_list(void);
* On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
* On ppc64: ['VSX', 'VSX2', 'VSX3']
* On ppc64le: ['VSX3']
* On s390x: ['VX', 'VXE', 'VXE2']
* On any other arch or if the optimization is disabled: []
*/
NPY_VISIBILITY_HIDDEN PyObject *
Expand Down
11 changes: 11 additions & 0 deletions numpy/core/tests/test_cpu_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,17 @@ class Test_POWER_Features(AbstractTest):
def load_flags(self):
self.load_flags_auxv()


is_zarch = re.match("^(s390x)", machine, re.IGNORECASE)
@pytest.mark.skipif(not (is_linux and is_zarch),
                    reason="Only for Linux and IBM Z")
class Test_ZARCH_Features(AbstractTest):
    """Feature-detection test case for the IBM Z vector features.

    Skipped unless running on Linux with a machine name starting with
    ``s390x`` (case-insensitive). The checks themselves come from
    ``AbstractTest``; this class only supplies the feature names and
    the flag source.
    """

    # Feature names this test case covers.
    features = ["VX", "VXE", "VXE2"]

    def load_flags(self):
        """Load the CPU flags via the inherited ``load_flags_auxv``."""
        self.load_flags_auxv()


is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE)
@pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM")
class Test_ARM_Features(AbstractTest):
Expand Down
29 changes: 28 additions & 1 deletion numpy/distutils/ccompiler_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ class _Config:
x64 = "SSE SSE2 SSE3",
ppc64 = '', # play it safe
ppc64le = "VSX VSX2",
s390x = '',
armhf = '', # play it safe
aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD"
)
Expand Down Expand Up @@ -293,6 +294,13 @@ class _Config:
VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
## Power9/ISA 3.00
VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
# IBM/Z
## VX(z13) support
VX = dict(interest=1, headers="vecintrin.h"),
## Vector-Enhancements Facility
VXE = dict(interest=2, implies="VX", implies_detect=False),
## Vector-Enhancements Facility 2
VXE2 = dict(interest=3, implies="VXE", implies_detect=False),
# ARM
NEON = dict(interest=1, headers="arm_neon.h"),
NEON_FP16 = dict(interest=2, implies="NEON"),
Expand Down Expand Up @@ -472,6 +480,23 @@ class attribute `conf_features`, also its override

return partial

on_zarch = self.cc_on_s390x
if on_zarch:
partial = dict(
VX = dict(
flags="-march=arch11 -mzvector"
),
VXE = dict(
flags="-march=arch12", implies_detect=False
),
VXE2 = dict(
flags="-march=arch13", implies_detect=False
)
)

return partial


if self.cc_on_aarch64 and is_unix: return dict(
NEON = dict(
implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True
Expand Down Expand Up @@ -919,6 +944,7 @@ def __init__(self):
("cc_on_ppc64", ".*(powerpc|ppc)64.*"),
("cc_on_aarch64", ".*(aarch64|arm64).*"),
("cc_on_armhf", ".*arm.*"),
("cc_on_s390x", ".*s390x.*"),
# undefined platform
("cc_on_noarch", ""),
)
Expand Down Expand Up @@ -983,7 +1009,8 @@ def __init__(self):
self.cc_is_gcc = True

self.cc_march = "unknown"
for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"):
for arch in ("x86", "x64", "ppc64", "ppc64le",
"armhf", "aarch64", "s390x"):
if getattr(self, "cc_on_" + arch):
self.cc_march = arch
break
Expand Down
16 changes: 16 additions & 0 deletions numpy/distutils/checks/cpu_vx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Compiler probe for the VX feature (numpy/distutils/checks/cpu_vx.c).
// Compilation is rejected outright when __VEC__ is below 10301 or
// __ARCH__ is below 11.
#if (__VEC__ < 10301) || (__ARCH__ < 11)
#error VX not supported
#endif

#include <vecintrin.h>
// Exercises a set of double-precision vector intrinsics from <vecintrin.h>:
// vec_abs, vec_xl, vec_load_len, vec_round, vec_ceil, vec_floor, vec_cmpge,
// vec_sel, vec_signed and vec_extract.
// NOTE(review): inputs are taken from argc/argv, presumably so the calls
// cannot be folded away at compile time - confirm.
int main(int argc, char **argv)
{
// Two loads: one by offset (vec_xl) and one by length (vec_load_len).
__vector double x = vec_abs(vec_xl(argc, (double*)argv));
__vector double y = vec_load_len((double*)argv, (unsigned int)argc);

// Rounding family on double vectors.
x = vec_round(vec_ceil(x) + vec_floor(y));
// Comparison producing a boolean mask, then a select driven by that mask.
__vector bool long long m = vec_cmpge(x, y);
__vector long long i = vec_signed(vec_sel(x, y, m));

// Return a value derived from the vector result.
return (int)vec_extract(i, 0);
}
25 changes: 25 additions & 0 deletions numpy/distutils/checks/cpu_vxe.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Compiler probe for the VXE feature (numpy/distutils/checks/cpu_vxe.c).
// Compilation is rejected outright when __VEC__ is below 10302 or
// __ARCH__ is below 12.
#if (__VEC__ < 10302) || (__ARCH__ < 12)
#error VXE not supported
#endif

#include <vecintrin.h>
// Exercises single-precision vector intrinsics from <vecintrin.h>:
// vec_nabs, vec_xl, vec_load_len, vec_round, vec_ceil, vec_floor, vec_cmpge,
// vec_sel, a float-to-double widening, vec_signed and vec_extract.
// NOTE(review): inputs are taken from argc/argv, presumably so the calls
// cannot be folded away at compile time - confirm.
int main(int argc, char **argv)
{
// Two loads: one by offset (vec_xl) and one by length (vec_load_len).
__vector float x = vec_nabs(vec_xl(argc, (float*)argv));
__vector float y = vec_load_len((float*)argv, (unsigned int)argc);

// Rounding family on float vectors, then compare and select.
x = vec_round(vec_ceil(x) + vec_floor(y));
__vector bool int m = vec_cmpge(x, y);
x = vec_sel(x, y, m);

// need to test the existence of intrin "vflls" since vec_doublee
// maps to wrong intrin "vfll".
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
// GCC (but not clang) therefore calls the builtin directly.
#if defined(__GNUC__) && !defined(__clang__)
__vector long long i = vec_signed(__builtin_s390_vflls(x));
#else
__vector long long i = vec_signed(vec_doublee(x));
#endif

// Return a value derived from the vector result.
return (int)vec_extract(i, 0);
}
21 changes: 21 additions & 0 deletions numpy/distutils/checks/cpu_vxe2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Compiler probe for the VXE2 feature (numpy/distutils/checks/cpu_vxe2.c).
// Compilation is rejected outright when __VEC__ is below 10303 or
// __ARCH__ is below 13.
#if (__VEC__ < 10303) || (__ARCH__ < 13)
#error VXE2 not supported
#endif

#include <vecintrin.h>

// Exercises vec_search_string_cc, vec_xl, vec_signed on a float vector,
// vec_sldb, vec_srdb and vec_extract.
int main(int argc, char **argv)
{
// NOTE(review): val is not initialized here; it is passed by address to
// vec_search_string_cc below, which presumably stores a condition code
// into it before it is read - confirm against the intrinsic's docs.
int val;
__vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' };
__vector signed short search = { 'g', 'h', 'g', 'o' };
__vector unsigned char len = { 0 };
__vector unsigned char res = vec_search_string_cc(large, search, len, &val);
// Float load followed by a float-to-int conversion.
__vector float x = vec_xl(argc, (float*)argv);
__vector int i = vec_signed(x);

// Double-vector shifts by a bit count in each direction.
i = vec_srdb(vec_sldb(i, i, 2), i, 3);
// Fold all results into the return value.
val += (int)vec_extract(res, 1);
val += vec_extract(i, 0);
return val;
}
3 changes: 2 additions & 1 deletion numpy/distutils/command/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def initialize_options(self):
- not part of dispatch-able features(--cpu-dispatch)
- not supported by compiler or platform
"""
self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD"
self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F" \
" AVX512_SKX VSX VSX2 VSX3 NEON ASIMD VX VXE VXE2"

def finalize_options(self):
build_scripts = self.build_scripts
Expand Down
2 changes: 1 addition & 1 deletion numpy/distutils/fcompiler/gnu.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _universal_flags(self, cmd):
c_archs[c_archs.index("i386")] = "i686"
# check the arches the Fortran compiler supports, and compare with
# arch flags from C compiler
for arch in ["ppc", "i686", "x86_64", "ppc64"]:
for arch in ["ppc", "i686", "x86_64", "ppc64", "s390x"]:
if _can_target(cmd, arch) and arch in c_archs:
arch_flags.extend(["-arch", arch])
return arch_flags
Expand Down
Loading