From 809d00d0f6323b6a044f76e60c4f3ddda6a6310c Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Thu, 5 Oct 2023 19:17:56 +0300 Subject: [PATCH] BUG: Resolve build issue on ppc64 with Power9 or higher as baseline This fix addresses two issues: * Corrects the use of unsupported instructions by the assembler in half-precision to double-precision conversion. * Resolves a code error related to variable naming during conversion. It also: * Implement a compile-time test on PPC64 to determine support for Half/Double VSX3 instructions by the assembler * Modify half-precision tests to deal with FP HW exceptions --- meson_cpu/ppc64/meson.build | 3 +++ numpy/core/src/common/half.hpp | 23 ++++++++---------- numpy/core/tests/test_half.py | 24 ++++++++++++------- numpy/distutils/ccompiler_opt.py | 3 ++- .../distutils/checks/extra_vsx3_half_double.c | 12 ++++++++++ 5 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 numpy/distutils/checks/extra_vsx3_half_double.c diff --git a/meson_cpu/ppc64/meson.build b/meson_cpu/ppc64/meson.build index d14b23703fe3..986a57ee184c 100644 --- a/meson_cpu/ppc64/meson.build +++ b/meson_cpu/ppc64/meson.build @@ -26,6 +26,9 @@ VSX3 = mod_features.new( 'VSX3', 3, implies: VSX2, args: {'val': '-mcpu=power9', 'match': '.*[mcpu=|vsx].*'}, detect: {'val': 'VSX3', 'match': 'VSX.*'}, test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0], + extra_tests: { + 'VSX3_HALF_DOUBLE': files(source_root + '/numpy/distutils/checks/extra_vsx3_half_double.c')[0] + } ) VSX4 = mod_features.new( 'VSX4', 4, implies: VSX3, args: {'val': '-mcpu=power10', 'match': '.*[mcpu=|vsx].*'}, diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp index 4d16e3bcc1c2..13dcd074283c 100644 --- a/numpy/core/src/common/half.hpp +++ b/numpy/core/src/common/half.hpp @@ -36,7 +36,7 @@ class Half final { #endif ) || ( std::is_same_v && - #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3) + #if defined(NPY_HAVE_AVX512FP16) || (defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE)) true #else false @@ -73,11 +73,8 @@ class Half final { #if defined(NPY_HAVE_AVX512FP16) __m128d md = _mm_load_sd(&f); bits_ = static_cast(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md)))); - #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) - __vector double vf64 = vec_splats(f); - __vector unsigned short vf16; - __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64)); - bits_ = vec_extract(vf16, 0); + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE) + __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits_) : "wa" (f)); #else bits_ = half_private::FromDoubleBits(BitCast(f)); #endif @@ -96,7 +93,7 @@ class Half final { __vector float vf32; __asm__ __volatile__("xvcvhpsp %x0,%x1" : "=wa"(vf32) - : "wa"(vec_splats(bits_.u))); + : "wa"(vec_splats(bits_))); return vec_extract(vf32, 0); #else return BitCast(half_private::ToFloatBits(bits_)); @@ -110,12 +107,12 @@ class Half final { double ret; _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_)))); return ret; - #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) - __vector float vf64; - __asm__ __volatile__("xvcvhpdp %x0,%x1" - : "=wa"(vf32) - : "wa"(vec_splats(bits_))); - return vec_extract(vf64, 0); + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE) + double f64; + __asm__ __volatile__("xscvhpdp %x0,%x1" + : "=wa"(f64) + : "wa"(bits_)); + return f64; #else return BitCast(half_private::ToDoubleBits(bits_)); #endif diff --git a/numpy/core/tests/test_half.py b/numpy/core/tests/test_half.py index ca849ad52ead..3e72eba8948a 100644 --- a/numpy/core/tests/test_half.py +++ b/numpy/core/tests/test_half.py @@ -21,8 +21,11 @@ def setup_method(self): # An array of all possible float16 values self.all_f16 = np.arange(0x10000, dtype=uint16) self.all_f16.dtype = float16 - self.all_f32 = np.array(self.all_f16, dtype=float32) - self.all_f64 = np.array(self.all_f16, dtype=float64) + + # NaN value can cause an invalid FP exception if HW is been used + with np.errstate(invalid='ignore'): + self.all_f32 = np.array(self.all_f16, dtype=float32) + self.all_f64 = np.array(self.all_f16, dtype=float64) # An array of all non-NaN float16 values, in sorted order self.nonan_f16 = np.concatenate( @@ -44,14 +47,19 @@ def test_half_conversions(self): # value is preserved when converting to/from other floats. # Convert from float32 back to float16 - b = np.array(self.all_f32, dtype=float16) - assert_equal(self.all_f16.view(dtype=uint16), - b.view(dtype=uint16)) + with np.errstate(invalid='ignore'): + b = np.array(self.all_f32, dtype=float16) + # avoid testing NaNs due to differ bits wither Q/SNaNs + b_nn = b == b + assert_equal(self.all_f16[b_nn].view(dtype=uint16), + b[b_nn].view(dtype=uint16)) # Convert from float64 back to float16 - b = np.array(self.all_f64, dtype=float16) - assert_equal(self.all_f16.view(dtype=uint16), - b.view(dtype=uint16)) + with np.errstate(invalid='ignore'): + b = np.array(self.all_f64, dtype=float16) + b_nn = b == b + assert_equal(self.all_f16[b_nn].view(dtype=uint16), + b[b_nn].view(dtype=uint16)) # Convert float16 to longdouble and back # This doesn't necessarily preserve the extra NaN bits, diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index 1e9de3c45bc0..d7a02c07be56 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -301,7 +301,8 @@ class _Config: ## Power8/ISA 2.07 VSX2 = dict(interest=2, implies="VSX", implies_detect=False), ## Power9/ISA 3.00 - VSX3 = dict(interest=3, implies="VSX2", implies_detect=False), + VSX3 = dict(interest=3, implies="VSX2", implies_detect=False, + extra_checks="VSX3_HALF_DOUBLE"), ## Power10/ISA 3.1 VSX4 = dict(interest=4, implies="VSX3", implies_detect=False, extra_checks="VSX4_MMA"), diff --git a/numpy/distutils/checks/extra_vsx3_half_double.c b/numpy/distutils/checks/extra_vsx3_half_double.c new file mode 100644 index 000000000000..514a2b18f96c --- /dev/null +++ b/numpy/distutils/checks/extra_vsx3_half_double.c @@ -0,0 +1,12 @@ +/** + * Assembler may not fully support the following VSX3 scalar + * instructions, even though compilers report VSX3 support. + */ +int main(void) +{ + unsigned short bits = 0xFF; + double f; + __asm__ __volatile__("xscvhpdp %x0,%x1" : "=wa"(f) : "wa"(bits)); + __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits) : "wa" (f)); + return bits; +}