From 809d00d0f6323b6a044f76e60c4f3ddda6a6310c Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Thu, 5 Oct 2023 19:17:56 +0300
Subject: [PATCH] BUG: Resolve build issue on ppc64 with Power9 or higher as
 baseline

  This fix addresses two issues:
   * Corrects the use of instructions unsupported by the assembler
     in the half-precision to double-precision conversion.
   * Resolves a code error related to variable naming during conversion.

  It also:
   * Implements a compile-time test on PPC64 to determine whether the
     assembler supports the Half/Double VSX3 instructions
   * Modifies the half-precision tests to deal with FP HW exceptions
---
 meson_cpu/ppc64/meson.build                   |  3 +++
 numpy/core/src/common/half.hpp                | 23 ++++++++----------
 numpy/core/tests/test_half.py                 | 24 ++++++++++++-------
 numpy/distutils/ccompiler_opt.py              |  3 ++-
 .../distutils/checks/extra_vsx3_half_double.c | 12 ++++++++++
 5 files changed, 43 insertions(+), 22 deletions(-)
 create mode 100644 numpy/distutils/checks/extra_vsx3_half_double.c

diff --git a/meson_cpu/ppc64/meson.build b/meson_cpu/ppc64/meson.build
index d14b23703fe3..986a57ee184c 100644
--- a/meson_cpu/ppc64/meson.build
+++ b/meson_cpu/ppc64/meson.build
@@ -26,6 +26,9 @@ VSX3 = mod_features.new(
   'VSX3', 3, implies: VSX2, args: {'val': '-mcpu=power9', 'match': '.*[mcpu=|vsx].*'},
   detect: {'val': 'VSX3', 'match': 'VSX.*'},
   test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0],
+  extra_tests: {
+    'VSX3_HALF_DOUBLE': files(source_root + '/numpy/distutils/checks/extra_vsx3_half_double.c')[0]
+  }
 )
 VSX4 = mod_features.new(
   'VSX4', 4, implies: VSX3, args: {'val': '-mcpu=power10', 'match': '.*[mcpu=|vsx].*'},
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
index 4d16e3bcc1c2..13dcd074283c 100644
--- a/numpy/core/src/common/half.hpp
+++ b/numpy/core/src/common/half.hpp
@@ -36,7 +36,7 @@ class Half final {
         #endif
         ) || (
             std::is_same_v<T, double> &&
-        #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3)
+        #if defined(NPY_HAVE_AVX512FP16) || (defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE))
             true
         #else
             false
@@ -73,11 +73,8 @@ class Half final {
     #if defined(NPY_HAVE_AVX512FP16)
         __m128d md = _mm_load_sd(&f);
         bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md))));
-    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
-        __vector double vf64 = vec_splats(f);
-        __vector unsigned short vf16;
-        __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64));
-        bits_ = vec_extract(vf16, 0);
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE)
+        __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits_) : "wa" (f));
     #else
         bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f));
     #endif
@@ -96,7 +93,7 @@ class Half final {
         __vector float vf32;
         __asm__ __volatile__("xvcvhpsp %x0,%x1"
                              : "=wa"(vf32)
-                             : "wa"(vec_splats(bits_.u)));
+                             : "wa"(vec_splats(bits_)));
         return vec_extract(vf32, 0);
     #else
         return BitCast<float>(half_private::ToFloatBits(bits_));
@@ -110,12 +107,12 @@ class Half final {
         double ret;
         _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_))));
         return ret;
-    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
-        __vector float vf64;
-        __asm__ __volatile__("xvcvhpdp %x0,%x1"
-                             : "=wa"(vf32)
-                             : "wa"(vec_splats(bits_)));
-        return vec_extract(vf64, 0);
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX3_HALF_DOUBLE)
+        double f64;
+        __asm__ __volatile__("xscvhpdp %x0,%x1"
+                             : "=wa"(f64)
+                             : "wa"(bits_));
+        return f64;
     #else
         return BitCast<double>(half_private::ToDoubleBits(bits_));
     #endif
diff --git a/numpy/core/tests/test_half.py b/numpy/core/tests/test_half.py
index ca849ad52ead..3e72eba8948a 100644
--- a/numpy/core/tests/test_half.py
+++ b/numpy/core/tests/test_half.py
@@ -21,8 +21,11 @@ def setup_method(self):
         # An array of all possible float16 values
         self.all_f16 = np.arange(0x10000, dtype=uint16)
         self.all_f16.dtype = float16
-        self.all_f32 = np.array(self.all_f16, dtype=float32)
-        self.all_f64 = np.array(self.all_f16, dtype=float64)
+
+        # NaN values can raise an invalid FP exception when HW conversion is used
+        with np.errstate(invalid='ignore'):
+            self.all_f32 = np.array(self.all_f16, dtype=float32)
+            self.all_f64 = np.array(self.all_f16, dtype=float64)
 
         # An array of all non-NaN float16 values, in sorted order
         self.nonan_f16 = np.concatenate(
@@ -44,14 +47,19 @@ def test_half_conversions(self):
         # value is preserved when converting to/from other floats.
 
         # Convert from float32 back to float16
-        b = np.array(self.all_f32, dtype=float16)
-        assert_equal(self.all_f16.view(dtype=uint16),
-                     b.view(dtype=uint16))
+        with np.errstate(invalid='ignore'):
+            b = np.array(self.all_f32, dtype=float16)
+        # avoid testing NaNs, whose bits may differ (quiet vs. signaling NaNs)
+        b_nn = b == b
+        assert_equal(self.all_f16[b_nn].view(dtype=uint16),
+                     b[b_nn].view(dtype=uint16))
 
         # Convert from float64 back to float16
-        b = np.array(self.all_f64, dtype=float16)
-        assert_equal(self.all_f16.view(dtype=uint16),
-                     b.view(dtype=uint16))
+        with np.errstate(invalid='ignore'):
+            b = np.array(self.all_f64, dtype=float16)
+        b_nn = b == b
+        assert_equal(self.all_f16[b_nn].view(dtype=uint16),
+                     b[b_nn].view(dtype=uint16))
 
         # Convert float16 to longdouble and back
         # This doesn't necessarily preserve the extra NaN bits,
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index 1e9de3c45bc0..d7a02c07be56 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -301,7 +301,8 @@ class _Config:
         ## Power8/ISA 2.07
         VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
         ## Power9/ISA 3.00
-        VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
+        VSX3 = dict(interest=3, implies="VSX2", implies_detect=False,
+                    extra_checks="VSX3_HALF_DOUBLE"),
         ## Power10/ISA 3.1
         VSX4 = dict(interest=4, implies="VSX3", implies_detect=False,
                     extra_checks="VSX4_MMA"),
diff --git a/numpy/distutils/checks/extra_vsx3_half_double.c b/numpy/distutils/checks/extra_vsx3_half_double.c
new file mode 100644
index 000000000000..514a2b18f96c
--- /dev/null
+++ b/numpy/distutils/checks/extra_vsx3_half_double.c
@@ -0,0 +1,12 @@
+/**
+ * Compile-time probe: the assembler may not fully support the following
+ * VSX3 scalar half<->double conversions, even though compilers report VSX3.
+ */
+int main(void)
+{
+    unsigned short bits = 0xFF;  /* input operand for the first conversion */
+    double f;
+    __asm__ __volatile__("xscvhpdp %x0,%x1" : "=wa"(f) : "wa"(bits));  /* half -> double */
+    __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits) : "wa" (f));  /* double -> half */
+    return bits;
+}