From a3e099495357b16489298bf2d40030b3415a14f0 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 13 May 2019 15:57:43 -0700 Subject: [PATCH 1/2] ENH: use gather instruction in AVX exp/log instead of loadu (1) AVX2 and AVX512F supports gather_ps instruction to load strided data into a register. Rather than resort to scalar, these provide good benefit when the input array is strided. (2) Added some tests to validate AVX based algorithms for exp and log. The tests compare the output of float32 against glibc's float64 implementation and verify maxulp error. --- numpy/core/src/umath/loops.c.src | 7 ++- numpy/core/src/umath/simd.inc.src | 79 ++++++++++++++++++++++++++----- numpy/core/tests/test_umath.py | 27 ++++++++++- 3 files changed, 95 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index a2649ed93d8d..f84d74efe113 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1626,12 +1626,11 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY /* * We use the AVX function to compute exp/log for scalar elements as well. * This is needed to ensure the output of strided and non-strided - * cases match. But this worsens the performance of strided arrays. - * There is plan to fix this in a subsequent patch by using gather - * instructions for strided arrays in the AVX function. + * cases match. SIMD code handles strided input cases, but not + * strided output. */ #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1); + @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]); #else const npy_float in1 = *(npy_float *)ip1; *(npy_float *)op1 = @scalarf@(in1); diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 1c6ac44260f5..8cf059095b2d 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -51,6 +51,15 @@ abs_ptrdiff(char *a, char *b) ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ ((abs_ptrdiff(args[1], args[0]) == 0)))) +/* + * output should be contiguous, can handle strided input data + */ +#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \ + (steps[1] == (esize) && \ + (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ + ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ + ((abs_ptrdiff(args[1], args[0]) == 0)))) + #define IS_BLOCKABLE_REDUCE(esize, vsize) \ (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ npy_is_aligned(args[1], (esize)) && \ @@ -134,15 +143,15 @@ abs_ptrdiff(char *a, char *b) #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS static NPY_INLINE void -@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n); +@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride); #endif static NPY_INLINE int run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps) { #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) { - @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]); + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) { + @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]); return 1; } else @@ -1127,14 +1136,24 @@ avx2_get_full_load_mask(void) } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 
__m256 -avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) +avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem) { float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float* addr = maskint + total_elem - num_elem; + float* addr = maskint + total_elem - num_lanes; return _mm256_loadu_ps(addr); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_masked_gather(__m256 src, + npy_float* addr, + __m256i vindex, + __m256 mask, + const int scale) +{ + return _mm256_mask_i32gather_ps(src, addr, vindex, mask, scale); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_masked_load(__m256 mask, npy_float* addr) { @@ -1220,6 +1239,16 @@ avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) return (0x0001 << num_elem) - 0x0001; } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_masked_gather(__m512 src, + npy_float* addr, + __m512i vindex, + __mmask16 kmask, + const int scale) +{ + return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, scale); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_masked_load(__mmask16 mask, npy_float* addr) { @@ -1310,11 +1339,18 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size) +@ISA@_exp_FLOAT(npy_float * op, + npy_float * ip, + const npy_intp array_size, + const npy_intp steps) { + const npy_intp stride = steps/sizeof(npy_float); const npy_int num_lanes = @BYTES@/sizeof(npy_float); npy_float xmax = 88.72283935546875f; npy_float xmin = -87.3365478515625f; + npy_int indexarr[16]; + for (npy_int ii = 0; ii < 16; ii++) + indexarr[ii] = ii*stride; /* Load up frequently used constants */ @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); @@ -1333,6 +1369,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); @vtype@ poly, num_poly, denom_poly, quadrant; + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); @vtype@i exponent; @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask; @@ -1344,8 +1381,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void if (num_remaining_elements < num_lanes) load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); - @vtype@ x = @isa@_masked_load(load_mask, ip); + num_lanes); + @vtype@ x; + if (stride == 1) + x = @isa@_masked_load(load_mask, ip); + else + x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask, 4); xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); @@ -1396,7 +1437,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @masked_store@(op, @cvtps_epi32@(load_mask), poly); - ip += num_lanes; + ip += num_lanes*stride; op += num_lanes; num_remaining_elements -= num_lanes; } @@ -1420,9 +1461,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void */ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size) +@ISA@_log_FLOAT(npy_float * op, + npy_float * ip, + const npy_intp array_size, + const npy_intp steps) { + const npy_intp stride = steps/sizeof(npy_float); const npy_int num_lanes = @BYTES@/sizeof(npy_float); + npy_int indexarr[16]; + for (npy_int 
ii = 0; ii < 16; ii++) + indexarr[ii] = ii*stride; /* Load up frequently used constants */ @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); @@ -1443,6 +1491,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr); @vtype@ poly, num_poly, denom_poly, exponent; @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; @@ -1455,8 +1504,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void if (num_remaining_elements < num_lanes) load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); - @vtype@ x_in = @isa@_masked_load(load_mask, ip); + num_lanes); + @vtype@ x_in; + if (stride == 1) + x_in = @isa@_masked_load(load_mask, ip); + else + x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask, 4); negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); @@ -1506,7 +1559,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @masked_store@(op, @cvtps_epi32@(load_mask), poly); - ip += num_lanes; + ip += num_lanes*stride; op += num_lanes; num_remaining_elements -= num_lanes; } diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 2c963b5eb993..cd2034d9cee9 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -13,7 +13,7 @@ from numpy.testing import ( assert_, assert_equal, assert_raises, assert_raises_regex, assert_array_equal, assert_almost_equal, assert_array_almost_equal, - assert_allclose, assert_no_warnings, suppress_warnings, + assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings, _gen_alignment_data ) @@ -678,6 +678,31 @@ def test_log_values(self): assert_raises(FloatingPointError, np.log, np.float32(-np.inf)) assert_raises(FloatingPointError, np.log, np.float32(-1.0)) +class TestExpLogFloat32(object): + def test_exp_float32(self): + np.random.seed(42) + x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000)) + x_f64 = np.float64(x_f32) + assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6) + + def test_log_float32(self): + np.random.seed(42) + x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000)) + x_f64 = np.float64(x_f32) + assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9) + + def test_strided_exp_log_float32(self): + np.random.seed(42) + strides = np.random.randint(low=-100, high=100, size=100) + sizes = np.random.randint(low=1, high=2000, size=100) + for ii in sizes: + x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii)) + exp_true = np.exp(x_f32) + log_true = np.log(x_f32) + for jj in strides: + assert_equal(np.exp(x_f32[::jj]), exp_true[::jj]) + assert_equal(np.log(x_f32[::jj]), log_true[::jj]) + class TestLogAddExp(_FilterInvalids): def test_logaddexp_values(self): x = [1, 2, 3, 4, 5] From 59b2a1d08592b11b82b82b6c53aead689648262e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 18 May 2019 07:10:57 -0700 Subject: [PATCH 2/2] BUG: fixing build issues on clang6.0 --- numpy/core/src/umath/simd.inc.src | 49 +++++++++++++++++++------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8cf059095b2d..6ddd6fcc61cb 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -1148,10 +1148,9 @@ 
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_masked_gather(__m256 src, npy_float* addr, __m256i vindex, - __m256 mask, - const int scale) + __m256 mask) { - return _mm256_mask_i32gather_ps(src, addr, vindex, mask, scale); + return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 @@ -1243,10 +1242,9 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_masked_gather(__m512 src, npy_float* addr, __m512i vindex, - __mmask16 kmask, - const int scale) + __mmask16 kmask) { - return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, scale); + return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 @@ -1349,8 +1347,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void npy_float xmax = 88.72283935546875f; npy_float xmin = -87.3365478515625f; npy_int indexarr[16]; - for (npy_int ii = 0; ii < 16; ii++) + for (npy_int ii = 0; ii < 16; ii++) { indexarr[ii] = ii*stride; + } /* Load up frequently used constants */ @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); @@ -1379,14 +1378,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) + if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask(num_remaining_elements, num_lanes); + } + @vtype@ x; - if (stride == 1) + if (stride == 1) { x = @isa@_masked_load(load_mask, ip); - else - x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask, 4); + } + else { + x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + } xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); @@ -1442,8 +1445,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void num_remaining_elements -= num_lanes; } - if (@mask_to_int@(overflow_mask)) + if (@mask_to_int@(overflow_mask)) { npy_set_floatstatus_overflow(); + } } /* @@ -1469,8 +1473,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void const npy_intp stride = steps/sizeof(npy_float); const npy_int num_lanes = @BYTES@/sizeof(npy_float); npy_int indexarr[16]; - for (npy_int ii = 0; ii < 16; ii++) + for (npy_int ii = 0; ii < 16; ii++) { indexarr[ii] = ii*stride; + } /* Load up frequently used constants */ @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); @@ -1502,14 +1507,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) + if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask(num_remaining_elements, num_lanes); + } + @vtype@ x_in; - if (stride == 1) + if (stride == 1) { x_in = @isa@_masked_load(load_mask, ip); - else - x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask, 4); + } + else { + x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + } negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); @@ -1564,10 +1573,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void num_remaining_elements -= num_lanes; } - if (@mask_to_int@(invalid_mask)) + if (@mask_to_int@(invalid_mask)) { npy_set_floatstatus_invalid(); - if (@mask_to_int@(divide_by_zero_mask)) + } + if (@mask_to_int@(divide_by_zero_mask)) { npy_set_floatstatus_divbyzero(); + } } #endif /**end repeat**/
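
For context, the sketch below (not part of the patch series) shows how an AVX2 masked gather of the kind used above pulls every stride-th float32 element into a single register. It assumes an AVX2-capable CPU and a compiler invoked with something like -mavx2; the variable names (src, idx, fill, stride) are illustrative only. The scale argument of _mm256_mask_i32gather_ps must be a compile-time constant (1, 2, 4 or 8), which is likely why patch 2/2 hardcodes 4 inside the helper instead of passing it as a function parameter.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    float src[32];
    for (int i = 0; i < 32; i++) {
        src[i] = (float)i;
    }

    const int stride = 3;          /* gather every 3rd element          */
    int idx[8];
    for (int i = 0; i < 8; i++) {
        idx[i] = i * stride;       /* element offsets, not byte offsets */
    }

    __m256i vindex = _mm256_loadu_si256((__m256i const *)idx);
    __m256  fill   = _mm256_setzero_ps();                        /* used where mask is clear */
    __m256  mask   = _mm256_castsi256_ps(_mm256_set1_epi32(-1)); /* all lanes active         */

    /* Each lane loads from src + vindex*4 bytes; the scale must be an immediate. */
    __m256 x = _mm256_mask_i32gather_ps(fill, src, vindex, mask, 4);

    float out[8];
    _mm256_storeu_ps(out, x);
    for (int i = 0; i < 8; i++) {
        printf("%g ", out[i]);     /* prints 0 3 6 9 12 15 18 21 */
    }
    printf("\n");
    return 0;
}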
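
The new TestExpLogFloat32 tests bound the maximum ULP error of the float32 exp/log output against a float64 reference via assert_array_max_ulp. A rough standalone C analogue of that check is sketched below; it is not the actual test code, it uses libm's expf as a stand-in for the vectorized float32 path, and the float_to_ordered helper is ad hoc. It measures ULP distance as the gap between the ordered integer encodings of the two float32 values.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Map a float to a monotonically ordered integer (a common ULP-distance trick). */
static int32_t float_to_ordered(float f)
{
    int32_t i;
    memcpy(&i, &f, sizeof(i));
    return (i < 0) ? INT32_MIN - i : i;
}

int main(void)
{
    long long max_ulp = 0;
    for (int k = 0; k < 1000; k++) {
        float x = 88.1f * (float)k / 1000.0f;     /* same range as the exp test  */
        float got = expf(x);                      /* float32 path under test     */
        float ref = (float)exp((double)x);        /* float64 reference, rounded  */
        long long d = (long long)float_to_ordered(got)
                    - (long long)float_to_ordered(ref);
        if (d < 0) {
            d = -d;
        }
        if (d > max_ulp) {
            max_ulp = d;
        }
    }
    printf("max ulp error: %lld\n", max_ulp);
    return 0;
}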