From 97ba579bd17043b8885ff8e13970a2a38bd7a981 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Mon, 9 Nov 2020 16:35:36 +0800
Subject: [PATCH 1/4] Optimize the performance of multiply

---
 .../core/src/multiarray/einsum_sumprod.c.src | 193 +++++++++---------
 1 file changed, 91 insertions(+), 102 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index c58e742874d0..f5478bf8f81c 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -17,7 +17,8 @@
 
 #include "einsum_sumprod.h"
 #include "einsum_debug.h"
-
+#include "simd/simd.h"
+#include "common.h"
 
 #ifdef NPY_HAVE_SSE_INTRINSICS
 #define EINSUM_USE_SSE1 1
@@ -41,6 +42,28 @@
 
 #define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
 
+// ARM/Neon don't have instructions for aligned memory access
+#ifdef NPY_HAVE_NEON
+    #define EINSUM_IS_ALIGNED(x) 0
+#else
+    #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+#endif
+
+/**
+ * This macro is used to enable a scalar loop which advances 4 elements at a
+ * time, which appears after a main SIMD loop gated by `CHK` that unrolls by
+ * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop
+ * that finishes up all the remaining scalars. The purpose of the unrolled loop
+ * is to enable auto-vectorization in cases when all of the following are true:
+ *
+ * - optimization is allowed
+ * - either:
+ *   - we did not run the SIMD loop at all, due to NPYV being disabled.
+ *   - the SIMD loop was larger than 128bit, so there are likely to be many
+ *     elements left to process.
+ */
+#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
+
 /**********************************************/
 
 /**begin repeat
@@ -56,6 +79,10 @@
  *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *         npy_float, npy_float, npy_double, npy_longdouble,
  *         npy_float, npy_double, npy_longdouble#
+ * #sfx = s8, s16, s32, long, s64,
+ *        u8, u16, u32, ulong, u64,
+ *        half, f32, f64, longdouble,
+ *        f32, f64, clongdouble#
  * #to = ,,,,,
  *       ,,,,,
  *       npy_float_to_half,,,,
@@ -76,6 +103,10 @@
  *            0*5,
  *            0,0,1,0,
  *            0*3#
+ * #NPYV_CHK = 0*5,
+ *             0*5,
+ *             0, NPY_SIMD, NPY_SIMD_F64, 0,
+ *             0*3#
  */
 
 /**begin repeat1
@@ -250,115 +281,73 @@ static void
     @type@ *data0 = (@type@ *)dataptr[0];
     @type@ *data1 = (@type@ *)dataptr[1];
     @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, b;
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, b;
-#endif
-
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
                                                             (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            data_out[@i@] = @to@(@from@(data0[@i@]) *
-                                 @from@(data1[@i@]) +
-                                 @from@(data_out[@i@]));
-/**end repeat2**/
-        case 0:
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+    // NPYV check for @type@, in X86, 128bits intrinsics have a side effect in optimization
+#if @NPYV_CHK@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
-        EINSUM_IS_SSE_ALIGNED(data_out)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
-            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
-            _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
-            data0 += 8;
-            data1 += 8;
-            data_out += 8;
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                           EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_@sfx@;
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
         }
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
     }
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
-        EINSUM_IS_SSE_ALIGNED(data_out)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
-            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
-            _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
-            data0 += 8;
-            data1 += 8;
-            data_out += 8;
-        }
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
+    /**end repeat2**/
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ a@i@ = @from@(data0[@i@]);
+        const @type@ b@i@ = @from@(data1[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
     }
 #endif
-
-    /* Unroll the loop by 8 */
-    while (count >= 8) {
-        count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
-        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
-        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
-        _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
-        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
-        _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        data_out[@i@] = @to@(@from@(data0[@i@]) *
-                             @from@(data1[@i@]) +
-                             @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
-        data0 += 8;
-        data1 += 8;
-        data_out += 8;
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const @type@ a = @from@(*data0);
+        const @type@ b = @from@(*data1);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a * b + c);
     }
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
 }
 
 /* Some extra specializations for the two operand case */
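The kernel rewritten above computes data_out[i] = data0[i] * data1[i] + data_out[i] over three contiguous buffers, one vector of npyv_nlanes_@sfx@ lanes at a time, unrolled by four so that several independent load/muladd/store chains are in flight per iteration. The sketch below shows the same pattern with the .c.src templating expanded by hand for f32 only, without the 4x unroll and without the aligned (loada/storea) variant. The function name is illustrative, but npyv_load_f32, npyv_muladd_f32, npyv_store_f32, npyv_nlanes_f32 and npyv_cleanup are the NPYV calls the patch itself uses, and the sketch builds only inside the NumPy tree where "simd/simd.h" resolves:

    #include "simd/simd.h"  /* NumPy's universal SIMD (NPYV) layer */

    /* out[i] = a[i]*b[i] + out[i]: hand-expanded f32 instance (sketch) */
    static void
    f32_muladd_contig_sketch(const npy_float *a, const npy_float *b,
                             npy_float *out, npy_intp count)
    {
    #if NPY_SIMD
        const int vstep = npyv_nlanes_f32;  /* lanes per vector register */
        for (; count >= vstep; count -= vstep, a += vstep, b += vstep, out += vstep) {
            npyv_f32 va = npyv_load_f32(a);
            npyv_f32 vb = npyv_load_f32(b);
            npyv_f32 vc = npyv_load_f32(out);
            /* maps to a real FMA where the target has one, else mul+add */
            npyv_store_f32(out, npyv_muladd_f32(va, vb, vc));
        }
        npyv_cleanup();  /* e.g. lets AVX targets emit vzeroupper */
    #endif
        for (; count > 0; --count, ++a, ++b, ++out) {  /* scalar tail */
            *out = *a * *b + *out;
        }
    }

The design point of the patch: the old code dispatched on SSE1/SSE2 per element type, while the NPYV version is a single templated loop that serves every SIMD extension NumPy is built for, with the register width entering only through npyv_nlanes_@sfx@.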
From 594dd5d97ec9989f19de96f064930a955478b9a4 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Mon, 16 Nov 2020 19:24:40 +0800
Subject: [PATCH 2/4] fix misleading comment

---
 numpy/core/src/multiarray/einsum_sumprod.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index f5478bf8f81c..c9ab71e28523 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -283,7 +283,7 @@ static void
     @type@ *data_out = (@type@ *)dataptr[2];
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
                                                             (int)count);
-    // NPYV check for @type@, in X86, 128bits intrinsics have a side effect in optimization
+    // NPYV check for @type@
 #if @NPYV_CHK@
     /* Use aligned instructions if possible */
     const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
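For contrast with what patch 1 deleted: the old SSE2 path had no fused multiply-add and hard-coded both the register width and the unroll factor. Below is a condensed sketch of that removed pattern (plain C with SSE2 intrinsics, doubles, unaligned path only; the original additionally kept an aligned variant and finished small counts through a goto into a switch, which is restructured here into plain loops):

    #include <emmintrin.h>  /* SSE2 */

    /* out[i] = a[i]*b[i] + out[i], two doubles per __m128d register */
    static void
    f64_muladd_sse2_sketch(const double *a, const double *b,
                           double *out, long count)
    {
        for (; count >= 8; count -= 8, a += 8, b += 8, out += 8) {
            long i;
            /* the removed code unrolled these four steps by hand */
            for (i = 0; i < 8; i += 2) {
                __m128d m = _mm_mul_pd(_mm_loadu_pd(a + i), _mm_loadu_pd(b + i));
                __m128d s = _mm_add_pd(m, _mm_loadu_pd(out + i));
                _mm_storeu_pd(out + i, s);
            }
        }
        for (; count > 0; --count, ++a, ++b, ++out) {
            *out = *a * *b + *out;
        }
    }

Every new ISA would have needed another such block, which is exactly the per-architecture duplication the NPYV rewrite eliminates.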
From 95d6052902fc4763cbceee51ec08a3fff3dc6b1f Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Thu, 19 Nov 2020 11:04:30 +0800
Subject: [PATCH 3/4] optimize the remaining elements using npyv_load_tillz

---
 .../core/src/multiarray/einsum_sumprod.c.src | 27 ++++++-------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index c9ab71e28523..efe9a59db609 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -49,21 +49,6 @@
     #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
 #endif
 
-/**
- * This macro is used to enable a scalar loop which advances 4 elements at a
- * time, which appears after a main SIMD loop gated by `CHK` that unrolls by
- * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop
- * that finishes up all the remaining scalars. The purpose of the unrolled loop
- * is to enable auto-vectorization in cases when all of the following are true:
- *
- * - optimization is allowed
- * - either:
- *   - we did not run the SIMD loop at all, due to NPYV being disabled.
- *   - the SIMD loop was larger than 128bit, so there are likely to be many
- *     elements left to process.
- */
-#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
-
 /**********************************************/
 
 /**begin repeat
@@ -318,10 +303,14 @@ static void
         }
     }
     /**end repeat2**/
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+        npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+        npyv_@sfx@ c = npyv_load_tillz_@sfx@(data_out, count);
+        npyv_store_till_@sfx@(data_out, count, npyv_muladd_@sfx@(a, b, c));
+    }
     npyv_cleanup();
-#endif // NPYV check for @type@
-
-#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+#else
     for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
         /**begin repeat2
         * #i = 0, 1, 2, 3#
@@ -341,7 +330,7 @@ static void
         data_out[@i@] = @to@(abc@i@);
         /**end repeat2**/
     }
-#endif
+#endif // NPYV check for @type@
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
         const @type@ a = @from@(*data0);
         const @type@ b = @from@(*data1);
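The tail handling added here leans on NPYV's partial-access intrinsics: npyv_load_tillz_@sfx@(p, n) fills the first n lanes from memory and zeroes the rest, and npyv_store_till_@sfx@(p, n, v) writes back only the first n lanes. In the dead lanes the muladd computes 0 * 0 + 0, and those lanes are never stored, so no scalar tail loop and no out-of-bounds access is needed. A hand-expanded f32 sketch of just that tail, assuming an NPYV-enabled build and the same "simd/simd.h" context as the sketch above (the helper name is illustrative):

    /* masked tail: consumes the last count (< 4*vstep) elements */
    static void
    f32_muladd_tail_sketch(const npy_float *a, const npy_float *b,
                           npy_float *out, npy_intp count)
    {
        const int vstep = npyv_nlanes_f32;
        for (; count > 0; count -= vstep, a += vstep, b += vstep, out += vstep) {
            /* lanes at index >= count come back as 0.0f */
            npyv_f32 va = npyv_load_tillz_f32(a, count);
            npyv_f32 vb = npyv_load_tillz_f32(b, count);
            npyv_f32 vc = npyv_load_tillz_f32(out, count);
            /* only the first count lanes are written back */
            npyv_store_till_f32(out, count, npyv_muladd_f32(va, vb, vc));
        }
    }

Because the SIMD branch now consumes every element itself, the EINSUM_UNROLL_4_SCALARS machinery is no longer reachable from it, which is why this patch deletes the macro and demotes the 4-way scalar loop to the #else branch.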
From f921f0d13bb34d82503bfa2b3bff24d095bb9385 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Thu, 19 Nov 2020 16:44:08 +0800
Subject: [PATCH 4/4] add guard #ifndef NPY_DISABLE_OPTIMIZATION

---
 numpy/core/src/multiarray/einsum_sumprod.c.src | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index efe9a59db609..caba0e00ad29 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -311,6 +311,7 @@ static void
     }
     npyv_cleanup();
 #else
+#ifndef NPY_DISABLE_OPTIMIZATION
     for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
         /**begin repeat2
         * #i = 0, 1, 2, 3#
@@ -330,13 +331,15 @@ static void
         data_out[@i@] = @to@(abc@i@);
         /**end repeat2**/
     }
-#endif // NPYV check for @type@
+#endif // !NPY_DISABLE_OPTIMIZATION
     for (; count > 0; --count, ++data0, ++data1, ++data_out) {
         const @type@ a = @from@(*data0);
         const @type@ b = @from@(*data1);
         const @type@ c = @from@(*data_out);
         *data_out = @to@(a * b + c);
     }
+#endif // NPYV check for @type@
+
 }
 
 /* Some extra specializations for the two operand case */
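With all four patches applied, the non-SIMD branch is the only user of the unrolled scalar loop, and the new guard compiles that loop out when optimization is disabled, since its sole purpose is to give the compiler something it can auto-vectorize. It also retires the old EINSUM_UNROLL_4_SCALARS expression, whose use of defined() inside a macro expansion is undefined behavior in standard C. A hand-expanded sketch of the final #else branch (plain C, f32; the function name is illustrative):

    static void
    f32_muladd_scalar_sketch(const float *a, const float *b,
                             float *out, long count)
    {
    #ifndef NPY_DISABLE_OPTIMIZATION
        /* four independent lanes per iteration, no loop-carried
         * dependence: an optimizing compiler can vectorize this */
        for (; count >= 4; count -= 4, a += 4, b += 4, out += 4) {
            float r0 = a[0] * b[0] + out[0];
            float r1 = a[1] * b[1] + out[1];
            float r2 = a[2] * b[2] + out[2];
            float r3 = a[3] * b[3] + out[3];
            out[0] = r0; out[1] = r1; out[2] = r2; out[3] = r3;
        }
    #endif
        /* plain loop: the whole job under NPY_DISABLE_OPTIMIZATION,
         * otherwise just the remaining count % 4 elements */
        for (; count > 0; --count, ++a, ++b, ++out) {
            *out = *a * *b + *out;
        }
    }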