From e26dcf74c4cb5158dd9de8091f049d369b8b2361 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 12:10:37 +0800 Subject: [PATCH 01/27] new npyv intrinsics --- numpy/core/src/common/simd/avx2/arithmetic.h | 25 +++++++++++++++++++ .../core/src/common/simd/avx512/arithmetic.h | 25 +++++++++++++++++++ numpy/core/src/common/simd/neon/arithmetic.h | 19 ++++++++++++++ numpy/core/src/common/simd/sse/arithmetic.h | 24 ++++++++++++++++++ numpy/core/src/common/simd/vsx/arithmetic.h | 15 +++++++++++ 5 files changed, 108 insertions(+) diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 9d8b4ab5e62b..d61b4a0f66ae 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -62,6 +62,13 @@ #define npyv_mul_f32 _mm256_mul_ps #define npyv_mul_f64 _mm256_mul_pd +#ifdef NPY_HAVE_FMA3 + #define npyv_muladd_f32 _mm256_fmadd_ps + #define npyv_muladd_f64 _mm256_fmadd_pd +#else + #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) + #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) +#endif // saturated // TODO: after implment Packs intrins @@ -72,4 +79,22 @@ #define npyv_div_f32 _mm256_div_ps #define npyv_div_f64 _mm256_div_pd +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m256 a) +{ + __m128 t1 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a,1)); + __m128 t2 = _mm_movehdup_ps(t1); + __m128 t3 = _mm_add_ps(t1, t2); + __m128 t4 = _mm_movehl_ps(t3, t3); + __m128 t5 = _mm_add_ss(t3, t4); + return _mm_cvtss_f32(t5); +} + +NPY_FINLINE double npyv_sum_f64(__m256d a) +{ + __m128d t1 = _mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a,1)); + __m128d t2 = _mm_unpackhi_pd(t1, t1); + __m128d t3 = _mm_add_sd(t2, t1); + return _mm_cvtsd_f64(t3); +} #endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index fcaef0efd9a4..4b89c06c7321 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -103,6 +103,9 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_mul_f32 _mm512_mul_ps #define npyv_mul_f64 _mm512_mul_pd +#define npyv_muladd_f32 _mm512_fmadd_ps +#define npyv_muladd_f64 _mm512_fmadd_pd + // saturated // TODO: after implment Packs intrins @@ -112,5 +115,27 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: emulate integer division #define npyv_div_f32 _mm512_div_ps #define npyv_div_f64 _mm512_div_pd +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +} +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + 
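    // Each shuffle/add round folds the upper half of the active lanes onto
    // the lower half, so after three rounds lane 0 of sum8 holds the total.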
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +} #endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index ec8b8ecd0cfe..c7409e78fb53 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -60,6 +60,12 @@ #define npyv_mul_f32 vmulq_f32 #define npyv_mul_f64 vmulq_f64 +#ifdef NPY_HAVE_NEON_VFPV4 + #define npyv_muladd_f32(A, B, C) vfmaq_f32(C, A, B) +#else + #define npyv_muladd_f32(A, B, C) vmlaq_f32(C, A, B) +#endif +#define npyv_muladd_f64(A, B, C) vfmaq_f64(C, A, B) /*************************** * Division ***************************/ @@ -75,4 +81,17 @@ #endif #define npyv_div_f64 vdivq_f64 +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +#ifdef __aarch64__ + NPY_FINLINE double npyv_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + #endif // _NPY_SIMD_NEON_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 12d0af05cd15..62dc0d8cf4b0 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -82,6 +82,13 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_mul_f32 _mm_mul_ps #define npyv_mul_f64 _mm_mul_pd +#ifdef NPY_HAVE_FMA3 + #define npyv_muladd_f32 _mm_fmadd_ps + #define npyv_muladd_f64 _mm_fmadd_pd +#else + #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) + #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) +#endif // saturated // TODO: after implment Packs intrins @@ -92,4 +99,21 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_div_f32 _mm_div_ps #define npyv_div_f64 _mm_div_pd +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m128 a) +{ + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +} + +NPY_FINLINE double npyv_sum_f64(__m128d a) +{ + __m128 t0 = _mm_castpd_ps(a); + __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); + __m128d t2 = _mm_add_sd(a,t1); + return _mm_cvtsd_f64(t2); +} #endif // _NPY_SIMD_SSE_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index dd23b5b11e95..eb1aa20b11d7 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -94,10 +94,25 @@ #define npyv_mul_f32 vec_mul #define npyv_mul_f64 vec_mul +#define npyv_muladd_f32 vec_madd +#define npyv_muladd_f64 vec_madd + /*************************** * Division ***************************/ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div +// TODO: Horizontal add: Calculates the sum of all vector elements. 
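+// For now the reduction is done in scalar code: each lane is pulled out with
+// vec_extract and the results added, which is simple but leaves the faster
+// in-register VSX reduction as the TODO above.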
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1) + + vec_extract(a, 2) + vec_extract(a, 3); +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1); +} + #endif // _NPY_SIMD_VSX_ARITHMETIC_H From 47118fb6530e55f5f7164d2e652ff8b94f55d025 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 12:11:00 +0800 Subject: [PATCH 02/27] einsum dispatch and usimd process --- benchmarks/benchmarks/bench_linalg.py | 19 +- numpy/core/setup.py | 1 + numpy/core/src/multiarray/einsum.c.src | 1913 +---------------- .../core/src/multiarray/einsum.dispatch.c.src | 1530 +++++++++++++ numpy/core/src/multiarray/einsum_p.h | 48 + 5 files changed, 1603 insertions(+), 1908 deletions(-) create mode 100644 numpy/core/src/multiarray/einsum.dispatch.c.src create mode 100644 numpy/core/src/multiarray/einsum_p.h diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index dc2849d58380..a64fc05f78f1 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -112,11 +112,14 @@ class Einsum(Benchmark): def setup(self, dtype): self.a = np.arange(2900, dtype=dtype) self.b = np.arange(3000, dtype=dtype) + self.b1 = np.arange(240000, dtype=dtype).reshape(400, 600) self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40) self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40) + self.c2 = np.arange(480000, dtype=dtype) + self.c3 = np.arange(600, dtype=dtype) self.d = np.arange(10000, dtype=dtype).reshape(10,100,10) - #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two + # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): np.einsum("i,j", self.a, self.b, optimize=True) @@ -130,4 +133,16 @@ def time_einsum_sum_mul(self, dtype): # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): - np.einsum("i...,->", self.d, 300, optimize=True) \ No newline at end of file + np.einsum("i...,->", self.d, 300, optimize=True) + + # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two + def time_einsum_mul(self, dtype): + np.einsum("i,->i", self.c2, 300, optimize=True) + + # trigger contig_contig_outstride0_two + def time_einsum_contig_contig(self, dtype): + np.einsum("ji,i->", self.b1, self.c3, optimize=True) + + # trigger sum_of_products_contig_outstride0_one + def time_einsum_contig_outstride0(self, dtype): + np.einsum("i->", self.c2, optimize=True) \ No newline at end of file diff --git a/numpy/core/setup.py b/numpy/core/setup.py index aede12080017..e854fc0cae5b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -853,6 +853,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum.dispatch.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 2538e05c626a..94b22641daef 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -8,1917 +8,18 @@ * See LICENSE.txt for the license. 
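  *
  * (The bulk of this file -- the type-specialized sum-of-products inner
  * loops -- is moved into einsum.dispatch.c.src by this patch, so those
  * loops can be compiled once per enabled CPU dispatch target.)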
*/ -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "convert.h" -#include "common.h" -#include "ctors.h" - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 -#else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include -#endif - -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/********** PRINTF DEBUG TRACING **************/ -#define NPY_EINSUM_DBG_TRACING 0 - -#if NPY_EINSUM_DBG_TRACING -#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); -#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); -#else -#define NPY_EINSUM_DBG_PRINT(s) -#define NPY_EINSUM_DBG_PRINT1(s, p1) -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) -#endif -/**********************************************/ - -/**begin repeat - * #name = byte, short, int, long, longlong, - * ubyte, ushort, uint, ulong, ulonglong, - * half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_float, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble# - * #to = ,,,,, - * ,,,,, - * npy_float_to_half,,,, - * ,,# - * #from = ,,,,, - * ,,,,, - * npy_half_to_float,,,, - * ,,# - * #complex = 0*5, - * 0*5, - * 0*4, - * 1*3# - * #float32 = 0*5, - * 0*5, - * 0,1,0,0, - * 0*3# - * #float64 = 0*5, - * 0*5, - * 0,0,1,0, - * 0*3# - */ - -/**begin repeat1 - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ -static void -@name@_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data_out += stride_out; -# elif @nop@ == 2 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data_out += stride_out; -# elif @nop@ == 3 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - 
int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + - ((@temptype@ *)data_out)[0]; - ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + - ((@temptype@ *)data_out)[1]; - data0 += stride0; - data_out += stride_out; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } -} - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_one(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data_out = (@type@ *)dataptr[1]; - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - case 0: - return; - } - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else /* complex */ - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 2 && !@complex@ - -static void -@name@_sum_of_products_contig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* 
Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -/* Some extra specializations for the two operand case */ -static void -@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = 
_mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } -} - -static void -@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), 
value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* 
Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 3 && !@complex@ - -static void -@name@_sum_of_products_contig_three(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data2 = (@type@ *)dataptr[2]; - @type@ *data_out = (@type@ *)dataptr[3]; - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data2 += 8; - data_out += 8; - } - - /* Finish off the loop */ - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - if (count-- == 0) { - return; - } - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -} - -#else /* @nop@ > 3 || @complex */ - -static void -@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(@type@); - } -#else /* complex */ -# if @nop@ <= 3 -# define _SUMPROD_NOP @nop@ -# else -# define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += sizeof(@type@); - } -# undef _SUMPROD_NOP -#endif - } -} - -#endif /* functions for various @nop@ */ - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); 
-#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; -#else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#endif /* @nop@ == 1 */ - -static void -@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; -#else - @temptype@ accum = 0; -#endif - -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - accum += @from@(*(@type@ *)data0); - data0 += stride0; -# elif @nop@ == 2 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1); - data0 += stride0; - data1 += stride1; -# elif @nop@ == 3 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2); - data0 += stride0; - data1 += stride1; - data2 += stride2; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - accum += temp; - for (i = 0; i < nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - accum_re += ((@temptype@ *)data0)[0]; - accum_im += ((@temptype@ *)data0)[1]; - data0 += stride0; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - accum_re += re; - accum_im += im; - for (i = 0; i < _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } - -#if @complex@ -# if @nop@ <= 3 - ((@temptype@ *)dataptr[@nop@])[0] += accum_re; - ((@temptype@ *)dataptr[@nop@])[1] += accum_im; -# else - ((@temptype@ *)dataptr[nop])[0] += accum_re; - ((@temptype@ *)dataptr[nop])[1] += accum_im; -# endif -#else -# if @nop@ <= 3 - *((@type@ *)dataptr[@nop@]) = @to@(accum + - @from@(*((@type@ *)dataptr[@nop@]))); -# else - *((@type@ *)dataptr[nop]) = @to@(accum + - @from@(*((@type@ *)dataptr[nop]))); -# endif -#endif - -} - -/**end repeat1**/ - -/**end 
repeat**/ - - -/* Do OR of ANDs for the boolean type */ - -/**begin repeat - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ - -static void -bool_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - while (count--) { -#if @nop@ == 1 - *(npy_bool *)data_out = *(npy_bool *)data0 || - *(npy_bool *)data_out; - data0 += stride0; - data_out += stride_out; -#elif @nop@ == 2 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data_out += stride_out; -#elif @nop@ == 3 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } -} - -static void -bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; -#endif - -#if (@nop@ <= 3) -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat1 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -# if @nop@ == 1 - ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 2 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 3 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@] && - ((npy_bool *)data2)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# endif -/**end repeat1**/ - case 0: - return; - } -#endif - -/* Unroll the loop by 8 for fixed-size nop */ -#if (@nop@ <= 3) - while (count >= 8) { - count -= 8; -#else - while (count--) { -#endif - -# if @nop@ == 1 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 2 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 3 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@)) && - (*((npy_bool *)data2 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 
8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data2 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(npy_bool); - } -# endif - } - - /* If the loop was unrolled, we need to finish it off */ -#if (@nop@ <= 3) - goto finish_after_unrolled_loop; -#endif -} - -static void -bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ - npy_bool accum = 0; - -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - while (count--) { -#if @nop@ == 1 - accum = *(npy_bool *)data0 || accum; - data0 += stride0; -#elif @nop@ == 2 - accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; - data0 += stride0; - data1 += stride1; -#elif @nop@ == 3 - accum = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || accum; - data0 += stride0; - data1 += stride1; - data2 += stride2; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - accum = temp || accum; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } - -# if @nop@ <= 3 - *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); -# else - *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); -# endif -} - -/**end repeat**/ - -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - -/* These tables need to match up with the type enum */ -static sum_of_products_fn -_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ - &@name@_sum_of_products_contig_outstride0_one, -#else - NULL, -#endif -/**end repeat**/ -}; /* End of _contig_outstride0_unary_specialization_table */ - -static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 0, 0, 0, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_stride0_contig_outstride0_two, - &@name@_sum_of_products_stride0_contig_outcontig_two, - &@name@_sum_of_products_contig_stride0_outstride0_two, - &@name@_sum_of_products_contig_stride0_outcontig_two, - &@name@_sum_of_products_contig_contig_outstride0_two, -}, -#else - {NULL, NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _binary_specialization_table */ - -static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - 
* short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_outstride0_any, - &@name@_sum_of_products_outstride0_one, - &@name@_sum_of_products_outstride0_two, - &@name@_sum_of_products_outstride0_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _outstride0_specialized_table */ - -static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_contig_any, - &@name@_sum_of_products_contig_one, - &@name@_sum_of_products_contig_two, - &@name@_sum_of_products_contig_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _allcontig_specialized_table */ - -static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_any, - &@name@_sum_of_products_one, - &@name@_sum_of_products_two, - &@name@_sum_of_products_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _unnspecialized_table */ +#include "einsum_p.h" static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides) +get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) { - int iop; - - if (type_num >= NPY_NTYPES) { - return NULL; - } - - /* contiguous reduction */ - if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { - sum_of_products_fn ret = - _contig_outstride0_unary_specialization_table[type_num]; - if (ret != NULL) { - return ret; - } - } - - /* nop of 2 has more specializations */ - if (nop == 2) { - /* Encode the zero/contiguous strides */ - int code; - code = (fixed_strides[0] == 0) ? 0 : - (fixed_strides[0] == itemsize) ? 2*2*1 : 8; - code += (fixed_strides[1] == 0) ? 0 : - (fixed_strides[1] == itemsize) ? 2*1 : 8; - code += (fixed_strides[2] == 0) ? 0 : - (fixed_strides[2] == itemsize) ? 1 : 8; - if (code >= 2 && code < 7) { - sum_of_products_fn ret = - _binary_specialization_table[type_num][code-2]; - if (ret != NULL) { - return ret; - } - } - } - - /* Inner loop with an output stride of 0 */ - if (fixed_strides[nop] == 0) { - return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* Check for all contiguous */ - for (iop = 0; iop < nop + 1; ++iop) { - if (fixed_strides[iop] != itemsize) { - break; - } - } - - /* Contiguous loop */ - if (iop == nop + 1) { - return _allcontig_specialized_table[type_num][nop <= 3 ? 
nop : 0]; - } - - /* None of the above specializations caught it, general loops */ - return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; + #ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, + (nop, type_num, itemsize, fixed_strides)) } - /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src new file mode 100644 index 000000000000..bfb5075e3594 --- /dev/null +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -0,0 +1,1530 @@ +/* + * This file contains the implementation of the 'einsum' function, + * which provides an einstein-summation operation. + * + * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) + * The University of British Columbia + * + * See LICENSE.txt for the license. + */ +/** + * @targets $maxopt baseline + * SSE2 (AVX2 FMA3) AVX512F + * NEON NEON_VFPV4 + * VSX VSX2 + */ +#include "einsum_p.h" + +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_cfloat, npy_cdouble, npy_clongdouble# + * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_float, npy_float, npy_double, npy_longdouble, + * npy_float, npy_double, npy_longdouble# +* #sfx = s8, s16, s32, long, s64, + * u8, u16, u32, ulong, u64, + * half, f32, f64, longdouble, + * f32, f64, clongdouble# + * #to = ,,,,, + * ,,,,, + * npy_float_to_half,,,, + * ,,# + * #from = ,,,,, + * ,,,,, + * npy_half_to_float,,,, + * ,,# + * #complex = 0*5, + * 0*5, + * 0*4, + * 1*3# + * #float32 = 0*5, + * 0*5, + * 0,1,0,0, + * 0*3# + * #float64 = 0*5, + * 0*5, + * 0,0,1,0, + * 0*3# + * #NPYV_CHK = 0*5, + * 0*5, + * 0, NPY_SIMD, NPY_SIMD_F64, 0, + * 0*3# + * #unroll_by = 0*5, + * 0*5, + * 0,2, 4, 0, + * 0*3# + */ +/**begin repeat1 + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ +static void +@name@_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); + + while (count--) { +#if !@complex@ +# if @nop@ == 1 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data_out += stride_out; +# elif @nop@ == 2 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data_out += stride_out; +# elif @nop@ == 3 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) * + @from@(*(@type@ *)data2) + + 
@from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +# else + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +# endif +#else /* complex */ +# if @nop@ == 1 + ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + + ((@temptype@ *)data_out)[0]; + ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + + ((@temptype@ *)data_out)[1]; + data0 += stride0; + data_out += stride_out; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } +} + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_one(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data_out = (@type@ *)dataptr[1]; + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + case 0: + return; + } + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else /* complex */ + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +#elif @nop@ == 2 && !@complex@ + +static void +@name@_sum_of_products_contig_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data_out = (@type@ *)dataptr[2]; + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", + (int)count); +#if @NPYV_CHK@ // NPYV check for @type@ + /* Use aligned instructions if possible */ + #ifndef NPY_HAVE_NEON + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && + EINSUM_IS_ALIGNED(data_out); + #else + // ARM/Neon don't have instructions for aligned memory access + 
const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, b0, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, b1, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
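+ *
+ *    (Illustrative note: on 256-bit AVX2 npyv_nlanes_f32 is 8 and float32
+ *    takes the x2 path above, so one SIMD iteration covers 16 elements;
+ *    anything shorter than a full vector step falls through to the scalar
+ *    loops below.)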
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ a@i@ = @from@(data0[@i@]);
+        const @type@ b@i@ = @from@(data1[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const @type@ a = @from@(*data0);
+        const @type@ b = @from@(*data1);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a * b + c);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @type@ *data_out = (@type@ *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(va_scalar, b0, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(va_scalar, b1, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
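+ *
+ *    (Note: the stride-0 operand is broadcast once with npyv_setall before
+ *    the SIMD loop above, so the hot loop reads only two memory streams.)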
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ b@i@ = @from@(data1[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a_scalar * b@i@ + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data1, ++data_out) {
+        const @type@ b = @from@(*data1);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a_scalar * b + c);
+    }
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
+    @type@ *data_out = (@type@ *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, vb_scalar, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, vb_scalar, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
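+ *
+ *    (The if(is_aligned)/else bodies generated by the repeat2 block above
+ *    are identical except that they pick the loada/storea or the unaligned
+ *    load/store flavour of the intrinsics.)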
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ a@i@ = @from@(data0[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a@i@ * b_scalar + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data_out) {
+        const @type@ a = @from@(*data0);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a * b_scalar + c);
+    }
+}
+
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @temptype@ accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum);
+            npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3);
+            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2);
+            vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
+            vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
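+ *
+ *    (Both the SIMD reduction above and this unrolled loop re-associate the
+ *    summation, so floating-point results can differ from a strictly serial
+ *    accumulation in the last bits of precision.)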
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+        /**end repeat2**/
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data1) {
+        const @type@ a = @from@(*data0);
+        const @type@ b = @from@(*data1);
+        accum += a * b;
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+}
+
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @temptype@ accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data1);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
+            npyv_@sfx@ b23 = npyv_add_@sfx@(b2, b3);
+            npyv_@sfx@ b0123 = npyv_add_@sfx@(b01, b23);
+            vaccum = npyv_add_@sfx@(b0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) {
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
+            vaccum = npyv_add_@sfx@(b01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
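+ *
+ *    (Since one operand has stride 0, the loops above only sum the
+ *    contiguous operand; the scalar factor is applied once at the very end
+ *    as out += a_scalar * accum, saving a multiply per element.)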
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data1 += 4) {
+        const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
+        const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
+        accum += b01 + b23;
+    }
+#endif
+    for (; count > 0; --count, ++data1) {
+        accum += @from@(*data1);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + a_scalar * accum);
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
+    @temptype@ accum = 0;
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3);
+            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+            vaccum = npyv_add_@sfx@(a0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            vaccum = npyv_add_@sfx@(a01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
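+ *
+ *    (npyv_cleanup() above ends the SIMD block cleanly; on x86 it typically
+ *    expands to a zeroupper/zeroall to avoid AVX-SSE transition penalties,
+ *    and it is a no-op on targets that don't need it.)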
+ */ +#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + for (; count >= 4; count -= 4, data0 += 4) { + const @type@ a01 = @from@(data0[0]) + @from@(data0[1]); + const @type@ a23 = @from@(data0[2]) + @from@(data0[3]); + accum += a01 + a23; + } +#endif + for (; count > 0; --count, ++data0) { + accum += @from@(*data0); + } + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + b_scalar * accum); +} + +#elif @nop@ == 3 && !@complex@ + +static void +@name@_sum_of_products_contig_three(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data2 = (@type@ *)dataptr[2]; + @type@ *data_out = (@type@ *)dataptr[3]; + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data2 += 8; + data_out += 8; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +#else /* @nop@ > 3 || @complex */ + +static void +@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", + (int)count); + + while (count--) { +#if !@complex@ + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(@type@); + } +#else /* complex */ +# if @nop@ <= 3 +# define _SUMPROD_NOP @nop@ +# else +# define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += sizeof(@type@); + } +# undef _SUMPROD_NOP +#endif + } +} + +#endif /* functions for various @nop@ */ + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if @complex@ + @temptype@ accum_re = 0, accum_im = 0; + @temptype@ *data0 = (@temptype@ *)dataptr[0]; +#else + @temptype@ accum = 0; + @type@ *data0 = (@type@ *)dataptr[0]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); +#if @NPYV_CHK@ // NPYV check for @type@ + /* Use aligned instructions if possible */ + #ifndef NPY_HAVE_NEON + const int is_aligned = EINSUM_IS_ALIGNED(data0); + #else + // ARM/Neon don't have instructions for aligned memory access + const int is_aligned = 0; + #endif + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + 
* #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3);
+            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+            vaccum = npyv_add_@sfx@(a0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            vaccum = npyv_add_@sfx@(a01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four/eight scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    #if @complex@
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const @temptype@ re01 = data0[0] + data0[2];
+        const @temptype@ re23 = data0[4] + data0[6];
+        const @temptype@ im13 = data0[1] + data0[3];
+        const @temptype@ im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+    #else
+    for (; count > 4; count -= 4, data0 += 4) {
+        const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]);
+        const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]);
+        accum += a01 + a23;
+    }
+    #endif // complex
+#endif
+#if @complex@
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((@temptype@ *)dataptr[1])[0] += accum_re;
+    ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+    for (; count > 0; --count, ++data0) {
+        accum += @from@(*data0);
+    }
+    *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
+#endif // complex
+}
+
+#endif /* @nop@ == 1 */
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+                               npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+    @temptype@ accum_re = 0, accum_im = 0;
+#else
+    @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+                          (int)count);
+
+    while (count--) {
+#if !@complex@
+# if @nop@ == 1
+        accum += @from@(*(@type@ *)data0);
+        data0 += stride0;
+# elif @nop@ == 2
+        accum += @from@(*(@type@ *)data0) *
+                 @from@(*(@type@ *)data1);
+        data0 += stride0;
+        data1 += stride1;
+# elif @nop@ == 3
+        accum += @from@(*(@type@ *)data0) *
+                 @from@(*(@type@ *)data1) *
+                 @from@(*(@type@ *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+# else
+        @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= @from@(*(@type@ *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+# endif
+#else /* complex */
+# if @nop@ == 1
+        accum_re +=
((@temptype@ *)data0)[0]; + accum_im += ((@temptype@ *)data0)[1]; + data0 += stride0; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + accum_re += re; + accum_im += im; + for (i = 0; i < _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } + +#if @complex@ +# if @nop@ <= 3 + ((@temptype@ *)dataptr[@nop@])[0] += accum_re; + ((@temptype@ *)dataptr[@nop@])[1] += accum_im; +# else + ((@temptype@ *)dataptr[nop])[0] += accum_re; + ((@temptype@ *)dataptr[nop])[1] += accum_im; +# endif +#else +# if @nop@ <= 3 + *((@type@ *)dataptr[@nop@]) = @to@(accum + + @from@(*((@type@ *)dataptr[@nop@]))); +# else + *((@type@ *)dataptr[nop]) = @to@(accum + + @from@(*((@type@ *)dataptr[nop]))); +# endif +#endif + +} + +/**end repeat1**/ + +/**end repeat**/ + + +/* Do OR of ANDs for the boolean type */ + +/**begin repeat + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ + +static void +bool_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + while (count--) { +#if @nop@ == 1 + *(npy_bool *)data_out = *(npy_bool *)data0 || + *(npy_bool *)data_out; + data0 += stride0; + data_out += stride_out; +#elif @nop@ == 2 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data_out += stride_out; +#elif @nop@ == 3 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } +} + +static void +bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; +#endif + +#if (@nop@ <= 3) +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat1 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +# if @nop@ == 1 + ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 2 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool *)data1)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 3 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool 
*)data1)[@i@] && + ((npy_bool *)data2)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# endif +/**end repeat1**/ + case 0: + return; + } +#endif + +/* Unroll the loop by 8 for fixed-size nop */ +#if (@nop@ <= 3) + while (count >= 8) { + count -= 8; +#else + while (count--) { +#endif + +# if @nop@ == 1 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 2 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 3 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@)) && + (*((npy_bool *)data2 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data2 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(npy_bool); + } +# endif + } + + /* If the loop was unrolled, we need to finish it off */ +#if (@nop@ <= 3) + goto finish_after_unrolled_loop; +#endif +} + +static void +bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ + npy_bool accum = 0; + +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + while (count--) { +#if @nop@ == 1 + accum = *(npy_bool *)data0 || accum; + data0 += stride0; +#elif @nop@ == 2 + accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; + data0 += stride0; + data1 += stride1; +#elif @nop@ == 3 + accum = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || accum; + data0 += stride0; + data1 += stride1; + data2 += stride2; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + accum = temp || accum; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } + +# if @nop@ <= 3 + *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); +# else + *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); +# endif +} + +/**end repeat**/ + +typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); + +/* These tables need to match up with the type enum */ +static sum_of_products_fn +_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 0, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ + 
&@name@_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 0,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_stride0_contig_outstride0_two,
+    &@name@_sum_of_products_stride0_contig_outcontig_two,
+    &@name@_sum_of_products_contig_stride0_outstride0_two,
+    &@name@_sum_of_products_contig_stride0_outcontig_two,
+    &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_outstride0_any,
+    &@name@_sum_of_products_outstride0_one,
+    &@name@_sum_of_products_outstride0_two,
+    &@name@_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_contig_any,
+    &@name@_sum_of_products_contig_one,
+    &@name@_sum_of_products_contig_two,
+    &@name@_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_any,
+    &@name@_sum_of_products_one,
+    &@name@_sum_of_products_two,
+    &@name@_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unspecialized_table */
+
+NPY_NO_EXPORT sum_of_products_fn NPY_CPU_DISPATCH_CURFX(einsum_get_sum_of_products_function)
+(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)
+{
+    int iop;
+
+    if (type_num >= NPY_NTYPES) {
+        return NULL;
+    }
+
+    /* contiguous reduction */
+    if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0)
{ + sum_of_products_fn ret = + _contig_outstride0_unary_specialization_table[type_num]; + if (ret != NULL) { + return ret; + } + } + + /* nop of 2 has more specializations */ + if (nop == 2) { + /* Encode the zero/contiguous strides */ + int code; + code = (fixed_strides[0] == 0) ? 0 : + (fixed_strides[0] == itemsize) ? 2*2*1 : 8; + code += (fixed_strides[1] == 0) ? 0 : + (fixed_strides[1] == itemsize) ? 2*1 : 8; + code += (fixed_strides[2] == 0) ? 0 : + (fixed_strides[2] == itemsize) ? 1 : 8; + if (code >= 2 && code < 7) { + sum_of_products_fn ret = + _binary_specialization_table[type_num][code-2]; + if (ret != NULL) { + return ret; + } + } + } + + /* Inner loop with an output stride of 0 */ + if (fixed_strides[nop] == 0) { + return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* Check for all contiguous */ + for (iop = 0; iop < nop + 1; ++iop) { + if (fixed_strides[iop] != itemsize) { + break; + } + } + + /* Contiguous loop */ + if (iop == nop + 1) { + return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* None of the above specializations caught it, general loops */ + return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; +} diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h new file mode 100644 index 000000000000..7abe1b7f7548 --- /dev/null +++ b/numpy/core/src/multiarray/einsum_p.h @@ -0,0 +1,48 @@ +#ifndef _NPY_EINSUM_P_H_ +#define _NPY_EINSUM_P_H_ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "structmember.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include +#include +#include +#include + +#include + +#include "simd/simd.h" +#include "convert.h" +#include "common.h" +#include "ctors.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) + +/********** PRINTF DEBUG TRACING **************/ +#define NPY_EINSUM_DBG_TRACING 0 + +#if NPY_EINSUM_DBG_TRACING +#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); +#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); +#else +#define NPY_EINSUM_DBG_PRINT(s) +#define NPY_EINSUM_DBG_PRINT1(s, p1) +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" +#endif + +typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, + (int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides)) + +#endif // _NPY_EINSUM_P_H_ From 55200fcf36b436dfa92773f14193243020d40f8a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 16:59:37 +0800 Subject: [PATCH 03/27] add float32 benchmark case --- benchmarks/benchmarks/bench_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index a64fc05f78f1..56f9a2e095c8 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -108,7 +108,7 @@ def time_numpy_linalg_lstsq_a__b_float64(self): class Einsum(Benchmark): param_names = ['dtype'] - params = [[np.float64]] + params = [[np.float32, np.float64]] def setup(self, dtype): self.a = np.arange(2900, dtype=dtype) self.b = np.arange(3000, dtype=dtype) From ae53e350dc4b367356e1268d4a5ca9a085e43cbc Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 
12 Aug 2020 16:57:02 +0800 Subject: [PATCH 04/27] fix typos --- numpy/core/src/common/simd/vsx/arithmetic.h | 2 +- numpy/core/src/multiarray/einsum_p.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index eb1aa20b11d7..9a67386f0986 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -103,7 +103,7 @@ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div -// TODO: Horizontal add: Calculates the sum of all vector elements. +// Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { return vec_extract(a, 0) + vec_extract(a, 1) + diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h index 7abe1b7f7548..6b50f01baa45 100644 --- a/numpy/core/src/multiarray/einsum_p.h +++ b/numpy/core/src/multiarray/einsum_p.h @@ -43,6 +43,6 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides)) + (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif // _NPY_EINSUM_P_H_ From 2e713b0b4b119dda516be1e1df1b9cbc18628f1f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 13 Aug 2020 14:53:41 +0800 Subject: [PATCH 05/27] add avx512 reduce sum comments --- numpy/core/src/common/simd/avx512/arithmetic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 4b89c06c7321..4e04e9f3f4db 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -115,6 +115,18 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: emulate integer division #define npyv_div_f32 _mm512_div_ps #define npyv_div_f64 _mm512_div_pd + +/*************************** + * Reduce Sum + * there are three ways to implement reduce sum for AVX512: + * 1- split(256) /add /split(128) /add /hadd /hadd /extract + * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract + * 3- _mm512_reduce_add_ps/pd + * The first one is been widely used by many projects while the second one is used by Intel Compiler and here + * the reason why the second preferred by intel compiler maybe because the latency of hadd increased by (2-3) + * starting from Skylake-X which makes two extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more clarification. + * The third one is almost the same as the second one but only works for intel compiler/GCC 7.1/Clang 4. 
+ ***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); From 5e7cbd1f51074500b5d5a304e75f84deb46fecf0 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 20 Aug 2020 17:07:07 +0800 Subject: [PATCH 06/27] add non_contigous arrays ,improve reduce the sum --- benchmarks/benchmarks/bench_linalg.py | 41 ++++++++++++------- numpy/core/src/common/simd/avx2/arithmetic.h | 21 +++++----- .../core/src/common/simd/avx512/arithmetic.h | 12 ++++-- numpy/core/src/common/simd/sse/arithmetic.h | 17 +++++--- 4 files changed, 56 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 56f9a2e095c8..04889265b591 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -110,39 +110,50 @@ class Einsum(Benchmark): param_names = ['dtype'] params = [[np.float32, np.float64]] def setup(self, dtype): - self.a = np.arange(2900, dtype=dtype) - self.b = np.arange(3000, dtype=dtype) - self.b1 = np.arange(240000, dtype=dtype).reshape(400, 600) - self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40) - self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40) - self.c2 = np.arange(480000, dtype=dtype) - self.c3 = np.arange(600, dtype=dtype) - self.d = np.arange(10000, dtype=dtype).reshape(10,100,10) + self.one_dim_small = np.arange(600, dtype=dtype) + self.one_dim = np.arange(3000, dtype=dtype) + self.one_dim_big = np.arange(480000, dtype=dtype) + self.two_dim_small = np.arange(1200, dtype=dtype).reshape(30, 40) + self.two_dim = np.arange(240000, dtype=dtype).reshape(400, 600) + self.three_dim_small = np.arange(10000, dtype=dtype).reshape(10,100,10) + self.three_dim = np.arange(24000, dtype=dtype).reshape(20, 30, 40) + # non_contigous arrays + self.non_contigous_dim1_small = np.arange(1, 80, 2, dtype=dtype) + self.non_contigous_dim1 = np.arange(1, 4000, 2, dtype=dtype) + self.non_contigous_dim2 = np.arange(1, 2400, 2, dtype=dtype).reshape(30, 40) + self.non_contigous_dim3 = np.arange(1, 48000, 2, dtype=dtype).reshape(20, 30, 40) # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): - np.einsum("i,j", self.a, self.b, optimize=True) + np.einsum("i,j", self.one_dim, self.one_dim, optimize=True) + np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) # multiply(a, b):trigger sum_of_products_contig_two def time_einsum_multiply(self, dtype): - np.einsum("..., ...", self.c1, self.c , optimize=True) + np.einsum("..., ...", self.two_dim_small, self.three_dim , optimize=True) + np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two def time_einsum_sum_mul(self, dtype): - np.einsum(",i...->", 300, self.d, optimize=True) + np.einsum(",i...->", 300, self.three_dim_small, optimize=True) + np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): - np.einsum("i...,->", self.d, 300, optimize=True) + np.einsum("i...,->", self.three_dim_small, 300, optimize=True) + np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two def time_einsum_mul(self, dtype): - np.einsum("i,->i", self.c2, 300, optimize=True) + np.einsum("i,->i", self.one_dim_big, 300, optimize=True) + 
np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) # trigger contig_contig_outstride0_two def time_einsum_contig_contig(self, dtype): - np.einsum("ji,i->", self.b1, self.c3, optimize=True) + np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True) + np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): - np.einsum("i->", self.c2, optimize=True) \ No newline at end of file + np.einsum("i->", self.one_dim_big, optimize=True) + np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index d61b4a0f66ae..084e854b4559 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -82,19 +82,20 @@ // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m256 a) { - __m128 t1 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a,1)); - __m128 t2 = _mm_movehdup_ps(t1); - __m128 t3 = _mm_add_ps(t1, t2); - __m128 t4 = _mm_movehl_ps(t3, t3); - __m128 t5 = _mm_add_ss(t3, t4); - return _mm_cvtss_f32(t5); + __m256 sum_halves = _mm256_hadd_ps(a, a); + sum_halves = _mm256_hadd_ps(sum_halves, sum_halves); + __m128 lo = _mm256_castps256_ps128(sum_halves); + __m128 hi = _mm256_extractf128_ps(sum_halves, 1); + __m128 sum = _mm_add_ps(lo, hi); + return _mm_cvtss_f32(sum); } NPY_FINLINE double npyv_sum_f64(__m256d a) { - __m128d t1 = _mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a,1)); - __m128d t2 = _mm_unpackhi_pd(t1, t1); - __m128d t3 = _mm_add_sd(t2, t1); - return _mm_cvtsd_f64(t3); + __m256d sum_halves = _mm256_hadd_pd(a, a); + __m128d lo = _mm256_castpd256_pd128(sum_halves); + __m128d hi = _mm256_extractf128_pd(sum_halves, 1); + __m128d sum = _mm_add_pd(lo, hi); + return _mm_cvtsd_f64(sum); } #endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 4e04e9f3f4db..def5df093b45 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -122,10 +122,14 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * 1- split(256) /add /split(128) /add /hadd /hadd /extract * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract * 3- _mm512_reduce_add_ps/pd - * The first one is been widely used by many projects while the second one is used by Intel Compiler and here - * the reason why the second preferred by intel compiler maybe because the latency of hadd increased by (2-3) - * starting from Skylake-X which makes two extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more clarification. - * The third one is almost the same as the second one but only works for intel compiler/GCC 7.1/Clang 4. + * The first one is been widely used by many projects + * + * the second one is used by Intel Compiler, maybe because the + * latency of hadd increased by (2-3) starting from Skylake-X which makes two + * extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info. + * + * The third one is almost the same as the second one but only works for + * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 62dc0d8cf4b0..74c9539240b8 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -102,18 +102,23 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m128 a) { +#ifdef NPY_HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else __m128 t1 = _mm_movehl_ps(a, a); __m128 t2 = _mm_add_ps(a, t1); __m128 t3 = _mm_shuffle_ps(t2, t2, 1); __m128 t4 = _mm_add_ss(t2, t3); - return _mm_cvtss_f32(t4); + return _mm_cvtss_f32(t4); +#endif } - NPY_FINLINE double npyv_sum_f64(__m128d a) { - __m128 t0 = _mm_castpd_ps(a); - __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); - __m128d t2 = _mm_add_sd(a,t1); - return _mm_cvtsd_f64(t2); +#ifdef NPY_HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif } #endif // _NPY_SIMD_SSE_ARITHMETIC_H From 90602314902f68e6e0971bf05ce695ec71a52ba8 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 24 Aug 2020 11:35:06 +0800 Subject: [PATCH 07/27] rebase after split for a better review --- numpy/core/setup.py | 2 +- .../core/src/common/simd/avx512/arithmetic.h | 8 + numpy/core/src/multiarray/einsum.c.src | 10 + .../core/src/multiarray/einsum.dispatch.c.src | 4 +- numpy/core/src/multiarray/einsum_p.h | 48 - .../core/src/multiarray/einsum_sumprod.c.src | 1897 ----------------- numpy/core/src/multiarray/einsum_sumprod.h | 27 +- 7 files changed, 45 insertions(+), 1951 deletions(-) delete mode 100644 numpy/core/src/multiarray/einsum_p.h delete mode 100644 numpy/core/src/multiarray/einsum_sumprod.c.src diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a4a84397d9ee..43ba1e22e661 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -855,7 +855,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), - join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'einsum.dispatch.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index def5df093b45..94ff185fcf5a 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -133,6 +133,9 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) ***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { +#ifdef NPY_HAVE_AVX512F_REDUCE + return _mm512_reduce_add_ps(a); +#else __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); __m512 sum32 = _mm512_add_ps(a, h64); __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); @@ -142,9 +145,13 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); __m512 sum4 = _mm512_add_ps(sum8, h4); return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +#endif } NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { +#ifdef NPY_HAVE_AVX512F_REDUCE + return _mm512_reduce_add_pd(a); +#else __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); __m512d sum32 = 
_mm512_add_pd(a, h64); __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); @@ -152,6 +159,7 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a) __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); __m512d sum8 = _mm512_add_pd(sum16, h16); return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +#endif } #endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index cfbee0fe9a18..01ca3111eb1e 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,6 +27,16 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" +static sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) +{ + #ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, + (nop, type_num, itemsize, fixed_strides)) +} + /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index bfb5075e3594..1c692518e3a8 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -13,8 +13,8 @@ * NEON NEON_VFPV4 * VSX VSX2 */ -#include "einsum_p.h" - +#include "einsum_sumprod.h" +#include "einsum_debug.h" /**begin repeat * #name = byte, short, int, long, longlong, * ubyte, ushort, uint, ulong, ulonglong, diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h deleted file mode 100644 index 6b50f01baa45..000000000000 --- a/numpy/core/src/multiarray/einsum_p.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _NPY_EINSUM_P_H_ -#define _NPY_EINSUM_P_H_ - -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "simd/simd.h" -#include "convert.h" -#include "common.h" -#include "ctors.h" - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) - -/********** PRINTF DEBUG TRACING **************/ -#define NPY_EINSUM_DBG_TRACING 0 - -#if NPY_EINSUM_DBG_TRACING -#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); -#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); -#else -#define NPY_EINSUM_DBG_PRINT(s) -#define NPY_EINSUM_DBG_PRINT1(s, p1) -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" -#endif - -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) - -#endif // _NPY_EINSUM_P_H_ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src deleted file mode 100644 index c58e742874d0..000000000000 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ /dev/null @@ -1,1897 +0,0 @@ -/* - * This file provides optimized sum of product implementations used internally - * by einsum. 
- * - * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) - * The University of British Columbia - * - * See LICENSE.txt for the license. - */ - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE - -#include -#include /* for NPY_NTYPES */ -#include - -#include "einsum_sumprod.h" -#include "einsum_debug.h" - - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 -#else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include -#endif - -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/**********************************************/ - -/**begin repeat - * #name = byte, short, int, long, longlong, - * ubyte, ushort, uint, ulong, ulonglong, - * half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_float, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble# - * #to = ,,,,, - * ,,,,, - * npy_float_to_half,,,, - * ,,# - * #from = ,,,,, - * ,,,,, - * npy_half_to_float,,,, - * ,,# - * #complex = 0*5, - * 0*5, - * 0*4, - * 1*3# - * #float32 = 0*5, - * 0*5, - * 0,1,0,0, - * 0*3# - * #float64 = 0*5, - * 0*5, - * 0,0,1,0, - * 0*3# - */ - -/**begin repeat1 - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ -static void -@name@_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data_out += stride_out; -# elif @nop@ == 2 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data_out += stride_out; -# elif @nop@ == 3 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + - ((@temptype@ *)data_out)[0]; - ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + - ((@temptype@ 
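/*
 * For readers new to .c.src files: each @name@/@type@/@to@/@from@ token in
 * the block above is substituted by NumPy's template preprocessor, emitting
 * one specialization per entry in the repeat lists. A sketch of what the
 * float, two-operand case expands to (illustrative, not the literal
 * generated text; @to@ and @from@ are empty for float):
 *
 *     static void
 *     float_sum_of_products_two(int nop, char **dataptr,
 *                               npy_intp const *strides, npy_intp count)
 *     {
 *         ...
 *         *(npy_float *)data_out = *(npy_float *)data0 *
 *                                  *(npy_float *)data1 +
 *                                  *(npy_float *)data_out;
 *         ...
 *     }
 */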
*)data_out)[1]; - data0 += stride0; - data_out += stride_out; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } -} - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_one(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data_out = (@type@ *)dataptr[1]; - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - case 0: - return; - } - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else /* complex */ - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 2 && !@complex@ - -static void -@name@_sum_of_products_contig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 
8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -/* Some extra specializations for the two operand case */ -static void -@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = _mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - 
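The goto/switch pairing running through all of these kernels is worth a gloss: the main loop retires eight elements per pass, and whatever remainder (0 to 7) is left falls through a reversed-case switch that sits ahead of the loop and is reached by goto. A stripped-down scalar sketch of the same control flow (illustrative only):

    static void
    sketch_sum_unrolled(const float *src, float *dst, npy_intp count)
    {
    finish_after_unrolled_loop:
        switch (count) {
            /* reversed cases fall through, touching exactly `count` tails */
            case 7: dst[6] += src[6];
            case 6: dst[5] += src[5];
            case 5: dst[4] += src[4];
            case 4: dst[3] += src[3];
            case 3: dst[2] += src[2];
            case 2: dst[1] += src[1];
            case 1: dst[0] += src[0];
            case 0: return;
        }
        while (count >= 8) {      /* counts of 8+ match no case above */
            count -= 8;
            for (int i = 0; i < 8; i++) {
                dst[i] += src[i];
            }
            src += 8;
            dst += 8;
        }
        goto finish_after_unrolled_loop;
    }

Placing the tail switch first keeps small counts, which dominate many einsum call patterns, on a short fall-through path.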
- /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } -} - -static void -@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = 
@to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
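The note above is easy to read past, so a concrete case helps: the vector path keeps four partial sums in SSE lanes and only combines them at the end, which is a different association than the strict left-to-right scalar loop and can round differently. A tiny self-contained demonstration with hand-picked values (hypothetical, chosen only to make the divergence visible):

    #include <stdio.h>

    int main(void)
    {
        float x[8] = {1e8f, 1.0f, 1.0f, 1.0f, -1e8f, 1.0f, 1.0f, 1.0f};
        float serial = 0.0f;
        for (int i = 0; i < 8; i++) {
            serial += x[i];          /* each 1.0f is absorbed into 1e8f */
        }
        float lane[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int i = 0; i < 8; i++) {
            lane[i % 4] += x[i];     /* what the SSE accumulator does */
        }
        float lanewise = (lane[0] + lane[1]) + (lane[2] + lane[3]);
        printf("serial=%g lanewise=%g\n", serial, lanewise);  /* 3 vs 6 */
        return 0;
    }

Both answers are valid floating-point sums of the same data; einsum simply does not promise a particular association here.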
- */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
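The two-shuffle reduction that follows each of these loops compresses four lane sums into one scalar: the first shuffle swaps adjacent lanes so the add produces pairwise sums, the second swaps the 64-bit halves so the add leaves the grand total in every lane, and _mm_store_ss writes out lane 0. Pulled out on its own, as a sketch using the same intrinsics:

    #include <xmmintrin.h>

    static float
    sketch_hsum_ps(__m128 v)
    {
        __m128 t;
        t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); /* (v1,v0,v3,v2) */
        v = _mm_add_ps(v, t);                              /* pairwise sums */
        t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2)); /* swap halves */
        v = _mm_add_ps(v, t);                              /* total in all lanes */
        return _mm_cvtss_f32(v);
    }

On SSE3 targets the same job is done by the _mm_hadd_ps pair shown in the sse/arithmetic.h hunk earlier in the series.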
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
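These stride0/outstride0 kernels rest on a small identity: when an operand never advances, sum_i(v * b[i]) equals v * sum_i(b[i]), so the hot loop accumulates b alone and the single multiply happens once at the end. A scalar sketch of the rewrite:

    static double
    sketch_scaled_sum(double v, const double *b, npy_intp n)
    {
        double accum = 0.0;
        for (npy_intp i = 0; i < n; i++) {
            accum += b[i];      /* no multiply in the loop body */
        }
        return v * accum;       /* one multiply instead of n */
    }

In exact arithmetic the two forms are identical; in floating point they can differ in the last bits, which is the same caveat the NOTE comments already make about reordering.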
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 3 && !@complex@ - -static void -@name@_sum_of_products_contig_three(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data2 = (@type@ *)dataptr[2]; - @type@ *data_out = (@type@ *)dataptr[3]; - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data2 += 8; - data_out += 8; - } - - /* Finish off the loop */ - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - if (count-- == 0) { - return; - } - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -} - -#else /* @nop@ > 3 || @complex */ - -static void 
-@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(@type@); - } -#else /* complex */ -# if @nop@ <= 3 -# define _SUMPROD_NOP @nop@ -# else -# define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += sizeof(@type@); - } -# undef _SUMPROD_NOP -#endif - } -} - -#endif /* functions for various @nop@ */ - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; -#else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
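The _mm_prefetch calls in these reduction loops ask for data well ahead of the current read position: data0 + 512 is pointer arithmetic on the element type, so for floats it points 2 KiB ahead, far enough that the cache line arrives before the loop reaches it. A minimal sketch of the pattern (issuing one hint per 16 elements is an illustrative choice, not what the generated code does):

    #include <xmmintrin.h>

    static float
    sketch_prefetched_sum(const float *data, npy_intp count)
    {
        float accum = 0.0f;
        for (npy_intp i = 0; i < count; i++) {
            if ((i & 15) == 0) {
                /* hint the line ~2 KiB ahead into all cache levels */
                _mm_prefetch((const char *)(data + i + 512), _MM_HINT_T0);
            }
            accum += data[i];
        }
        return accum;
    }

Prefetch hints past the end of the buffer are harmless; the hardware simply drops them.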
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#endif /* @nop@ == 1 */ - -static void -@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; -#else - @temptype@ accum = 0; -#endif - -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - accum += @from@(*(@type@ *)data0); - data0 += stride0; -# elif @nop@ == 2 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1); - data0 += stride0; - data1 += stride1; -# elif @nop@ == 3 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2); - data0 += stride0; - data1 += stride1; - data2 += stride2; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - accum += temp; - for (i = 0; i < nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - accum_re += ((@temptype@ *)data0)[0]; - accum_im += ((@temptype@ *)data0)[1]; - data0 += stride0; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - accum_re += re; - accum_im += im; - for (i = 0; i < _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } - -#if @complex@ -# if @nop@ <= 3 - ((@temptype@ *)dataptr[@nop@])[0] += accum_re; - ((@temptype@ *)dataptr[@nop@])[1] += accum_im; -# else - ((@temptype@ *)dataptr[nop])[0] += accum_re; - ((@temptype@ *)dataptr[nop])[1] += accum_im; -# endif -#else -# if @nop@ <= 3 - *((@type@ *)dataptr[@nop@]) = @to@(accum + - @from@(*((@type@ *)dataptr[@nop@]))); -# else - *((@type@ *)dataptr[nop]) = @to@(accum + - @from@(*((@type@ *)dataptr[nop]))); -# endif -#endif - -} - -/**end repeat1**/ - -/**end 
repeat**/ - - -/* Do OR of ANDs for the boolean type */ - -/**begin repeat - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ - -static void -bool_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - while (count--) { -#if @nop@ == 1 - *(npy_bool *)data_out = *(npy_bool *)data0 || - *(npy_bool *)data_out; - data0 += stride0; - data_out += stride_out; -#elif @nop@ == 2 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data_out += stride_out; -#elif @nop@ == 3 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } -} - -static void -bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; -#endif - -#if (@nop@ <= 3) -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat1 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -# if @nop@ == 1 - ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 2 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 3 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@] && - ((npy_bool *)data2)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# endif -/**end repeat1**/ - case 0: - return; - } -#endif - -/* Unroll the loop by 8 for fixed-size nop */ -#if (@nop@ <= 3) - while (count >= 8) { - count -= 8; -#else - while (count--) { -#endif - -# if @nop@ == 1 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 2 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 3 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@)) && - (*((npy_bool *)data2 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 
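For the boolean type, the "sum of products" above degenerates into an OR of ANDs: multiplication becomes logical AND and addition becomes logical OR, so a boolean einsum computes an any-of-all style result. The two-operand kernel in scalar sketch form:

    #include <numpy/npy_common.h>

    static void
    sketch_bool_sum_of_products_two(const npy_bool *a, const npy_bool *b,
                                    npy_bool *out, npy_intp count)
    {
        while (count--) {
            /* product -> AND, sum -> OR, accumulated into out */
            *out = (*a && *b) || *out;
            a++;
            b++;
            out++;
        }
    }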
8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data2 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(npy_bool); - } -# endif - } - - /* If the loop was unrolled, we need to finish it off */ -#if (@nop@ <= 3) - goto finish_after_unrolled_loop; -#endif -} - -static void -bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ - npy_bool accum = 0; - -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - while (count--) { -#if @nop@ == 1 - accum = *(npy_bool *)data0 || accum; - data0 += stride0; -#elif @nop@ == 2 - accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; - data0 += stride0; - data1 += stride1; -#elif @nop@ == 3 - accum = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || accum; - data0 += stride0; - data1 += stride1; - data2 += stride2; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - accum = temp || accum; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } - -# if @nop@ <= 3 - *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); -# else - *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); -# endif -} - -/**end repeat**/ - -/* These tables need to match up with the type enum */ -static sum_of_products_fn -_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ - &@name@_sum_of_products_contig_outstride0_one, -#else - NULL, -#endif -/**end repeat**/ -}; /* End of _contig_outstride0_unary_specialization_table */ - -static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 0, 0, 0, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_stride0_contig_outstride0_two, - &@name@_sum_of_products_stride0_contig_outcontig_two, - &@name@_sum_of_products_contig_stride0_outstride0_two, - &@name@_sum_of_products_contig_stride0_outcontig_two, - &@name@_sum_of_products_contig_contig_outstride0_two, -}, -#else - {NULL, NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _binary_specialization_table */ - -static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * 
float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_outstride0_any, - &@name@_sum_of_products_outstride0_one, - &@name@_sum_of_products_outstride0_two, - &@name@_sum_of_products_outstride0_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _outstride0_specialized_table */ - -static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_contig_any, - &@name@_sum_of_products_contig_one, - &@name@_sum_of_products_contig_two, - &@name@_sum_of_products_contig_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _allcontig_specialized_table */ - -static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_any, - &@name@_sum_of_products_one, - &@name@_sum_of_products_two, - &@name@_sum_of_products_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _unnspecialized_table */ - -NPY_VISIBILITY_HIDDEN sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides) -{ - int iop; - - if (type_num >= NPY_NTYPES) { - return NULL; - } - - /* contiguous reduction */ - if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { - sum_of_products_fn ret = - _contig_outstride0_unary_specialization_table[type_num]; - if (ret != NULL) { - return ret; - } - } - - /* nop of 2 has more specializations */ - if (nop == 2) { - /* Encode the zero/contiguous strides */ - int code; - code = (fixed_strides[0] == 0) ? 0 : - (fixed_strides[0] == itemsize) ? 2*2*1 : 8; - code += (fixed_strides[1] == 0) ? 0 : - (fixed_strides[1] == itemsize) ? 2*1 : 8; - code += (fixed_strides[2] == 0) ? 0 : - (fixed_strides[2] == itemsize) ? 1 : 8; - if (code >= 2 && code < 7) { - sum_of_products_fn ret = - _binary_specialization_table[type_num][code-2]; - if (ret != NULL) { - return ret; - } - } - } - - /* Inner loop with an output stride of 0 */ - if (fixed_strides[nop] == 0) { - return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* Check for all contiguous */ - for (iop = 0; iop < nop + 1; ++iop) { - if (fixed_strides[iop] != itemsize) { - break; - } - } - - /* Contiguous loop */ - if (iop == nop + 1) { - return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* None of the above specializations caught it, general loops */ - return _unspecialized_table[type_num][nop <= 3 ? 
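The stride encoding in get_sum_of_products_function above is compact enough to deserve a worked example. Each operand contributes 0 when its stride is zero, a weight of 4, 2, or 1 (operand 0, operand 1, output) when contiguous, and 8 otherwise, which pushes the code out of the accepted 2..6 window. For doubles (itemsize 8):

    fixed_strides = {0, 8, 8}   /* scalar * contiguous -> contiguous out */
    code = 0 + 2 + 1 = 3        /* _binary_specialization_table[...][3 - 2],
                                   i.e. stride0_contig_outcontig_two      */

The all-contiguous case {8, 8, 8} scores 4 + 2 + 1 = 7, deliberately outside the window, because it is picked up further down by the all-contiguous table instead.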
nop : 0]; -} diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index c6cf18ec6094..0a4e5bbd2718 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,12 +1,33 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "structmember.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #include +#include +#include +#include + +#include + +#include "simd/simd.h" +#include "convert.h" +#include "common.h" +#include "ctors.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#include + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" +#endif typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); -NPY_VISIBILITY_HIDDEN sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides); +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, + (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif From 1990c13d5d45ab925555566c1f106d501e649ba3 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 25 Aug 2020 11:59:24 +0800 Subject: [PATCH 08/27] restructure headers --- .../core/src/multiarray/einsum.dispatch.c.src | 7 +++++++ numpy/core/src/multiarray/einsum_sumprod.h | 18 ------------------ 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 1c692518e3a8..4a95d59c507c 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -13,8 +13,15 @@ * NEON NEON_VFPV4 * VSX VSX2 */ + +#define _MULTIARRAYMODULE +#include +#include #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" + /**begin repeat * #name = byte, short, int, long, longlong, * ubyte, ushort, uint, ulong, ulonglong, diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 0a4e5bbd2718..5a863767e29d 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,25 +1,7 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "simd/simd.h" -#include "convert.h" -#include "common.h" -#include "ctors.h" #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) -#include #ifndef NPY_DISABLE_OPTIMIZATION #include "einsum.dispatch.h" From 7b756af0c709d2caa9d484f11f274c44c690399e Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 25 Aug 2020 14:59:01 +0800 Subject: [PATCH 09/27] replace begin/repeat blocks with for loops for readability --- .../core/src/multiarray/einsum.dispatch.c.src | 68 ++++++------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 4a95d59c507c..613c6026043b 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -262,23 +262,13 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4;
count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); - npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); + npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(a, b, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -367,22 +357,12 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(va_scalar, b, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -464,22 +444,12 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(a, vb_scalar, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; From 4877e4062093e637f5b0715078fdeac19c211df5 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 26 Aug 2020 14:41:29 +0800 Subject: [PATCH 10/27] add ivdeps and handle header dependency --- benchmarks/benchmarks/bench_linalg.py | 2 +- .../core/src/multiarray/einsum.dispatch.c.src | 18 +++++++++++++----- numpy/core/src/multiarray/einsum_sumprod.h | 3 +++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 04889265b591..4ce14ac3ddf9 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -156,4 +156,4 @@ def time_einsum_contig_contig(self, dtype): # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): np.einsum("i->", self.one_dim_big, optimize=True) - 
np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file + np.einsum("i->", self.non_contigous_dim1, optimize=True) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 613c6026043b..e7741bb6f509 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -1,12 +1,13 @@ /* - * This file contains the implementation of the 'einsum' function, - * which provides an einstein-summation operation. + * This file provides optimized sum of product implementations used internally + * by einsum. * * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) * The University of British Columbia * * See LICENSE.txt for the license. */ + /** * @targets $maxopt baseline * SSE2 (AVX2 FMA3) AVX512F @@ -14,13 +15,11 @@ * VSX VSX2 */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include #include #include "einsum_sumprod.h" #include "einsum_debug.h" -#include "simd/simd.h" -#include "common.h" /**begin repeat * #name = byte, short, int, long, longlong, @@ -262,6 +261,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); @@ -357,6 +359,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); @@ -444,6 +449,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 5a863767e29d..5683b5b1851c 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,6 +1,9 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#include "simd/simd.h" +#include "common.h" + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) #ifndef NPY_DISABLE_OPTIMIZATION From 954e642a82d6138d22aead5d0ae608b8bd77cd48 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 27 Aug 2020 10:19:57 +0800 Subject: [PATCH 11/27] revert to faster simd code --- .../core/src/multiarray/einsum.dispatch.c.src | 81 ++++++++++++------- numpy/core/src/multiarray/einsum_sumprod.h | 8 +- 2 files changed, 56 insertions(+), 33 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index e7741bb6f509..d73ec872b1fa 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -20,6 +20,10 @@ #include #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) /**begin repeat * #name = byte, short, int, long, longlong, @@ -261,16 +265,23 @@ static void #if 
@unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); - npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(a, b, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -359,15 +370,22 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(va_scalar, b, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -449,15 +467,22 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(a, vb_scalar, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 5683b5b1851c..59c374434613 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,10 +1,8 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#include "simd/simd.h" -#include "common.h" - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#include "npy_cpu_dispatch.h" +#include #ifndef NPY_DISABLE_OPTIMIZATION #include "einsum.dispatch.h" @@ -12,7 +10,7 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, 
npy_intp); -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, +NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN sum_of_products_fn einsum_get_sum_of_products_function, (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif From 50c6b7ee1468be16d18e022546fb90d1ba9835ea Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 28 Aug 2020 11:39:25 +0800 Subject: [PATCH 12/27] changed to baseline solution --- numpy/core/setup.py | 2 +- numpy/core/src/multiarray/einsum.c.src | 10 ---------- .../{einsum.dispatch.c.src => einsum_sumprod.c.src} | 12 +++--------- numpy/core/src/multiarray/einsum_sumprod.h | 11 +++-------- 4 files changed, 7 insertions(+), 28 deletions(-) rename numpy/core/src/multiarray/{einsum.dispatch.c.src => einsum_sumprod.c.src} (99%) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 43ba1e22e661..a4a84397d9ee 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -855,7 +855,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), - join('src', 'multiarray', 'einsum.dispatch.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 01ca3111eb1e..cfbee0fe9a18 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,16 +27,6 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" -static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) -{ - #ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" - #endif - NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, - (nop, type_num, itemsize, fixed_strides)) -} - /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src similarity index 99% rename from numpy/core/src/multiarray/einsum.dispatch.c.src rename to numpy/core/src/multiarray/einsum_sumprod.c.src index d73ec872b1fa..8fff213bec2a 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -8,13 +8,6 @@ * See LICENSE.txt for the license. 
*/ -/** - * @targets $maxopt baseline - * SSE2 (AVX2 FMA3) AVX512F - * NEON NEON_VFPV4 - * VSX VSX2 - */ - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include @@ -1481,8 +1474,9 @@ static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { /**end repeat**/ }; /* End of _unnspecialized_table */ -NPY_NO_EXPORT sum_of_products_fn NPY_CPU_DISPATCH_CURFX(einsum_get_sum_of_products_function) -(int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides) +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides) { int iop; diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 59c374434613..a9fdc733d182 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,16 +1,11 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#include "npy_cpu_dispatch.h" #include -#ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" -#endif - typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); -NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) - +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides); #endif From 23e28c0112fa5d6d50d1f55020cc630a8d144129 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 31 Aug 2020 10:45:32 +0800 Subject: [PATCH 13/27] remove redundant typedef --- numpy/core/src/multiarray/einsum_sumprod.c.src | 2 -- 1 file changed, 2 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 8fff213bec2a..67b40e0fb178 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -1294,8 +1294,6 @@ bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, /**end repeat**/ -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - /* These tables need to match up with the type enum */ static sum_of_products_fn _contig_outstride0_unary_specialization_table[NPY_NTYPES] = { From d298c8e63987f7d292380a34936d8374cf8c8f2b Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 10 Sep 2020 21:06:03 +0800 Subject: [PATCH 14/27] remove redundant intrinsics --- numpy/core/src/common/simd/avx2/arithmetic.h | 7 ------- numpy/core/src/common/simd/avx512/arithmetic.h | 3 --- numpy/core/src/common/simd/neon/arithmetic.h | 6 ------ numpy/core/src/common/simd/sse/arithmetic.h | 8 +------- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 21a1fe8818bc..f00d8e153fe4 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -62,13 +62,6 @@ #define npyv_mul_f32 _mm256_mul_ps #define npyv_mul_f64 _mm256_mul_pd -#ifdef NPY_HAVE_FMA3 - #define npyv_muladd_f32 _mm256_fmadd_ps - #define npyv_muladd_f64 _mm256_fmadd_pd -#else - #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) - #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) -#endif // saturated // TODO: after implment Packs intrins diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h 
b/numpy/core/src/common/simd/avx512/arithmetic.h index 68752bf86146..a783e98ae94e 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -103,9 +103,6 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_mul_f32 _mm512_mul_ps #define npyv_mul_f64 _mm512_mul_pd -#define npyv_muladd_f32 _mm512_fmadd_ps -#define npyv_muladd_f64 _mm512_fmadd_pd - // saturated // TODO: after implment Packs intrins diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 7234928de964..ff31311d5dcf 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -60,12 +60,6 @@ #define npyv_mul_f32 vmulq_f32 #define npyv_mul_f64 vmulq_f64 -#ifdef NPY_HAVE_NEON_VFPV4 - #define npyv_muladd_f32(A, B, C) vfmaq_f32(C, A, B) -#else - #define npyv_muladd_f32(A, B, C) vmlaq_f32(C, A, B) -#endif -#define npyv_muladd_f64(A, B, C) vfmaq_f64(C, A, B) /*************************** * Division ***************************/ diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 32d2ec560807..e1e158ff41df 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -82,13 +82,6 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_mul_f32 _mm_mul_ps #define npyv_mul_f64 _mm_mul_pd -#ifdef NPY_HAVE_FMA3 - #define npyv_muladd_f32 _mm_fmadd_ps - #define npyv_muladd_f64 _mm_fmadd_pd -#else - #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) - #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) -#endif // saturated // TODO: after implment Packs intrins @@ -113,6 +106,7 @@ NPY_FINLINE float npyv_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } + NPY_FINLINE double npyv_sum_f64(__m128d a) { #ifdef NPY_HAVE_SSE3 From 6dac52e03bb4859ad0fb86345c93df0d8635e739 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 11 Sep 2020 09:24:16 +0800 Subject: [PATCH 15/27] add blank lines --- numpy/core/src/common/simd/vsx/arithmetic.h | 3 --- numpy/core/src/multiarray/einsum.c.src | 1 + numpy/core/src/multiarray/einsum_sumprod.h | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 70e3c05d42ab..5454b2eef2fc 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -94,9 +94,6 @@ #define npyv_mul_f32 vec_mul #define npyv_mul_f64 vec_mul -#define npyv_muladd_f32 vec_madd -#define npyv_muladd_f64 vec_madd - /*************************** * Division ***************************/ diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index cfbee0fe9a18..6ad375f670a5 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,6 +27,7 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" + /* * Parses the subscripts for one operand into an output of 'ndim' * labels. 
The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index a9fdc733d182..c6cf18ec6094 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -8,4 +8,5 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); NPY_VISIBILITY_HIDDEN sum_of_products_fn get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides); + #endif From 985e5b26492df2df347bac29a0c2859339421e2c Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 14 Sep 2020 10:25:15 +0800 Subject: [PATCH 16/27] add format --- numpy/core/src/multiarray/einsum_sumprod.c.src | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 67b40e0fb178..4d1d3f5f63ab 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -7,7 +7,7 @@ * * See LICENSE.txt for the license. */ - + #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include @@ -293,7 +293,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -395,7 +395,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -492,7 +492,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -583,7 +583,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -662,7 +662,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -735,7 +735,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -911,7 +911,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four/eight scalars in case of: From 88c27475c2d60c9246945bf7ebf7fc61cd594a5f Mon Sep 17 00:00:00 2001 From: Chunlin Date: Mon, 14 Sep 2020 19:29:59 +0800 Subject: [PATCH 17/27] Update numpy/core/src/common/simd/avx512/arithmetic.h Co-authored-by: Eric Wieser --- .../core/src/common/simd/avx512/arithmetic.h | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index a783e98ae94e..39d93be257d3 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -128,36 +128,33 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * The third one is almost the same as the second one but only works for * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ -NPY_FINLINE float npyv_sum_f32(npyv_f32 a) -{ #ifdef NPY_HAVE_AVX512F_REDUCE - return _mm512_reduce_add_ps(a); + #define npyv_sum_f32 _mm512_reduce_add_ps + #define npyv_sum_f64 _mm512_reduce_add_pd #else - __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 sum32 = _mm512_add_ps(a, h64); - __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); - __m512 sum16 = _mm512_add_ps(sum32, h32); - __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); - __m512 sum8 = _mm512_add_ps(sum16, h16); - __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); - __m512 sum4 = _mm512_add_ps(sum8, h4); - return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); + NPY_FINLINE float npyv_sum_f32(npyv_f32 a) + { + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); + } + NPY_FINLINE double npyv_sum_f64(npyv_f64 a) + { + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); + } #endif -} -NPY_FINLINE double npyv_sum_f64(npyv_f64 a) -{ -#ifdef NPY_HAVE_AVX512F_REDUCE - return _mm512_reduce_add_pd(a); -#else - __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); - __m512d sum32 = _mm512_add_pd(a, h64); - __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); - __m512d sum16 = _mm512_add_pd(sum32, h32); - __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); - __m512d sum8 = _mm512_add_pd(sum16, h16); - return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); -#endif -} /*************************** * FUSED From 54943e09f864417ce56c14268dfe00576a144929 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 15 Sep 2020 09:36:31 +0800 Subject: [PATCH 18/27] modify the int to npy_intp --- .../core/src/multiarray/einsum_sumprod.c.src | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 4d1d3f5f63ab..c5e3dc70e0a5 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -256,7 +256,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -277,7 +277,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -361,7 +361,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const 
npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -381,7 +381,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) { npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); @@ -458,7 +458,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -478,7 +478,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -556,7 +556,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -570,7 +570,7 @@ static void vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -638,7 +638,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -651,7 +651,7 @@ static void vaccum = npyv_add_@sfx@(b0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) { npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); @@ -711,7 +711,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -724,7 +724,7 @@ static void vaccum = npyv_add_@sfx@(a0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -887,7 +887,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -900,7 +900,7 @@ static void vaccum = npyv_add_@sfx@(a0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); From e993af2dca9b658cb08aa0111bc031a97dbe6430 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 11:27:02 +0800 Subject: [PATCH 19/27] split benchmark and define common macro --- benchmarks/benchmarks/bench_linalg.py | 35 ++++- 
numpy/core/src/multiarray/common.h | 4 + .../core/src/multiarray/einsum_sumprod.c.src | 129 +++++------------- 3 files changed, 69 insertions(+), 99 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 4ce14ac3ddf9..602a0cb6bb98 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -126,34 +126,55 @@ def setup(self, dtype): # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): np.einsum("i,j", self.one_dim, self.one_dim, optimize=True) - np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) # multiply(a, b):trigger sum_of_products_contig_two def time_einsum_multiply(self, dtype): np.einsum("..., ...", self.two_dim_small, self.three_dim , optimize=True) - np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two def time_einsum_sum_mul(self, dtype): np.einsum(",i...->", 300, self.three_dim_small, optimize=True) - np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): np.einsum("i...,->", self.three_dim_small, 300, optimize=True) - np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two def time_einsum_mul(self, dtype): np.einsum("i,->i", self.one_dim_big, 300, optimize=True) - np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) # trigger contig_contig_outstride0_two def time_einsum_contig_contig(self, dtype): np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True) - np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): np.einsum("i->", self.one_dim_big, optimize=True) - np.einsum("i->", self.non_contigous_dim1, optimize=True) + + # outer(a,b): non_contigous arrays + def time_einsum_noncon_outer(self, dtype): + np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) + + # multiply(a, b):non_contigous arrays + def time_einsum_noncon_multiply(self, dtype): + np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) + + # sum and multiply:non_contigous arrays + def time_einsum_noncon_sum_mul(self, dtype): + np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) + + # sum and multiply:non_contigous arrays + def time_einsum_noncon_sum_mul2(self, dtype): + np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) + + # scalar mul: non_contigous arrays + def time_einsum_noncon_mul(self, dtype): + np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) + + # contig_contig_outstride0_two: non_contigous arrays + def time_einsum_noncon_contig_contig(self, dtype): + np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) + + # sum_of_products_contig_outstride0_one:non_contigous arrays + def time_einsum_noncon_contig_outstride0(self, dtype): + np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index ef9bc79da325..b36cbcae01a6 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,7 +205,11 @@ 
npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ +#ifdef NPY_HAVE_NEON + return 0; +#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; +#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index c5e3dc70e0a5..b69a2c15d906 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -17,6 +17,14 @@ #include "common.h" #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +/** + * Unroll by four/eight scalars in case of: + * - The SIMD width is higher than 128bit since we unroll by x2/x4 + * and that may lead to performance loss on small arrays. + * - To give the chance to the compiler to + * auto-vectorize in case of NPYV wasn't available. + */ +#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat * #name = byte, short, int, long, longlong, @@ -240,13 +248,8 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && - EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif + EINSUM_IS_ALIGNED(data_out); const int vstep = npyv_nlanes_@sfx@; /**begin repeat2 @@ -290,19 +293,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -345,12 +344,7 @@ static void #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar); @@ -392,19 +386,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data1 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -442,12 +432,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar); @@ -489,19 +474,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -540,12 +521,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -579,20 +555,16 @@ static void npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum); vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -622,12 +594,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data1); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -658,20 +625,16 @@ static void npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); vaccum = npyv_add_@sfx@(b01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data1 += 4) { const @type@ b01 = @from@(data1[0]) + @from@(data1[1]); const @type@ b23 = @from@(data1[2]) + @from@(data1[3]); @@ -695,12 +658,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -731,20 +689,16 @@ static void npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); vaccum = npyv_add_@sfx@(a01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4) { const @type@ a01 = @from@(data0[0]) + @from@(data0[1]); const @type@ a23 = @from@(data0[2]) + @from@(data0[3]); @@ -871,12 +825,7 @@ static void NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -907,20 +856,16 @@ static void npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); vaccum = npyv_add_@sfx@(a01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) #if @complex@ for (; count > 4; count -= 4, data0 += 4*2) { const @temptype@ re01 = data0[0] + data0[2]; From 38f7382c3bc508b0ba96dd3b76e336b77f0dfca6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 11:59:15 +0800 Subject: [PATCH 20/27] avx2 test --- numpy/core/src/common/simd/simd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 2f39c8427b5d..4d20143534d4 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -14,7 +14,8 @@ #ifdef __cplusplus extern "C" { #endif - +#define NPY_HAVE_AVX2 +#include // lane type by intrin suffix typedef npy_uint8 npyv_lanetype_u8; typedef npy_int8 npyv_lanetype_s8; From c6c1e303d92e4769a811ea09d9eadc96ffef8119 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 18:05:34 +0800 Subject: [PATCH 21/27] explain for auto-vectorize part --- numpy/core/src/multiarray/common.h | 4 ---- numpy/core/src/multiarray/einsum_sumprod.c.src | 14 +++++++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b36cbcae01a6..ef9bc79da325 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,11 +205,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ -#ifdef NPY_HAVE_NEON - return 0; -#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; -#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index b69a2c15d906..072b4da2a5dc 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -16,11 +16,19 @@ #include "simd/simd.h" #include "common.h" -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 +#else + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#endif /** * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. + * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents + * the number of times the loop is unrolled, which may lead to + * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) + * if The SIMD width is higher than 128bit, The performance loss on remained + * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. * - To give the chance to the compiler to * auto-vectorize in case of NPYV wasn't available. 
*/ From f18ade4b08cf0db63d8897b86025ebaa516a2ed6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 18:12:38 +0800 Subject: [PATCH 22/27] add explanation --- numpy/core/src/multiarray/common.h | 4 ---- .../core/src/multiarray/einsum_sumprod.c.src | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b36cbcae01a6..ef9bc79da325 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,11 +205,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ -#ifdef NPY_HAVE_NEON - return 0; -#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; -#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index b69a2c15d906..10c8f7a9fb10 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -16,6 +16,25 @@ #include "simd/simd.h" #include "common.h" +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 +#else + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#endif + +/** + * Unroll by four/eight scalars in case of: + * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents + * the number of times the loop that unrolled, Eg: for float32, 2 simd loop + * is unrolled, for float64, 4 simd loop is unrolled, which may lead to + * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) + * if The SIMD width is higher than 128bit, The performance loss on remained + * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. + * - To give the chance to the compiler to + * auto-vectorize in case of NPYV wasn't available. + */ + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) /** * Unroll by four/eight scalars in case of: From 33b7d2a129369539799e586162399c584b16a88f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sat, 19 Sep 2020 09:04:43 +0800 Subject: [PATCH 23/27] remove duplicated message --- numpy/core/src/multiarray/einsum_sumprod.c.src | 9 --------- 1 file changed, 9 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 10c8f7a9fb10..777b24c8b01a 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -34,15 +34,6 @@ * - To give the chance to the compiler to * auto-vectorize in case of NPYV wasn't available. */ - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) -/** - * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the chance to the compiler to - * auto-vectorize in case of NPYV wasn't available.
- */ #define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat From 5a692ed2be497050fcad732cff682c49837eccb7 Mon Sep 17 00:00:00 2001 From: Chunlin Date: Tue, 29 Sep 2020 15:26:29 +0800 Subject: [PATCH 24/27] Update benchmarks/benchmarks/bench_linalg.py Co-authored-by: Eric Wieser --- benchmarks/benchmarks/bench_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 602a0cb6bb98..a72cccb5f7ed 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -177,4 +177,4 @@ def time_einsum_noncon_contig_contig(self, dtype): # sum_of_products_contig_outstride0_one:non_contigous arrays def time_einsum_noncon_contig_outstride0(self, dtype): - np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file + np.einsum("i->", self.non_contigous_dim1, optimize=True) From 20d5cdae0c19345c3e6825394a686b48d8983acc Mon Sep 17 00:00:00 2001 From: Chunlin Date: Wed, 30 Sep 2020 10:31:00 +0800 Subject: [PATCH 25/27] Update numpy/core/src/multiarray/einsum_sumprod.c.src Co-authored-by: Eric Wieser --- .../core/src/multiarray/einsum_sumprod.c.src | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 777b24c8b01a..d347a69a1322 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -24,15 +24,17 @@ #endif /** - * Unroll by four/eight scalars in case of: - * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents - * the number of times the loop that unrolled, Eg: for float32, 2 simd loop - * is unrolled, for float64, 4 simd loop is unrolled, which may lead to - * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) - * if The SIMD width is higher than 128bit, The performance loss on remained - * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. - * - To give the chance to the compiler to - * auto-vectorize in case of NPYV wasn't available. + * This macro is used to enable a scalar loop which advances 4 elements at a + * time, which appears after a main SIMD loop gated by `CHK` that unrolls by + * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop + * that finishes up all the remaining scalars. The purpose of the unrolled loop + * is to enable auto-vectorization in cases when all of the following are true: + * + * - optimization is allowed + * - either: + * - we did not run the SIMD loop at all, due to NPV being disabled. + * - the SIMD loop was larger than 128bit, so there are likely to be many + * elements left to process. */ #define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) From 7ff73245031cfd1004082a657f83d4e9f3751b9d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 12 Oct 2020 09:47:14 +0800 Subject: [PATCH 26/27] fix typos --- numpy/core/src/common/npy_cpu_dispatch.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h index 274520852569..a0f82fa3da05 100644 --- a/numpy/core/src/common/npy_cpu_dispatch.h +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -17,7 +17,7 @@ * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispaٍtch__`. 
*/ /** - * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION', + * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION', * due the nature of command argument '--disable-optimization', * which is explicitly disabling the module ccompiler_opt. */ @@ -29,7 +29,7 @@ * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead, * since c99 supports bool variables which may lead to ambiguous errors. */ - // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token. + // backup 'bool' before including '_cpu_dispatch.h', since it may not defined as a compiler token. #define NPY__DISPATCH_DEFBOOL typedef bool npy__dispatch_bkbool; #endif @@ -134,10 +134,10 @@ * NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*)) * NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE]) * - * By assuming the provided config header drived from a dispatch-able source, + * By assuming the provided config header derived from a dispatch-able source, * that configured with "@targets baseline sse41 vsx3 asimdhp", * they supported by the compiler and enabled via '--cpu-dspatch', - * then the prototype declrations at the above example will equlivent to the follows: + * then the prototype declrations at the above example will equivalent to the follows: * * - x86: * void dispatch_me(const int*, int*); // baseline @@ -179,7 +179,7 @@ /** * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...) * - * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even + * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declaration even * if it was provided within the configration statments. */ #define NPY_CPU_DISPATCH_DECLARE_XB(...) \ @@ -206,7 +206,7 @@ * In order to call or to assign the pointer of it from outside the dispatch-able source, * you have to use this Macro as follows: * - * // bring the genreated config header of the dispatch-abel source + * // bring the generated config header of the dispatch-able source * #ifndef NPY_DISABLE_OPTIMIZATION * #include "dispatchable_source_name.dispatch.h" * #endif From 73f61c33775a394d72857298b932df5821acb454 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 13 Oct 2020 09:56:20 +0800 Subject: [PATCH 27/27] remove extra test --- numpy/core/src/common/simd/simd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 4d20143534d4..2f39c8427b5d 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -14,8 +14,7 @@ #ifdef __cplusplus extern "C" { #endif -#define NPY_HAVE_AVX2 -#include + // lane type by intrin suffix typedef npy_uint8 npyv_lanetype_u8; typedef npy_int8 npyv_lanetype_s8;
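
For reference, once the /**begin repeat ... end repeat**/ templates in einsum_sumprod.c.src are expanded, each kernel in this series becomes straight-line universal-intrinsic code. The sketch below approximates the float32 instance of the @unroll_by@ == 4 sum-of-products loop together with the scalar tail that EINSUM_UNROLL_4_SCALARS is aimed at. It is an illustration only, not the generated source: the function name sum_of_products_contig_two_sketch is invented, the guard is simplified to NPY_SIMD, and the unaligned npyv_load/store_f32 variants are used throughout, whereas the real kernels pick the aligned forms when EINSUM_IS_ALIGNED holds.

    #include <numpy/npy_common.h>
    #include "simd/simd.h"  /* npyv_* universal intrinsics used by this series */

    /* out[i] += a[i] * b[i] over `count` contiguous float32 elements */
    static void
    sum_of_products_contig_two_sketch(npy_float *data0, npy_float *data1,
                                      npy_float *data_out, npy_intp count)
    {
    #if NPY_SIMD
        const int vstep = npyv_nlanes_f32;   /* lanes per vector */
        const npy_intp vstepx4 = vstep * 4;  /* four vectors per iteration */
        /* main SIMD loop, unrolled by four vectors; loads, muladds and
         * stores are grouped as in the repeat3 expansion of PATCH 11 */
        for (; count >= vstepx4; count -= vstepx4,
               data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            npyv_f32 a0 = npyv_load_f32(data0);
            npyv_f32 a1 = npyv_load_f32(data0 + vstep);
            npyv_f32 a2 = npyv_load_f32(data0 + vstep*2);
            npyv_f32 a3 = npyv_load_f32(data0 + vstep*3);
            npyv_f32 b0 = npyv_load_f32(data1);
            npyv_f32 b1 = npyv_load_f32(data1 + vstep);
            npyv_f32 b2 = npyv_load_f32(data1 + vstep*2);
            npyv_f32 b3 = npyv_load_f32(data1 + vstep*3);
            npyv_f32 c0 = npyv_load_f32(data_out);
            npyv_f32 c1 = npyv_load_f32(data_out + vstep);
            npyv_f32 c2 = npyv_load_f32(data_out + vstep*2);
            npyv_f32 c3 = npyv_load_f32(data_out + vstep*3);
            /* npyv_muladd maps to a fused multiply-add (FMA3/VFPV4/VSX)
             * when the target has one, otherwise to a mul followed by add */
            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
            npyv_store_f32(data_out,           abc0);
            npyv_store_f32(data_out + vstep,   abc1);
            npyv_store_f32(data_out + vstep*2, abc2);
            npyv_store_f32(data_out + vstep*3, abc3);
        }
        npyv_cleanup();  /* no-op on most targets; clears AVX upper state where needed */
    #endif
        /* scalar tail; on targets wider than 128 bits this range can be
         * sizeable, which is what motivates the extra 4-scalar unrolling
         * gated behind EINSUM_UNROLL_4_SCALARS in the real kernels */
        for (; count > 0; --count, ++data0, ++data1, ++data_out) {
            *data_out += *data0 * *data1;
        }
    }

Grouping all the loads, then all the muladds, then all the stores, instead of keeping the rolled for (int i = 0; i < 4; i++) body under #pragma GCC ivdep, is the change PATCH 11 describes as a revert to faster SIMD code: the grouped form does not depend on the compiler proving the iterations independent.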