diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index dc2849d58380..a72cccb5f7ed 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -108,26 +108,73 @@ def time_numpy_linalg_lstsq_a__b_float64(self):
 
 class Einsum(Benchmark):
     param_names = ['dtype']
-    params = [[np.float64]]
+    params = [[np.float32, np.float64]]
 
     def setup(self, dtype):
-        self.a = np.arange(2900, dtype=dtype)
-        self.b = np.arange(3000, dtype=dtype)
-        self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
-        self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40)
-        self.d = np.arange(10000, dtype=dtype).reshape(10,100,10)
-
-        #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
+        self.one_dim_small = np.arange(600, dtype=dtype)
+        self.one_dim = np.arange(3000, dtype=dtype)
+        self.one_dim_big = np.arange(480000, dtype=dtype)
+        self.two_dim_small = np.arange(1200, dtype=dtype).reshape(30, 40)
+        self.two_dim = np.arange(240000, dtype=dtype).reshape(400, 600)
+        self.three_dim_small = np.arange(10000, dtype=dtype).reshape(10, 100, 10)
+        self.three_dim = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
+        # non-contiguous arrays
+        self.non_contiguous_dim1_small = np.arange(1, 80, 2, dtype=dtype)
+        self.non_contiguous_dim1 = np.arange(1, 4000, 2, dtype=dtype)
+        self.non_contiguous_dim2 = np.arange(1, 2400, 2, dtype=dtype).reshape(30, 40)
+        self.non_contiguous_dim3 = np.arange(1, 48000, 2, dtype=dtype).reshape(20, 30, 40)
+
+    # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
     def time_einsum_outer(self, dtype):
-        np.einsum("i,j", self.a, self.b, optimize=True)
+        np.einsum("i,j", self.one_dim, self.one_dim, optimize=True)
 
     # multiply(a, b):trigger sum_of_products_contig_two
     def time_einsum_multiply(self, dtype):
-        np.einsum("..., ...", self.c1, self.c , optimize=True)
+        np.einsum("..., ...", self.two_dim_small, self.three_dim, optimize=True)
 
     # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two
     def time_einsum_sum_mul(self, dtype):
-        np.einsum(",i...->", 300, self.d, optimize=True)
+        np.einsum(",i...->", 300, self.three_dim_small, optimize=True)
 
     # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two
     def time_einsum_sum_mul2(self, dtype):
-        np.einsum("i...,->", self.d, 300, optimize=True)
\ No newline at end of file
+        np.einsum("i...,->", self.three_dim_small, 300, optimize=True)
+
+    # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two
+    def time_einsum_mul(self, dtype):
+        np.einsum("i,->i", self.one_dim_big, 300, optimize=True)
+
+    # trigger contig_contig_outstride0_two
+    def time_einsum_contig_contig(self, dtype):
+        np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True)
+
+    # trigger sum_of_products_contig_outstride0_one
+    def time_einsum_contig_outstride0(self, dtype):
+        np.einsum("i->", self.one_dim_big, optimize=True)
+
+    # outer(a,b): non-contiguous arrays
+    def time_einsum_noncon_outer(self, dtype):
+        np.einsum("i,j", self.non_contiguous_dim1, self.non_contiguous_dim1, optimize=True)
+
+    # multiply(a, b): non-contiguous arrays
+    def time_einsum_noncon_multiply(self, dtype):
+        np.einsum("..., ...", self.non_contiguous_dim2, self.non_contiguous_dim3, optimize=True)
+
+    # sum and multiply: non-contiguous arrays
+    def time_einsum_noncon_sum_mul(self, dtype):
+        np.einsum(",i...->", 300, self.non_contiguous_dim3, optimize=True)
+
+    # sum and multiply: non-contiguous arrays
+    def time_einsum_noncon_sum_mul2(self, dtype):
+        np.einsum("i...,->", self.non_contiguous_dim3, 300, optimize=True)
+
+    # scalar mul: non-contiguous arrays
+    def time_einsum_noncon_mul(self, dtype):
+        np.einsum("i,->i", self.non_contiguous_dim1, 300, optimize=True)
+
+    # contig_contig_outstride0_two: non-contiguous arrays
+    def time_einsum_noncon_contig_contig(self, dtype):
+        np.einsum("ji,i->", self.non_contiguous_dim2, self.non_contiguous_dim1_small, optimize=True)
+
+    # sum_of_products_contig_outstride0_one: non-contiguous arrays
+    def time_einsum_noncon_contig_outstride0(self, dtype):
+        np.einsum("i->", self.non_contiguous_dim1, optimize=True)
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 274520852569..a0f82fa3da05 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -17,7 +17,7 @@
  * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispatch__`.
  */
 /**
- * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
  * due the nature of command argument '--disable-optimization',
  * which is explicitly disabling the module ccompiler_opt.
  */
@@ -29,7 +29,7 @@
  * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
  * since c99 supports bool variables which may lead to ambiguous errors.
  */
-    // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token.
+    // backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
     #define NPY__DISPATCH_DEFBOOL
     typedef bool npy__dispatch_bkbool;
 #endif
@@ -134,10 +134,10 @@
 *   NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
 *   NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
 *
- * By assuming the provided config header drived from a dispatch-able source,
+ * By assuming the provided config header is derived from a dispatch-able source,
 * that configured with "@targets baseline sse41 vsx3 asimdhp",
 * they supported by the compiler and enabled via '--cpu-dspatch',
- * then the prototype declrations at the above example will equlivent to the follows:
+ * then the prototype declarations in the above example will be equivalent to the following:
 *
 * - x86:
 *   void dispatch_me(const int*, int*); // baseline
@@ -179,7 +179,7 @@
 /**
  * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
  *
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but excludes the baseline declaration even
  * if it was provided within the configration statments.
  */
 #define NPY_CPU_DISPATCH_DECLARE_XB(...) \
@@ -206,7 +206,7 @@
  * In order to call or to assign the pointer of it from outside the dispatch-able source,
  * you have to use this Macro as follows:
  *
- *   // bring the genreated config header of the dispatch-abel source
+ *   // bring the generated config header of the dispatch-able source
 *   #ifndef NPY_DISABLE_OPTIMIZATION
 *   #include "dispatchable_source_name.dispatch.h"
 *   #endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 4af9e4d1748a..f00d8e153fe4 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -72,6 +72,26 @@
 #define npyv_div_f32 _mm256_div_ps
 #define npyv_div_f64 _mm256_div_pd
 
+// Horizontal add: Calculates the sum of all vector elements.
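+// Each _mm256_hadd_ps works within the two 128-bit lanes, so after two hadds
+// every lane holds its own 4-element total; one extract plus a 128-bit add
+// then combines the lane totals. Up to floating-point reassociation this is
+// equivalent to the scalar loop:
+//     float s = 0.0f;
+//     for (int i = 0; i < 8; ++i)
+//         s += a[i];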
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+    __m256 sum_halves = _mm256_hadd_ps(a, a);
+    sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
+    __m128 lo = _mm256_castps256_ps128(sum_halves);
+    __m128 hi = _mm256_extractf128_ps(sum_halves, 1);
+    __m128 sum = _mm_add_ps(lo, hi);
+    return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+    __m256d sum_halves = _mm256_hadd_pd(a, a);
+    __m128d lo = _mm256_castpd256_pd128(sum_halves);
+    __m128d hi = _mm256_extractf128_pd(sum_halves, 1);
+    __m128d sum = _mm_add_pd(lo, hi);
+    return _mm_cvtsd_f64(sum);
+}
+
 /***************************
  * FUSED
  ***************************/
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 824ae818ee3a..39d93be257d3 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -113,6 +113,49 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 #define npyv_div_f32 _mm512_div_ps
 #define npyv_div_f64 _mm512_div_pd
 
+/***************************
+ * Reduce Sum
+ * There are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects.
+ *
+ * The second one is used by the Intel compiler, maybe because the latency
+ * of hadd increased by 2-3 cycles starting from Skylake-X, which makes the
+ * two extra (non-cross) shuffles cheaper. See https://godbolt.org/z/s3G9Er
+ * for more info.
+ *
+ * The third one is almost the same as the second, but it only works with the
+ * Intel compiler, GCC >= 7.1 and Clang >= 4; we still need to support older GCC.
+ ***************************/
+#ifdef NPY_HAVE_AVX512F_REDUCE
+    #define npyv_sum_f32 _mm512_reduce_add_ps
+    #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512 sum32 = _mm512_add_ps(a, h64);
+        __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum16 = _mm512_add_ps(sum32, h32);
+        __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum8 = _mm512_add_ps(sum16, h16);
+        __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512 sum4 = _mm512_add_ps(sum8, h4);
+        return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+    }
+    NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+    {
+        __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512d sum32 = _mm512_add_pd(a, h64);
+        __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512d sum16 = _mm512_add_pd(sum32, h32);
+        __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512d sum8 = _mm512_add_pd(sum16, h16);
+        return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+    }
+#endif
+
 /***************************
  * FUSED
  ***************************/
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 5eeee1bb6d02..ff31311d5dcf 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -75,6 +75,19 @@
 #endif
 #define npyv_div_f64 vdivq_f64
 
+// Horizontal add: Calculates the sum of all vector elements.
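+// (fold the high 64-bit half onto the low half, then one pairwise add
+// finishes the two remaining lanes; on AArch64 a single vaddvq_f32 could be
+// used instead, but the pairwise form also builds on ARMv7)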
+NPY_FINLINE float npyv_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +#ifdef __aarch64__ + NPY_FINLINE double npyv_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + /*************************** * FUSED F32 ***************************/ diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 717dacd39f8d..e1e158ff41df 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -91,6 +91,31 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // TODO: emulate integer division #define npyv_div_f32 _mm_div_ps #define npyv_div_f64 _mm_div_pd + +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m128 a) +{ +#ifdef NPY_HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +#endif +} + +NPY_FINLINE double npyv_sum_f64(__m128d a) +{ +#ifdef NPY_HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif +} + /*************************** * FUSED ***************************/ diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 6ef007676d03..5454b2eef2fc 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -100,6 +100,18 @@ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1) + + vec_extract(a, 2) + vec_extract(a, 3); +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1); +} + /*************************** * FUSED ***************************/ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index c58e742874d0..d347a69a1322 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -10,38 +10,33 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE - -#include -#include /* for NPY_NTYPES */ #include - #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 #else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) #endif -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/**********************************************/ +/** + * This macro is used to enable a scalar loop which advances 4 elements at a + * time, which appears after a main SIMD loop gated by `CHK` that unrolls by + * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop + * that finishes up all the remaining scalars. 
The purpose of the unrolled loop + * is to enable auto-vectorization in cases when all of the following are true: + * + * - optimization is allowed + * - either: + * - we did not run the SIMD loop at all, due to NPV being disabled. + * - the SIMD loop was larger than 128bit, so there are likely to be many + * elements left to process. + */ +#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat * #name = byte, short, int, long, longlong, @@ -56,6 +51,10 @@ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, * npy_float, npy_float, npy_double, npy_longdouble, * npy_float, npy_double, npy_longdouble# +* #sfx = s8, s16, s32, long, s64, + * u8, u16, u32, ulong, u64, + * half, f32, f64, longdouble, + * f32, f64, clongdouble# * #to = ,,,,, * ,,,,, * npy_float_to_half,,,, @@ -76,8 +75,15 @@ * 0*5, * 0,0,1,0, * 0*3# + * #NPYV_CHK = 0*5, + * 0*5, + * 0, NPY_SIMD, NPY_SIMD_F64, 0, + * 0*3# + * #unroll_by = 0*5, + * 0*5, + * 0,2, 4, 0, + * 0*3# */ - /**begin repeat1 * #nop = 1, 2, 3, 1000# * #noplabel = one, two, three, any# @@ -250,115 +256,90 @@ static void @type@ *data0 = (@type@ *)dataptr[0]; @type@ *data1 = (@type@ *)dataptr[1]; @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && + EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 
8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, b0, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, b1, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif + } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ a@i@ = @from@(data0[@i@]); + const @type@ b@i@ = @from@(data1[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a@i@ * b@i@ + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; + for (; count > 0; --count, ++data0, ++data1, ++data_out) { + const @type@ a = @from@(*data0); + const @type@ b = @from@(*data1); + const @type@ c = @from@(*data_out); + *data_out = @to@(a * b + c); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; } /* Some extra specializations for the two operand case */ @@ -366,128 +347,89 @@ static void @name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, npy_intp const *NPY_UNUSED(strides), npy_intp count) { - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); + @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]); @type@ *data1 = (@type@ *)dataptr[1]; @type@ *data_out = (@type@ *)dataptr[2]; -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", (int)count); -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - 
@from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; + const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - else { - return; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(va_scalar, b0, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(va_scalar, b1, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = _mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data1 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ b@i@ = @from@(data1[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a_scalar * b@i@ + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = 
_mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; + for (; count > 0; --count, ++data1, ++data_out) { + const @type@ b = @from@(*data1); + const @type@ c = @from@(*data_out); + *data_out = @to@(a_scalar * b + c); } } @@ -496,116 +438,87 @@ static void npy_intp const *NPY_UNUSED(strides), npy_intp count) { @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); + @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]); @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - 
data_out += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, vb_scalar, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, vb_scalar, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif + } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data0 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ a@i@ = @from@(data0[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a@i@ * b_scalar + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; + for (; count > 0; --count, ++data0, ++data_out) { + const @type@ a = @from@(*data0); + const @type@ c = @from@(*data_out); + *data_out = @to@(a * b_scalar + c); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; } static void @@ -616,295 +529,134 @@ static void @type@ *data1 = (@type@ *)dataptr[1]; @temptype@ accum = 0; -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
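+    /*
+     * NOTE: Like the SSE code it replaces, the vectorized accumulation below
+     * reassociates the sum, so results may differ slightly from the strict
+     * scalar evaluation order.
+     */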
+ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum); + npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3); + npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2); + vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum); + vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
-            a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
-            accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
-        data0 += 8;
-        data1 += 8;
+        const @temptype@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+        /**end repeat2**/
+        accum += ab0 + ab1 + ab2 + ab3;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data0, ++data1) {
+        const @temptype@ a = @from@(*data0);
+        const @temptype@ b = @from@(*data1);
+        accum += a * b;
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
 }
 
 static void
 @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
-    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
     @type@ *data1 = (@type@ *)dataptr[1];
     @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data1[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data1)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
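+    /*
+     * Only the contiguous operand data1 is summed in the vector loop; the
+     * stride-0 operand a_scalar is factored out and applied once, after the
+     * horizontal reduction.
+     */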
+ const int is_aligned = EINSUM_IS_ALIGNED(data1); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); + npyv_@sfx@ b23 = npyv_add_@sfx@(b2, b3); + npyv_@sfx@ b0123 = npyv_add_@sfx@(b01, b23); + vaccum = npyv_add_@sfx@(b0123, vaccum); } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) { + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); + vaccum = npyv_add_@sfx@(b01, vaccum); } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */
-            accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
-        data1 += 8;
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data1 += 4) {
+        const @temptype@ b01 = @from@(data1[0]) + @from@(data1[1]);
+        const @temptype@ b23 = @from@(data1[2]) + @from@(data1[3]);
+        accum += b01 + b23;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data1) {
+        accum += @from@(*data1);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + a_scalar * accum);
 }
 
 static void
 @name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
     @type@ *data0 = (@type@ *)dataptr[0];
-    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
     @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data0[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
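+    /*
+     * Mirror image of the kernel above: data0 is summed in vector lanes and
+     * the stride-0 operand b_scalar multiplies the total once at the end.
+     */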
+ const int is_aligned = EINSUM_IS_ALIGNED(data0); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); + npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); + vaccum = npyv_add_@sfx@(a0123, vaccum); } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + vaccum = npyv_add_@sfx@(a01, vaccum); } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */
-            accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
-        data0 += 8;
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data0 += 4) {
+        const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]);
+        const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]);
+        accum += a01 + a23;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data0) {
+        accum += @from@(*data0);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + b_scalar * accum);
 }
 
 #elif @nop@ == 3 && !@complex@
@@ -1155,167 +834,80 @@ static void
     @type@ *data0 = (@type@ *)dataptr[0];
 #endif
 
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
-    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
-                            (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-#if !@complex@
-            accum += @from@(data0[@i@]);
-#else /* complex */
-            accum_re += data0[2*@i@+0];
-            accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
-        case 0:
-#if @complex@
-            ((@temptype@ *)dataptr[1])[0] += accum_re;
-            ((@temptype@ *)dataptr[1])[1] += accum_im;
-#else
-            *((@type@ *)dataptr[1]) = @to@(accum +
-                                    @from@(*((@type@ *)dataptr[1])));
-#endif
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-            _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
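+    /*
+     * Plain sum reduction (@NPYV_CHK@ expands to zero for the complex types,
+     * so complex data is handled entirely by the scalar loops below).
+     */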
+ const int is_aligned = EINSUM_IS_ALIGNED(data0); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); + npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); + vaccum = npyv_add_@sfx@(a0123, vaccum); } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + vaccum = npyv_add_@sfx@(a01, vaccum); } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } + /**end repeat2**/ + accum = npyv_sum_@sfx@(vaccum); + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + #if @complex@ + for (; count > 4; count -= 4, data0 += 4*2) { + const @temptype@ re01 = data0[0] + data0[2]; + const @temptype@ re23 = data0[4] + data0[6]; + const @temptype@ im13 = data0[1] + data0[3]; + const @temptype@ im57 = data0[5] + data0[7]; + accum_re += re01 + re23; + accum_im += im13 + im57; + } + #else + for (; count > 4; count -= 4, data0 += 4) { + const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]); + const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]); + accum += a01 + a23; + } + #endif // complex #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; +#if @complex@ + for (; count > 0; --count, data0 += 2) { + accum_re += data0[0]; + accum_im += data0[1]; + } + ((@temptype@ *)dataptr[1])[0] += accum_re; + ((@temptype@ *)dataptr[1])[1] += accum_im; #else - data0 += 8*2; -#endif + for (; count > 0; --count, ++data0) { + accum += @from@(*data0); } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1]))); +#endif // complex } #endif /* @nop@ == 1 */