diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index dc2849d58380..a72cccb5f7ed 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -108,26 +108,73 @@ def time_numpy_linalg_lstsq_a__b_float64(self):
 
 class Einsum(Benchmark):
     param_names = ['dtype']
-    params = [[np.float64]]
+    params = [[np.float32, np.float64]]
 
     def setup(self, dtype):
-        self.a = np.arange(2900, dtype=dtype)
-        self.b = np.arange(3000, dtype=dtype)
-        self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
-        self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40)
-        self.d = np.arange(10000, dtype=dtype).reshape(10,100,10)
-
-        #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
+        self.one_dim_small = np.arange(600, dtype=dtype)
+        self.one_dim = np.arange(3000, dtype=dtype)
+        self.one_dim_big = np.arange(480000, dtype=dtype)
+        self.two_dim_small = np.arange(1200, dtype=dtype).reshape(30, 40)
+        self.two_dim = np.arange(240000, dtype=dtype).reshape(400, 600)
+        self.three_dim_small = np.arange(10000, dtype=dtype).reshape(10, 100, 10)
+        self.three_dim = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
+        # non-contiguous arrays
+        self.non_contiguous_dim1_small = np.arange(1, 80, 2, dtype=dtype)
+        self.non_contiguous_dim1 = np.arange(1, 4000, 2, dtype=dtype)
+        self.non_contiguous_dim2 = np.arange(1, 2400, 2, dtype=dtype).reshape(30, 40)
+        self.non_contiguous_dim3 = np.arange(1, 48000, 2, dtype=dtype).reshape(20, 30, 40)
+
+    # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
     def time_einsum_outer(self, dtype):
-        np.einsum("i,j", self.a, self.b, optimize=True)
+        np.einsum("i,j", self.one_dim, self.one_dim, optimize=True)
 
     # multiply(a, b):trigger sum_of_products_contig_two
     def time_einsum_multiply(self, dtype):
-        np.einsum("..., ...", self.c1, self.c , optimize=True)
+        np.einsum("..., ...", self.two_dim_small, self.three_dim, optimize=True)
 
     # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two
     def time_einsum_sum_mul(self, dtype):
-        np.einsum(",i...->", 300, self.d, optimize=True)
+        np.einsum(",i...->", 300, self.three_dim_small, optimize=True)
 
     # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two
     def time_einsum_sum_mul2(self, dtype):
-        np.einsum("i...,->", self.d, 300, optimize=True)
\ No newline at end of file
+        np.einsum("i...,->", self.three_dim_small, 300, optimize=True)
+
+    # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two
+    def time_einsum_mul(self, dtype):
+        np.einsum("i,->i", self.one_dim_big, 300, optimize=True)
+
+    # trigger contig_contig_outstride0_two
+    def time_einsum_contig_contig(self, dtype):
+        np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True)
+
+    # trigger sum_of_products_contig_outstride0_one
+    def time_einsum_contig_outstride0(self, dtype):
+        np.einsum("i->", self.one_dim_big, optimize=True)
+
+    # outer(a,b): non-contiguous arrays
+    def time_einsum_noncon_outer(self, dtype):
+        np.einsum("i,j", self.non_contiguous_dim1, self.non_contiguous_dim1, optimize=True)
+
+    # multiply(a, b): non-contiguous arrays
+    def time_einsum_noncon_multiply(self, dtype):
+        np.einsum("..., ...", self.non_contiguous_dim2, self.non_contiguous_dim3, optimize=True)
+
+    # sum and multiply: non-contiguous arrays
+    def time_einsum_noncon_sum_mul(self, dtype):
+        np.einsum(",i...->", 300, self.non_contiguous_dim3, optimize=True)
+
+    # sum and multiply: non-contiguous arrays
+    def time_einsum_noncon_sum_mul2(self, dtype):
+        np.einsum("i...,->", self.non_contiguous_dim3, 300, optimize=True)
+
+    # scalar mul: non-contiguous arrays
+    def time_einsum_noncon_mul(self, dtype):
+        np.einsum("i,->i", self.non_contiguous_dim1, 300, optimize=True)
+
+    # contig_contig_outstride0_two: non-contiguous arrays
+    def time_einsum_noncon_contig_contig(self, dtype):
+        np.einsum("ji,i->", self.non_contiguous_dim2, self.non_contiguous_dim1_small, optimize=True)
+
+    # sum_of_products_contig_outstride0_one: non-contiguous arrays
+    def time_einsum_noncon_contig_outstride0(self, dtype):
+        np.einsum("i->", self.non_contiguous_dim1, optimize=True)
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 274520852569..a0f82fa3da05 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -17,7 +17,7 @@
  * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispatch__`.
  */
 /**
- * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
  * due the nature of command argument '--disable-optimization',
  * which is explicitly disabling the module ccompiler_opt.
  */
@@ -29,7 +29,7 @@
  * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
  * since c99 supports bool variables which may lead to ambiguous errors.
  */
-    // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token.
+    // backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
     #define NPY__DISPATCH_DEFBOOL
     typedef bool npy__dispatch_bkbool;
 #endif
@@ -134,10 +134,10 @@
 *   NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
 *   NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
 *
- * By assuming the provided config header drived from a dispatch-able source,
+ * By assuming the provided config header is derived from a dispatch-able source,
 * that configured with "@targets baseline sse41 vsx3 asimdhp",
 * they supported by the compiler and enabled via '--cpu-dspatch',
- * then the prototype declrations at the above example will equlivent to the follows:
+ * then the prototype declarations in the above example will be equivalent to the following:
 *
 * - x86:
 *   void dispatch_me(const int*, int*); // baseline
@@ -179,7 +179,7 @@
 /**
  * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
  *
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but excludes the baseline declaration even
  * if it was provided within the configration statments.
  */
 #define NPY_CPU_DISPATCH_DECLARE_XB(...) \
@@ -206,7 +206,7 @@
  * In order to call or to assign the pointer of it from outside the dispatch-able source,
  * you have to use this Macro as follows:
  *
- *   // bring the genreated config header of the dispatch-abel source
+ *   // bring the generated config header of the dispatch-able source
 *   #ifndef NPY_DISABLE_OPTIMIZATION
 *   #include "dispatchable_source_name.dispatch.h"
 *   #endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 4af9e4d1748a..f00d8e153fe4 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -72,6 +72,26 @@
 #define npyv_div_f32 _mm256_div_ps
 #define npyv_div_f64 _mm256_div_pd
 
+// Horizontal add: Calculates the sum of all vector elements.
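+// Each _mm256_hadd_ps works within the two 128-bit lanes, so after two hadds
+// every lane holds its own 4-element total; one extract plus a 128-bit add
+// then combines the lane totals. Up to floating-point reassociation this is
+// equivalent to the scalar loop:
+//     float s = 0.0f;
+//     for (int i = 0; i < 8; ++i)
+//         s += a[i];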
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+    __m256 sum_halves = _mm256_hadd_ps(a, a);
+    sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
+    __m128 lo = _mm256_castps256_ps128(sum_halves);
+    __m128 hi = _mm256_extractf128_ps(sum_halves, 1);
+    __m128 sum = _mm_add_ps(lo, hi);
+    return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+    __m256d sum_halves = _mm256_hadd_pd(a, a);
+    __m128d lo = _mm256_castpd256_pd128(sum_halves);
+    __m128d hi = _mm256_extractf128_pd(sum_halves, 1);
+    __m128d sum = _mm_add_pd(lo, hi);
+    return _mm_cvtsd_f64(sum);
+}
+
 /***************************
  * FUSED
  ***************************/
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 824ae818ee3a..39d93be257d3 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -113,6 +113,49 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 #define npyv_div_f32 _mm512_div_ps
 #define npyv_div_f64 _mm512_div_pd
 
+/***************************
+ * Reduce Sum
+ * There are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects.
+ *
+ * The second one is used by the Intel compiler, maybe because the latency
+ * of hadd increased by 2-3 cycles starting from Skylake-X, which makes the
+ * two extra (non-cross) shuffles cheaper. See https://godbolt.org/z/s3G9Er
+ * for more info.
+ *
+ * The third one is almost the same as the second, but it only works with the
+ * Intel compiler, GCC >= 7.1 and Clang >= 4; we still need to support older GCC.
+ ***************************/
+#ifdef NPY_HAVE_AVX512F_REDUCE
+    #define npyv_sum_f32 _mm512_reduce_add_ps
+    #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512 sum32 = _mm512_add_ps(a, h64);
+        __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum16 = _mm512_add_ps(sum32, h32);
+        __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum8 = _mm512_add_ps(sum16, h16);
+        __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512 sum4 = _mm512_add_ps(sum8, h4);
+        return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+    }
+    NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+    {
+        __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512d sum32 = _mm512_add_pd(a, h64);
+        __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512d sum16 = _mm512_add_pd(sum32, h32);
+        __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512d sum8 = _mm512_add_pd(sum16, h16);
+        return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+    }
+#endif
+
 /***************************
  * FUSED
  ***************************/
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 5eeee1bb6d02..ff31311d5dcf 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -75,6 +75,19 @@
 #endif
 #define npyv_div_f64 vdivq_f64
 
+// Horizontal add: Calculates the sum of all vector elements.
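+// (fold the high 64-bit half onto the low half, then one pairwise add
+// finishes the two remaining lanes; on AArch64 a single vaddvq_f32 could be
+// used instead, but the pairwise form also builds on ARMv7)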
+NPY_FINLINE float npyv_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +#ifdef __aarch64__ + NPY_FINLINE double npyv_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + /*************************** * FUSED F32 ***************************/ diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 717dacd39f8d..e1e158ff41df 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -91,6 +91,31 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // TODO: emulate integer division #define npyv_div_f32 _mm_div_ps #define npyv_div_f64 _mm_div_pd + +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m128 a) +{ +#ifdef NPY_HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +#endif +} + +NPY_FINLINE double npyv_sum_f64(__m128d a) +{ +#ifdef NPY_HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif +} + /*************************** * FUSED ***************************/ diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 6ef007676d03..5454b2eef2fc 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -100,6 +100,18 @@ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1) + + vec_extract(a, 2) + vec_extract(a, 3); +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1); +} + /*************************** * FUSED ***************************/ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index c58e742874d0..d347a69a1322 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -10,38 +10,33 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE - -#include -#include /* for NPY_NTYPES */ #include - #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 #else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) #endif -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/**********************************************/ +/** + * This macro is used to enable a scalar loop which advances 4 elements at a + * time, which appears after a main SIMD loop gated by `CHK` that unrolls by + * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop + * that finishes up all the remaining scalars. 
The purpose of the unrolled loop + * is to enable auto-vectorization in cases when all of the following are true: + * + * - optimization is allowed + * - either: + * - we did not run the SIMD loop at all, due to NPV being disabled. + * - the SIMD loop was larger than 128bit, so there are likely to be many + * elements left to process. + */ +#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat * #name = byte, short, int, long, longlong, @@ -56,6 +51,10 @@ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, * npy_float, npy_float, npy_double, npy_longdouble, * npy_float, npy_double, npy_longdouble# +* #sfx = s8, s16, s32, long, s64, + * u8, u16, u32, ulong, u64, + * half, f32, f64, longdouble, + * f32, f64, clongdouble# * #to = ,,,,, * ,,,,, * npy_float_to_half,,,, @@ -76,8 +75,15 @@ * 0*5, * 0,0,1,0, * 0*3# + * #NPYV_CHK = 0*5, + * 0*5, + * 0, NPY_SIMD, NPY_SIMD_F64, 0, + * 0*3# + * #unroll_by = 0*5, + * 0*5, + * 0,2, 4, 0, + * 0*3# */ - /**begin repeat1 * #nop = 1, 2, 3, 1000# * #noplabel = one, two, three, any# @@ -250,115 +256,90 @@ static void @type@ *data0 = (@type@ *)dataptr[0]; @type@ *data1 = (@type@ *)dataptr[1]; @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && + EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 
8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, b0, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, b1, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif + } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ a@i@ = @from@(data0[@i@]); + const @type@ b@i@ = @from@(data1[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a@i@ * b@i@ + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; + for (; count > 0; --count, ++data0, ++data1, ++data_out) { + const @type@ a = @from@(*data0); + const @type@ b = @from@(*data1); + const @type@ c = @from@(*data_out); + *data_out = @to@(a * b + c); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; } /* Some extra specializations for the two operand case */ @@ -366,128 +347,89 @@ static void @name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, npy_intp const *NPY_UNUSED(strides), npy_intp count) { - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); + @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]); @type@ *data1 = (@type@ *)dataptr[1]; @type@ *data_out = (@type@ *)dataptr[2]; -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", (int)count); -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - 
@from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; + const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - else { - return; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(va_scalar, b0, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(va_scalar, b1, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = _mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data1 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ b@i@ = @from@(data1[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a_scalar * b@i@ + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = 
_mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; + for (; count > 0; --count, ++data1, ++data_out) { + const @type@ b = @from@(*data1); + const @type@ c = @from@(*data_out); + *data_out = @to@(a_scalar * b + c); } } @@ -496,116 +438,87 @@ static void npy_intp const *NPY_UNUSED(strides), npy_intp count) { @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); + @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]); @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out); + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - 
data_out += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out); + npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep); + npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, vb_scalar, c0); + npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, vb_scalar, c1); + npyv_@st@_@sfx@(data_out, abc0); + npyv_@st@_@sfx@(data_out + vstep, abc1); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif + } + /**end repeat2**/ + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + for (; count >= 4; count -= 4, data0 += 4, data_out += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ a@i@ = @from@(data0[@i@]); + const @type@ c@i@ = @from@(data_out[@i@]); + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ abc@i@ = a@i@ * b_scalar + c@i@; + /**end repeat2**/ + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + data_out[@i@] = @to@(abc@i@); + /**end repeat2**/ } #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; + for (; count > 0; --count, ++data0, ++data_out) { + const @type@ a = @from@(*data0); + const @type@ c = @from@(*data_out); + *data_out = @to@(a * b_scalar + c); } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; } static void @@ -616,295 +529,134 @@ static void @type@ *data1 = (@type@ *)dataptr[1]; @temptype@ accum = 0; -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
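+    /*
+     * NOTE: Like the SSE code it replaces, the vectorized accumulation below
+     * reassociates the sum, so results may differ slightly from the strict
+     * scalar evaluation order.
+     */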
+ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum); + npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3); + npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2); + vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum); + vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
-            a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
-            accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
-        data0 += 8;
-        data1 += 8;
+        const @temptype@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+        /**end repeat2**/
+        accum += ab0 + ab1 + ab2 + ab3;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data0, ++data1) {
+        const @temptype@ a = @from@(*data0);
+        const @temptype@ b = @from@(*data1);
+        accum += a * b;
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
 }
 
 static void
 @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
-    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
     @type@ *data1 = (@type@ *)dataptr[1];
     @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data1[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data1)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
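+    /*
+     * Only the contiguous operand data1 is summed in the vector loop; the
+     * stride-0 operand a_scalar is factored out and applied once, after the
+     * horizontal reduction.
+     */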
+ const int is_aligned = EINSUM_IS_ALIGNED(data1); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); + npyv_@sfx@ b23 = npyv_add_@sfx@(b2, b3); + npyv_@sfx@ b0123 = npyv_add_@sfx@(b01, b23); + vaccum = npyv_add_@sfx@(b0123, vaccum); } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) { + npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); + npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); + npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); + vaccum = npyv_add_@sfx@(b01, vaccum); } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */
-            accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
-        data1 += 8;
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data1 += 4) {
+        const @temptype@ b01 = @from@(data1[0]) + @from@(data1[1]);
+        const @temptype@ b23 = @from@(data1[2]) + @from@(data1[3]);
+        accum += b01 + b23;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data1) {
+        accum += @from@(*data1);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + a_scalar * accum);
 }
 
 static void
 @name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
     @type@ *data0 = (@type@ *)dataptr[0];
-    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
     @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data0[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
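+    /*
+     * Mirror image of the kernel above: data0 is summed in vector lanes and
+     * the stride-0 operand b_scalar multiplies the total once at the end.
+     */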
+ const int is_aligned = EINSUM_IS_ALIGNED(data0); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); + npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); + vaccum = npyv_add_@sfx@(a0123, vaccum); } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + vaccum = npyv_add_@sfx@(a01, vaccum); } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */
-            accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
-        data0 += 8;
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#endif // NPYV check for @type@
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+    for (; count >= 4; count -= 4, data0 += 4) {
+        const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]);
+        const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]);
+        accum += a01 + a23;
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
 #endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    for (; count > 0; --count, ++data0) {
+        accum += @from@(*data0);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + b_scalar * accum);
 }
 
 #elif @nop@ == 3 && !@complex@
@@ -1155,167 +834,80 @@ static void
     @type@ *data0 = (@type@ *)dataptr[0];
 #endif
 
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
-    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
-                            (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-#if !@complex@
-            accum += @from@(data0[@i@]);
-#else /* complex */
-            accum_re += data0[2*@i@+0];
-            accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
-        case 0:
-#if @complex@
-            ((@temptype@ *)dataptr[1])[0] += accum_re;
-            ((@temptype@ *)dataptr[1])[1] += accum_im;
-#else
-            *((@type@ *)dataptr[1]) = @to@(accum +
-                                    @from@(*((@type@ *)dataptr[1])));
-#endif
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-            _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             * produce slightly different results.
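+    /*
+     * Plain sum reduction (@NPYV_CHK@ expands to zero for the complex types,
+     * so complex data is handled entirely by the scalar loops below).
+     */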
+ const int is_aligned = EINSUM_IS_ALIGNED(data0); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + #if @unroll_by@ == 4 + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); + npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); + vaccum = npyv_add_@sfx@(a0123, vaccum); } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; + #elif @unroll_by@ == 2 + const npy_intp vstepx2 = vstep * 2; + for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { + npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); + npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + vaccum = npyv_add_@sfx@(a01, vaccum); } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + #else + #error "Invalid unroll_by = @unroll_by@" + #endif } + /**end repeat2**/ + accum = npyv_sum_@sfx@(vaccum); + npyv_cleanup(); +#endif // NPYV check for @type@ + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) + #if @complex@ + for (; count > 4; count -= 4, data0 += 4*2) { + const @temptype@ re01 = data0[0] + data0[2]; + const @temptype@ re23 = data0[4] + data0[6]; + const @temptype@ im13 = data0[1] + data0[3]; + const @temptype@ im57 = data0[5] + data0[7]; + accum_re += re01 + re23; + accum_im += im13 + im57; + } + #else + for (; count > 4; count -= 4, data0 += 4) { + const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]); + const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]); + accum += a01 + a23; + } + #endif // complex #endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; +#if @complex@ + for (; count > 0; --count, data0 += 2) { + accum_re += data0[0]; + accum_im += data0[1]; + } + ((@temptype@ *)dataptr[1])[0] += accum_re; + ((@temptype@ *)dataptr[1])[1] += accum_im; #else - data0 += 8*2; -#endif + for (; count > 0; --count, ++data0) { + accum += @from@(*data0); } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1]))); +#endif // complex } #endif /* @nop@ == 1 */