From e26dcf74c4cb5158dd9de8091f049d369b8b2361 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 12:10:37 +0800 Subject: [PATCH 01/27] new npyv intrinsics --- numpy/core/src/common/simd/avx2/arithmetic.h | 25 +++++++++++++++++++ .../core/src/common/simd/avx512/arithmetic.h | 25 +++++++++++++++++++ numpy/core/src/common/simd/neon/arithmetic.h | 19 ++++++++++++++ numpy/core/src/common/simd/sse/arithmetic.h | 24 ++++++++++++++++++ numpy/core/src/common/simd/vsx/arithmetic.h | 15 +++++++++++ 5 files changed, 108 insertions(+) diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 9d8b4ab5e62b..d61b4a0f66ae 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -62,6 +62,13 @@ #define npyv_mul_f32 _mm256_mul_ps #define npyv_mul_f64 _mm256_mul_pd +#ifdef NPY_HAVE_FMA3 + #define npyv_muladd_f32 _mm256_fmadd_ps + #define npyv_muladd_f64 _mm256_fmadd_pd +#else + #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) + #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) +#endif // saturated // TODO: after implment Packs intrins @@ -72,4 +79,22 @@ #define npyv_div_f32 _mm256_div_ps #define npyv_div_f64 _mm256_div_pd +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m256 a) +{ + __m128 t1 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a,1)); + __m128 t2 = _mm_movehdup_ps(t1); + __m128 t3 = _mm_add_ps(t1, t2); + __m128 t4 = _mm_movehl_ps(t3, t3); + __m128 t5 = _mm_add_ss(t3, t4); + return _mm_cvtss_f32(t5); +} + +NPY_FINLINE double npyv_sum_f64(__m256d a) +{ + __m128d t1 = _mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a,1)); + __m128d t2 = _mm_unpackhi_pd(t1, t1); + __m128d t3 = _mm_add_sd(t2, t1); + return _mm_cvtsd_f64(t3); +} #endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index fcaef0efd9a4..4b89c06c7321 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -103,6 +103,9 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_mul_f32 _mm512_mul_ps #define npyv_mul_f64 _mm512_mul_pd +#define npyv_muladd_f32 _mm512_fmadd_ps +#define npyv_muladd_f64 _mm512_fmadd_pd + // saturated // TODO: after implment Packs intrins @@ -112,5 +115,27 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: emulate integer division #define npyv_div_f32 _mm512_div_ps #define npyv_div_f64 _mm512_div_pd +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +} +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + 
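    // Each shuffle/add round folds the upper half of the active lanes onto
    // the lower half, so after three rounds lane 0 of sum8 holds the total.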
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +} #endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index ec8b8ecd0cfe..c7409e78fb53 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -60,6 +60,12 @@ #define npyv_mul_f32 vmulq_f32 #define npyv_mul_f64 vmulq_f64 +#ifdef NPY_HAVE_NEON_VFPV4 + #define npyv_muladd_f32(A, B, C) vfmaq_f32(C, A, B) +#else + #define npyv_muladd_f32(A, B, C) vmlaq_f32(C, A, B) +#endif +#define npyv_muladd_f64(A, B, C) vfmaq_f64(C, A, B) /*************************** * Division ***************************/ @@ -75,4 +81,17 @@ #endif #define npyv_div_f64 vdivq_f64 +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +#ifdef __aarch64__ + NPY_FINLINE double npyv_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + #endif // _NPY_SIMD_NEON_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 12d0af05cd15..62dc0d8cf4b0 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -82,6 +82,13 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_mul_f32 _mm_mul_ps #define npyv_mul_f64 _mm_mul_pd +#ifdef NPY_HAVE_FMA3 + #define npyv_muladd_f32 _mm_fmadd_ps + #define npyv_muladd_f64 _mm_fmadd_pd +#else + #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) + #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) +#endif // saturated // TODO: after implment Packs intrins @@ -92,4 +99,21 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_div_f32 _mm_div_ps #define npyv_div_f64 _mm_div_pd +// Horizontal add: Calculates the sum of all vector elements. +NPY_FINLINE float npyv_sum_f32(__m128 a) +{ + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +} + +NPY_FINLINE double npyv_sum_f64(__m128d a) +{ + __m128 t0 = _mm_castpd_ps(a); + __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); + __m128d t2 = _mm_add_sd(a,t1); + return _mm_cvtsd_f64(t2); +} #endif // _NPY_SIMD_SSE_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index dd23b5b11e95..eb1aa20b11d7 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -94,10 +94,25 @@ #define npyv_mul_f32 vec_mul #define npyv_mul_f64 vec_mul +#define npyv_muladd_f32 vec_madd +#define npyv_muladd_f64 vec_madd + /*************************** * Division ***************************/ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div +// TODO: Horizontal add: Calculates the sum of all vector elements. 
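+// For now the reduction is done in scalar code: each lane is pulled out with
+// vec_extract and the results added, which is simple but leaves the faster
+// in-register VSX reduction as the TODO above.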
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1) + + vec_extract(a, 2) + vec_extract(a, 3); +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + return vec_extract(a, 0) + vec_extract(a, 1); +} + #endif // _NPY_SIMD_VSX_ARITHMETIC_H From 47118fb6530e55f5f7164d2e652ff8b94f55d025 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 12:11:00 +0800 Subject: [PATCH 02/27] einsum dispatch and usimd process --- benchmarks/benchmarks/bench_linalg.py | 19 +- numpy/core/setup.py | 1 + numpy/core/src/multiarray/einsum.c.src | 1913 +---------------- .../core/src/multiarray/einsum.dispatch.c.src | 1530 +++++++++++++ numpy/core/src/multiarray/einsum_p.h | 48 + 5 files changed, 1603 insertions(+), 1908 deletions(-) create mode 100644 numpy/core/src/multiarray/einsum.dispatch.c.src create mode 100644 numpy/core/src/multiarray/einsum_p.h diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index dc2849d58380..a64fc05f78f1 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -112,11 +112,14 @@ class Einsum(Benchmark): def setup(self, dtype): self.a = np.arange(2900, dtype=dtype) self.b = np.arange(3000, dtype=dtype) + self.b1 = np.arange(240000, dtype=dtype).reshape(400, 600) self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40) self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40) + self.c2 = np.arange(480000, dtype=dtype) + self.c3 = np.arange(600, dtype=dtype) self.d = np.arange(10000, dtype=dtype).reshape(10,100,10) - #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two + # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): np.einsum("i,j", self.a, self.b, optimize=True) @@ -130,4 +133,16 @@ def time_einsum_sum_mul(self, dtype): # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): - np.einsum("i...,->", self.d, 300, optimize=True) \ No newline at end of file + np.einsum("i...,->", self.d, 300, optimize=True) + + # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two + def time_einsum_mul(self, dtype): + np.einsum("i,->i", self.c2, 300, optimize=True) + + # trigger contig_contig_outstride0_two + def time_einsum_contig_contig(self, dtype): + np.einsum("ji,i->", self.b1, self.c3, optimize=True) + + # trigger sum_of_products_contig_outstride0_one + def time_einsum_contig_outstride0(self, dtype): + np.einsum("i->", self.c2, optimize=True) \ No newline at end of file diff --git a/numpy/core/setup.py b/numpy/core/setup.py index aede12080017..e854fc0cae5b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -853,6 +853,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum.dispatch.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 2538e05c626a..94b22641daef 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -8,1917 +8,18 @@ * See LICENSE.txt for the license. 
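  *
  * (The bulk of this file -- the type-specialized sum-of-products inner
  * loops -- is moved into einsum.dispatch.c.src by this patch, so those
  * loops can be compiled once per enabled CPU dispatch target.)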
*/ -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "convert.h" -#include "common.h" -#include "ctors.h" - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 -#else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include -#endif - -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/********** PRINTF DEBUG TRACING **************/ -#define NPY_EINSUM_DBG_TRACING 0 - -#if NPY_EINSUM_DBG_TRACING -#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); -#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); -#else -#define NPY_EINSUM_DBG_PRINT(s) -#define NPY_EINSUM_DBG_PRINT1(s, p1) -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) -#endif -/**********************************************/ - -/**begin repeat - * #name = byte, short, int, long, longlong, - * ubyte, ushort, uint, ulong, ulonglong, - * half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_float, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble# - * #to = ,,,,, - * ,,,,, - * npy_float_to_half,,,, - * ,,# - * #from = ,,,,, - * ,,,,, - * npy_half_to_float,,,, - * ,,# - * #complex = 0*5, - * 0*5, - * 0*4, - * 1*3# - * #float32 = 0*5, - * 0*5, - * 0,1,0,0, - * 0*3# - * #float64 = 0*5, - * 0*5, - * 0,0,1,0, - * 0*3# - */ - -/**begin repeat1 - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ -static void -@name@_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data_out += stride_out; -# elif @nop@ == 2 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data_out += stride_out; -# elif @nop@ == 3 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - 
int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + - ((@temptype@ *)data_out)[0]; - ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + - ((@temptype@ *)data_out)[1]; - data0 += stride0; - data_out += stride_out; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } -} - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_one(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data_out = (@type@ *)dataptr[1]; - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - case 0: - return; - } - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else /* complex */ - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 2 && !@complex@ - -static void -@name@_sum_of_products_contig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* 
Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -/* Some extra specializations for the two operand case */ -static void -@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = 
_mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } -} - -static void -@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), 
value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* 
Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 3 && !@complex@ - -static void -@name@_sum_of_products_contig_three(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data2 = (@type@ *)dataptr[2]; - @type@ *data_out = (@type@ *)dataptr[3]; - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data2 += 8; - data_out += 8; - } - - /* Finish off the loop */ - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - if (count-- == 0) { - return; - } - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -} - -#else /* @nop@ > 3 || @complex */ - -static void -@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(@type@); - } -#else /* complex */ -# if @nop@ <= 3 -# define _SUMPROD_NOP @nop@ -# else -# define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += sizeof(@type@); - } -# undef _SUMPROD_NOP -#endif - } -} - -#endif /* functions for various @nop@ */ - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); 
-#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; -#else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#endif /* @nop@ == 1 */ - -static void -@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; -#else - @temptype@ accum = 0; -#endif - -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - accum += @from@(*(@type@ *)data0); - data0 += stride0; -# elif @nop@ == 2 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1); - data0 += stride0; - data1 += stride1; -# elif @nop@ == 3 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2); - data0 += stride0; - data1 += stride1; - data2 += stride2; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - accum += temp; - for (i = 0; i < nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - accum_re += ((@temptype@ *)data0)[0]; - accum_im += ((@temptype@ *)data0)[1]; - data0 += stride0; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - accum_re += re; - accum_im += im; - for (i = 0; i < _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } - -#if @complex@ -# if @nop@ <= 3 - ((@temptype@ *)dataptr[@nop@])[0] += accum_re; - ((@temptype@ *)dataptr[@nop@])[1] += accum_im; -# else - ((@temptype@ *)dataptr[nop])[0] += accum_re; - ((@temptype@ *)dataptr[nop])[1] += accum_im; -# endif -#else -# if @nop@ <= 3 - *((@type@ *)dataptr[@nop@]) = @to@(accum + - @from@(*((@type@ *)dataptr[@nop@]))); -# else - *((@type@ *)dataptr[nop]) = @to@(accum + - @from@(*((@type@ *)dataptr[nop]))); -# endif -#endif - -} - -/**end repeat1**/ - -/**end 
repeat**/ - - -/* Do OR of ANDs for the boolean type */ - -/**begin repeat - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ - -static void -bool_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - while (count--) { -#if @nop@ == 1 - *(npy_bool *)data_out = *(npy_bool *)data0 || - *(npy_bool *)data_out; - data0 += stride0; - data_out += stride_out; -#elif @nop@ == 2 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data_out += stride_out; -#elif @nop@ == 3 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } -} - -static void -bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; -#endif - -#if (@nop@ <= 3) -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat1 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -# if @nop@ == 1 - ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 2 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 3 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@] && - ((npy_bool *)data2)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# endif -/**end repeat1**/ - case 0: - return; - } -#endif - -/* Unroll the loop by 8 for fixed-size nop */ -#if (@nop@ <= 3) - while (count >= 8) { - count -= 8; -#else - while (count--) { -#endif - -# if @nop@ == 1 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 2 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 3 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@)) && - (*((npy_bool *)data2 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 
8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data2 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(npy_bool); - } -# endif - } - - /* If the loop was unrolled, we need to finish it off */ -#if (@nop@ <= 3) - goto finish_after_unrolled_loop; -#endif -} - -static void -bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ - npy_bool accum = 0; - -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - while (count--) { -#if @nop@ == 1 - accum = *(npy_bool *)data0 || accum; - data0 += stride0; -#elif @nop@ == 2 - accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; - data0 += stride0; - data1 += stride1; -#elif @nop@ == 3 - accum = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || accum; - data0 += stride0; - data1 += stride1; - data2 += stride2; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - accum = temp || accum; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } - -# if @nop@ <= 3 - *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); -# else - *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); -# endif -} - -/**end repeat**/ - -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - -/* These tables need to match up with the type enum */ -static sum_of_products_fn -_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ - &@name@_sum_of_products_contig_outstride0_one, -#else - NULL, -#endif -/**end repeat**/ -}; /* End of _contig_outstride0_unary_specialization_table */ - -static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 0, 0, 0, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_stride0_contig_outstride0_two, - &@name@_sum_of_products_stride0_contig_outcontig_two, - &@name@_sum_of_products_contig_stride0_outstride0_two, - &@name@_sum_of_products_contig_stride0_outcontig_two, - &@name@_sum_of_products_contig_contig_outstride0_two, -}, -#else - {NULL, NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _binary_specialization_table */ - -static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - 
* short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_outstride0_any, - &@name@_sum_of_products_outstride0_one, - &@name@_sum_of_products_outstride0_two, - &@name@_sum_of_products_outstride0_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _outstride0_specialized_table */ - -static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_contig_any, - &@name@_sum_of_products_contig_one, - &@name@_sum_of_products_contig_two, - &@name@_sum_of_products_contig_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _allcontig_specialized_table */ - -static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_any, - &@name@_sum_of_products_one, - &@name@_sum_of_products_two, - &@name@_sum_of_products_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _unnspecialized_table */ +#include "einsum_p.h" static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides) +get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) { - int iop; - - if (type_num >= NPY_NTYPES) { - return NULL; - } - - /* contiguous reduction */ - if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { - sum_of_products_fn ret = - _contig_outstride0_unary_specialization_table[type_num]; - if (ret != NULL) { - return ret; - } - } - - /* nop of 2 has more specializations */ - if (nop == 2) { - /* Encode the zero/contiguous strides */ - int code; - code = (fixed_strides[0] == 0) ? 0 : - (fixed_strides[0] == itemsize) ? 2*2*1 : 8; - code += (fixed_strides[1] == 0) ? 0 : - (fixed_strides[1] == itemsize) ? 2*1 : 8; - code += (fixed_strides[2] == 0) ? 0 : - (fixed_strides[2] == itemsize) ? 1 : 8; - if (code >= 2 && code < 7) { - sum_of_products_fn ret = - _binary_specialization_table[type_num][code-2]; - if (ret != NULL) { - return ret; - } - } - } - - /* Inner loop with an output stride of 0 */ - if (fixed_strides[nop] == 0) { - return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* Check for all contiguous */ - for (iop = 0; iop < nop + 1; ++iop) { - if (fixed_strides[iop] != itemsize) { - break; - } - } - - /* Contiguous loop */ - if (iop == nop + 1) { - return _allcontig_specialized_table[type_num][nop <= 3 ? 
nop : 0]; - } - - /* None of the above specializations caught it, general loops */ - return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; + #ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, + (nop, type_num, itemsize, fixed_strides)) } - /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src new file mode 100644 index 000000000000..bfb5075e3594 --- /dev/null +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -0,0 +1,1530 @@ +/* + * This file contains the implementation of the 'einsum' function, + * which provides an einstein-summation operation. + * + * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) + * The University of British Columbia + * + * See LICENSE.txt for the license. + */ +/** + * @targets $maxopt baseline + * SSE2 (AVX2 FMA3) AVX512F + * NEON NEON_VFPV4 + * VSX VSX2 + */ +#include "einsum_p.h" + +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_cfloat, npy_cdouble, npy_clongdouble# + * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_float, npy_float, npy_double, npy_longdouble, + * npy_float, npy_double, npy_longdouble# +* #sfx = s8, s16, s32, long, s64, + * u8, u16, u32, ulong, u64, + * half, f32, f64, longdouble, + * f32, f64, clongdouble# + * #to = ,,,,, + * ,,,,, + * npy_float_to_half,,,, + * ,,# + * #from = ,,,,, + * ,,,,, + * npy_half_to_float,,,, + * ,,# + * #complex = 0*5, + * 0*5, + * 0*4, + * 1*3# + * #float32 = 0*5, + * 0*5, + * 0,1,0,0, + * 0*3# + * #float64 = 0*5, + * 0*5, + * 0,0,1,0, + * 0*3# + * #NPYV_CHK = 0*5, + * 0*5, + * 0, NPY_SIMD, NPY_SIMD_F64, 0, + * 0*3# + * #unroll_by = 0*5, + * 0*5, + * 0,2, 4, 0, + * 0*3# + */ +/**begin repeat1 + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ +static void +@name@_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); + + while (count--) { +#if !@complex@ +# if @nop@ == 1 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data_out += stride_out; +# elif @nop@ == 2 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data_out += stride_out; +# elif @nop@ == 3 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) * + @from@(*(@type@ *)data2) + + 
@from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +# else + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +# endif +#else /* complex */ +# if @nop@ == 1 + ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + + ((@temptype@ *)data_out)[0]; + ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + + ((@temptype@ *)data_out)[1]; + data0 += stride0; + data_out += stride_out; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } +} + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_one(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data_out = (@type@ *)dataptr[1]; + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + case 0: + return; + } + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else /* complex */ + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +#elif @nop@ == 2 && !@complex@ + +static void +@name@_sum_of_products_contig_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data_out = (@type@ *)dataptr[2]; + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", + (int)count); +#if @NPYV_CHK@ // NPYV check for @type@ + /* Use aligned instructions if possible */ + #ifndef NPY_HAVE_NEON + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && + EINSUM_IS_ALIGNED(data_out); + #else + // ARM/Neon don't have instructions for aligned memory access + 
const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, b0, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, b1, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
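+ *
+ *    (Illustrative note: on 256-bit AVX2 npyv_nlanes_f32 is 8 and float32
+ *    takes the x2 path above, so one SIMD iteration covers 16 elements;
+ *    anything shorter than a full vector step falls through to the scalar
+ *    loops below.)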
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ a@i@ = @from@(data0[@i@]);
+        const @type@ b@i@ = @from@(data1[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const @type@ a = @from@(*data0);
+        const @type@ b = @from@(*data1);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a * b + c);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @type@ *data_out = (@type@ *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(va_scalar, b0, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(va_scalar, b1, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
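+ *
+ *    (Note: the stride-0 operand is broadcast once with npyv_setall before
+ *    the SIMD loop above, so the hot loop reads only two memory streams.)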
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ b@i@ = @from@(data1[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a_scalar * b@i@ + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data1, ++data_out) {
+        const @type@ b = @from@(*data1);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a_scalar * b + c);
+    }
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
+    @type@ *data_out = (@type@ *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@);
+            /**end repeat3**/
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+            /**end repeat3**/
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
+            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
+            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, vb_scalar, c0);
+            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, vb_scalar, c1);
+            npyv_@st@_@sfx@(data_out, abc0);
+            npyv_@st@_@sfx@(data_out + vstep, abc1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
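+ *
+ *    (The if(is_aligned)/else bodies generated by the repeat2 block above
+ *    are identical except that they pick the loada/storea or the unaligned
+ *    load/store flavour of the intrinsics.)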
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ a@i@ = @from@(data0[@i@]);
+        const @type@ c@i@ = @from@(data_out[@i@]);
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ abc@i@ = a@i@ * b_scalar + c@i@;
+        /**end repeat2**/
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        data_out[@i@] = @to@(abc@i@);
+        /**end repeat2**/
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data_out) {
+        const @type@ a = @from@(*data0);
+        const @type@ c = @from@(*data_out);
+        *data_out = @to@(a * b_scalar + c);
+    }
+}
+
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @temptype@ accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum);
+            npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3);
+            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2);
+            vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
+            vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
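+ *
+ *    (Both the SIMD reduction above and this unrolled loop re-associate the
+ *    summation, so floating-point results can differ from a strictly serial
+ *    accumulation in the last bits of precision.)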
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        /**begin repeat2
+         * #i = 0, 1, 2, 3#
+         */
+        const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+        /**end repeat2**/
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif
+    for (; count > 0; --count, ++data0, ++data1) {
+        const @type@ a = @from@(*data0);
+        const @type@ b = @from@(*data1);
+        accum += a * b;
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+}
+
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
+    @type@ *data1 = (@type@ *)dataptr[1];
+    @temptype@ accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data1);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
+            npyv_@sfx@ b23 = npyv_add_@sfx@(b2, b3);
+            npyv_@sfx@ b0123 = npyv_add_@sfx@(b01, b23);
+            vaccum = npyv_add_@sfx@(b0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) {
+            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
+            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
+            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
+            vaccum = npyv_add_@sfx@(b01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
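+ *
+ *    (Since one operand has stride 0, the loops above only sum the
+ *    contiguous operand; the scalar factor is applied once at the very end
+ *    as out += a_scalar * accum, saving a multiply per element.)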
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    for (; count >= 4; count -= 4, data1 += 4) {
+        const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
+        const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
+        accum += b01 + b23;
+    }
+#endif
+    for (; count > 0; --count, ++data1) {
+        accum += @from@(*data1);
+    }
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + a_scalar * accum);
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    @type@ *data0 = (@type@ *)dataptr[0];
+    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
+    @temptype@ accum = 0;
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+                          (int)count);
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    #ifndef NPY_HAVE_NEON
+    const int is_aligned = EINSUM_IS_ALIGNED(data0);
+    #else
+    // ARM/Neon don't have instructions for aligned memory access
+    const int is_aligned = 0;
+    #endif
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+
+    /**begin repeat2
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3);
+            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+            vaccum = npyv_add_@sfx@(a0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            vaccum = npyv_add_@sfx@(a01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
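+ *
+ *    (npyv_cleanup() above ends the SIMD block cleanly; on x86 it typically
+ *    expands to a zeroupper/zeroall to avoid AVX-SSE transition penalties,
+ *    and it is a no-op on targets that don't need it.)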
+ */ +#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + for (; count >= 4; count -= 4, data0 += 4) { + const @type@ a01 = @from@(data0[0]) + @from@(data0[1]); + const @type@ a23 = @from@(data0[2]) + @from@(data0[3]); + accum += a01 + a23; + } +#endif + for (; count > 0; --count, ++data0) { + accum += @from@(*data0); + } + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + b_scalar * accum); +} + +#elif @nop@ == 3 && !@complex@ + +static void +@name@_sum_of_products_contig_three(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data2 = (@type@ *)dataptr[2]; + @type@ *data_out = (@type@ *)dataptr[3]; + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data2 += 8; + data_out += 8; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +#else /* @nop@ > 3 || @complex */ + +static void +@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", + (int)count); + + while (count--) { +#if !@complex@ + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(@type@); + } +#else /* complex */ +# if @nop@ <= 3 +# define _SUMPROD_NOP @nop@ +# else +# define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += sizeof(@type@); + } +# undef _SUMPROD_NOP +#endif + } +} + +#endif /* functions for various @nop@ */ + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if @complex@ + @temptype@ accum_re = 0, accum_im = 0; + @temptype@ *data0 = (@temptype@ *)dataptr[0]; +#else + @temptype@ accum = 0; + @type@ *data0 = (@type@ *)dataptr[0]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); +#if @NPYV_CHK@ // NPYV check for @type@ + /* Use aligned instructions if possible */ + #ifndef NPY_HAVE_NEON + const int is_aligned = EINSUM_IS_ALIGNED(data0); + #else + // ARM/Neon don't have instructions for aligned memory access + const int is_aligned = 0; + #endif + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + + /**begin repeat2 + * #cond = if(is_aligned), else# + 
* #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+    #if @unroll_by@ == 4
+        const int vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
+            /**begin repeat3
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+            /**end repeat3**/
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3);
+            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+            vaccum = npyv_add_@sfx@(a0123, vaccum);
+        }
+    #elif @unroll_by@ == 2
+        const int vstepx2 = vstep * 2;
+        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
+            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
+            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
+            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+            vaccum = npyv_add_@sfx@(a01, vaccum);
+        }
+    #endif
+    }
+    /**end repeat2**/
+    accum = npyv_sum_@sfx@(vaccum);
+npyv_cleanup();
+#endif // NPYV check for @type@
+/**
+ * Unroll by four/eight scalars in case of:
+ *  - The SIMD width is higher than 128-bit, since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - To give the compiler a chance to
+ *    auto-vectorize in case NPYV isn't available.
+ */
+#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+    #if @complex@
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const @temptype@ re01 = data0[0] + data0[2];
+        const @temptype@ re23 = data0[4] + data0[6];
+        const @temptype@ im13 = data0[1] + data0[3];
+        const @temptype@ im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+    #else
+    for (; count > 4; count -= 4, data0 += 4) {
+        const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]);
+        const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]);
+        accum += a01 + a23;
+    }
+    #endif // complex
+#endif
+#if @complex@
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((@temptype@ *)dataptr[1])[0] += accum_re;
+    ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+    for (; count > 0; --count, ++data0) {
+        accum += @from@(*data0);
+    }
+    *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
+#endif // complex
+}
+
+#endif /* @nop@ == 1 */
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+                               npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+    @temptype@ accum_re = 0, accum_im = 0;
+#else
+    @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+                          (int)count);
+
+    while (count--) {
+#if !@complex@
+# if @nop@ == 1
+        accum += @from@(*(@type@ *)data0);
+        data0 += stride0;
+# elif @nop@ == 2
+        accum += @from@(*(@type@ *)data0) *
+                 @from@(*(@type@ *)data1);
+        data0 += stride0;
+        data1 += stride1;
+# elif @nop@ == 3
+        accum += @from@(*(@type@ *)data0) *
+                 @from@(*(@type@ *)data1) *
+                 @from@(*(@type@ *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+# else
+        @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= @from@(*(@type@ *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+# endif
+#else /* complex */
+# if @nop@ == 1
+        accum_re +=
((@temptype@ *)data0)[0]; + accum_im += ((@temptype@ *)data0)[1]; + data0 += stride0; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + accum_re += re; + accum_im += im; + for (i = 0; i < _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } + +#if @complex@ +# if @nop@ <= 3 + ((@temptype@ *)dataptr[@nop@])[0] += accum_re; + ((@temptype@ *)dataptr[@nop@])[1] += accum_im; +# else + ((@temptype@ *)dataptr[nop])[0] += accum_re; + ((@temptype@ *)dataptr[nop])[1] += accum_im; +# endif +#else +# if @nop@ <= 3 + *((@type@ *)dataptr[@nop@]) = @to@(accum + + @from@(*((@type@ *)dataptr[@nop@]))); +# else + *((@type@ *)dataptr[nop]) = @to@(accum + + @from@(*((@type@ *)dataptr[nop]))); +# endif +#endif + +} + +/**end repeat1**/ + +/**end repeat**/ + + +/* Do OR of ANDs for the boolean type */ + +/**begin repeat + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ + +static void +bool_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + while (count--) { +#if @nop@ == 1 + *(npy_bool *)data_out = *(npy_bool *)data0 || + *(npy_bool *)data_out; + data0 += stride0; + data_out += stride_out; +#elif @nop@ == 2 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data_out += stride_out; +#elif @nop@ == 3 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } +} + +static void +bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; +#endif + +#if (@nop@ <= 3) +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat1 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +# if @nop@ == 1 + ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 2 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool *)data1)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 3 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool 
*)data1)[@i@] && + ((npy_bool *)data2)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# endif +/**end repeat1**/ + case 0: + return; + } +#endif + +/* Unroll the loop by 8 for fixed-size nop */ +#if (@nop@ <= 3) + while (count >= 8) { + count -= 8; +#else + while (count--) { +#endif + +# if @nop@ == 1 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 2 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 3 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@)) && + (*((npy_bool *)data2 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data2 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(npy_bool); + } +# endif + } + + /* If the loop was unrolled, we need to finish it off */ +#if (@nop@ <= 3) + goto finish_after_unrolled_loop; +#endif +} + +static void +bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ + npy_bool accum = 0; + +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + while (count--) { +#if @nop@ == 1 + accum = *(npy_bool *)data0 || accum; + data0 += stride0; +#elif @nop@ == 2 + accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; + data0 += stride0; + data1 += stride1; +#elif @nop@ == 3 + accum = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || accum; + data0 += stride0; + data1 += stride1; + data2 += stride2; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + accum = temp || accum; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } + +# if @nop@ <= 3 + *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); +# else + *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); +# endif +} + +/**end repeat**/ + +typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); + +/* These tables need to match up with the type enum */ +static sum_of_products_fn +_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 0, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ + 
&@name@_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 0,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_stride0_contig_outstride0_two,
+    &@name@_sum_of_products_stride0_contig_outcontig_two,
+    &@name@_sum_of_products_contig_stride0_outstride0_two,
+    &@name@_sum_of_products_contig_stride0_outcontig_two,
+    &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_outstride0_any,
+    &@name@_sum_of_products_outstride0_one,
+    &@name@_sum_of_products_outstride0_two,
+    &@name@_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_contig_any,
+    &@name@_sum_of_products_contig_one,
+    &@name@_sum_of_products_contig_two,
+    &@name@_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ *         byte, ubyte,
+ *         short, ushort,
+ *         int, uint,
+ *         long, ulong,
+ *         longlong, ulonglong,
+ *         float, double, longdouble,
+ *         cfloat, cdouble, clongdouble,
+ *         object, string, unicode, void,
+ *         datetime, timedelta, half#
+ * #use = 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1,
+ *        1, 1, 1,
+ *        1, 1, 1,
+ *        0, 0, 0, 0,
+ *        0, 0, 1#
+ */
+#if @use@
+{
+    &@name@_sum_of_products_any,
+    &@name@_sum_of_products_one,
+    &@name@_sum_of_products_two,
+    &@name@_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unspecialized_table */
+
+NPY_NO_EXPORT sum_of_products_fn NPY_CPU_DISPATCH_CURFX(einsum_get_sum_of_products_function)
+(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)
+{
+    int iop;
+
+    if (type_num >= NPY_NTYPES) {
+        return NULL;
+    }
+
+    /* contiguous reduction */
+    if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0)
{ + sum_of_products_fn ret = + _contig_outstride0_unary_specialization_table[type_num]; + if (ret != NULL) { + return ret; + } + } + + /* nop of 2 has more specializations */ + if (nop == 2) { + /* Encode the zero/contiguous strides */ + int code; + code = (fixed_strides[0] == 0) ? 0 : + (fixed_strides[0] == itemsize) ? 2*2*1 : 8; + code += (fixed_strides[1] == 0) ? 0 : + (fixed_strides[1] == itemsize) ? 2*1 : 8; + code += (fixed_strides[2] == 0) ? 0 : + (fixed_strides[2] == itemsize) ? 1 : 8; + if (code >= 2 && code < 7) { + sum_of_products_fn ret = + _binary_specialization_table[type_num][code-2]; + if (ret != NULL) { + return ret; + } + } + } + + /* Inner loop with an output stride of 0 */ + if (fixed_strides[nop] == 0) { + return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* Check for all contiguous */ + for (iop = 0; iop < nop + 1; ++iop) { + if (fixed_strides[iop] != itemsize) { + break; + } + } + + /* Contiguous loop */ + if (iop == nop + 1) { + return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* None of the above specializations caught it, general loops */ + return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; +} diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h new file mode 100644 index 000000000000..7abe1b7f7548 --- /dev/null +++ b/numpy/core/src/multiarray/einsum_p.h @@ -0,0 +1,48 @@ +#ifndef _NPY_EINSUM_P_H_ +#define _NPY_EINSUM_P_H_ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "structmember.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include +#include +#include +#include + +#include + +#include "simd/simd.h" +#include "convert.h" +#include "common.h" +#include "ctors.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) + +/********** PRINTF DEBUG TRACING **************/ +#define NPY_EINSUM_DBG_TRACING 0 + +#if NPY_EINSUM_DBG_TRACING +#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); +#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); +#else +#define NPY_EINSUM_DBG_PRINT(s) +#define NPY_EINSUM_DBG_PRINT1(s, p1) +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" +#endif + +typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, + (int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides)) + +#endif // _NPY_EINSUM_P_H_ From 55200fcf36b436dfa92773f14193243020d40f8a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 11 Aug 2020 16:59:37 +0800 Subject: [PATCH 03/27] add float32 benchmark case --- benchmarks/benchmarks/bench_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index a64fc05f78f1..56f9a2e095c8 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -108,7 +108,7 @@ def time_numpy_linalg_lstsq_a__b_float64(self): class Einsum(Benchmark): param_names = ['dtype'] - params = [[np.float64]] + params = [[np.float32, np.float64]] def setup(self, dtype): self.a = np.arange(2900, dtype=dtype) self.b = np.arange(3000, dtype=dtype) From ae53e350dc4b367356e1268d4a5ca9a085e43cbc Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 
12 Aug 2020 16:57:02 +0800 Subject: [PATCH 04/27] fix typos --- numpy/core/src/common/simd/vsx/arithmetic.h | 2 +- numpy/core/src/multiarray/einsum_p.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index eb1aa20b11d7..9a67386f0986 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -103,7 +103,7 @@ #define npyv_div_f32 vec_div #define npyv_div_f64 vec_div -// TODO: Horizontal add: Calculates the sum of all vector elements. +// Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { return vec_extract(a, 0) + vec_extract(a, 1) + diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h index 7abe1b7f7548..6b50f01baa45 100644 --- a/numpy/core/src/multiarray/einsum_p.h +++ b/numpy/core/src/multiarray/einsum_p.h @@ -43,6 +43,6 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides)) + (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif // _NPY_EINSUM_P_H_ From 2e713b0b4b119dda516be1e1df1b9cbc18628f1f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 13 Aug 2020 14:53:41 +0800 Subject: [PATCH 05/27] add avx512 reduce sum comments --- numpy/core/src/common/simd/avx512/arithmetic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 4b89c06c7321..4e04e9f3f4db 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -115,6 +115,18 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) // TODO: emulate integer division #define npyv_div_f32 _mm512_div_ps #define npyv_div_f64 _mm512_div_pd + +/*************************** + * Reduce Sum + * there are three ways to implement reduce sum for AVX512: + * 1- split(256) /add /split(128) /add /hadd /hadd /extract + * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract + * 3- _mm512_reduce_add_ps/pd + * The first one is been widely used by many projects while the second one is used by Intel Compiler and here + * the reason why the second preferred by intel compiler maybe because the latency of hadd increased by (2-3) + * starting from Skylake-X which makes two extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more clarification. + * The third one is almost the same as the second one but only works for intel compiler/GCC 7.1/Clang 4. 
+ ***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); From 5e7cbd1f51074500b5d5a304e75f84deb46fecf0 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 20 Aug 2020 17:07:07 +0800 Subject: [PATCH 06/27] add non_contigous arrays ,improve reduce the sum --- benchmarks/benchmarks/bench_linalg.py | 41 ++++++++++++------- numpy/core/src/common/simd/avx2/arithmetic.h | 21 +++++----- .../core/src/common/simd/avx512/arithmetic.h | 12 ++++-- numpy/core/src/common/simd/sse/arithmetic.h | 17 +++++--- 4 files changed, 56 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 56f9a2e095c8..04889265b591 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -110,39 +110,50 @@ class Einsum(Benchmark): param_names = ['dtype'] params = [[np.float32, np.float64]] def setup(self, dtype): - self.a = np.arange(2900, dtype=dtype) - self.b = np.arange(3000, dtype=dtype) - self.b1 = np.arange(240000, dtype=dtype).reshape(400, 600) - self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40) - self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40) - self.c2 = np.arange(480000, dtype=dtype) - self.c3 = np.arange(600, dtype=dtype) - self.d = np.arange(10000, dtype=dtype).reshape(10,100,10) + self.one_dim_small = np.arange(600, dtype=dtype) + self.one_dim = np.arange(3000, dtype=dtype) + self.one_dim_big = np.arange(480000, dtype=dtype) + self.two_dim_small = np.arange(1200, dtype=dtype).reshape(30, 40) + self.two_dim = np.arange(240000, dtype=dtype).reshape(400, 600) + self.three_dim_small = np.arange(10000, dtype=dtype).reshape(10,100,10) + self.three_dim = np.arange(24000, dtype=dtype).reshape(20, 30, 40) + # non_contigous arrays + self.non_contigous_dim1_small = np.arange(1, 80, 2, dtype=dtype) + self.non_contigous_dim1 = np.arange(1, 4000, 2, dtype=dtype) + self.non_contigous_dim2 = np.arange(1, 2400, 2, dtype=dtype).reshape(30, 40) + self.non_contigous_dim3 = np.arange(1, 48000, 2, dtype=dtype).reshape(20, 30, 40) # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): - np.einsum("i,j", self.a, self.b, optimize=True) + np.einsum("i,j", self.one_dim, self.one_dim, optimize=True) + np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) # multiply(a, b):trigger sum_of_products_contig_two def time_einsum_multiply(self, dtype): - np.einsum("..., ...", self.c1, self.c , optimize=True) + np.einsum("..., ...", self.two_dim_small, self.three_dim , optimize=True) + np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two def time_einsum_sum_mul(self, dtype): - np.einsum(",i...->", 300, self.d, optimize=True) + np.einsum(",i...->", 300, self.three_dim_small, optimize=True) + np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): - np.einsum("i...,->", self.d, 300, optimize=True) + np.einsum("i...,->", self.three_dim_small, 300, optimize=True) + np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two def time_einsum_mul(self, dtype): - np.einsum("i,->i", self.c2, 300, optimize=True) + np.einsum("i,->i", self.one_dim_big, 300, optimize=True) + 
np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) # trigger contig_contig_outstride0_two def time_einsum_contig_contig(self, dtype): - np.einsum("ji,i->", self.b1, self.c3, optimize=True) + np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True) + np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): - np.einsum("i->", self.c2, optimize=True) \ No newline at end of file + np.einsum("i->", self.one_dim_big, optimize=True) + np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index d61b4a0f66ae..084e854b4559 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -82,19 +82,20 @@ // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m256 a) { - __m128 t1 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a,1)); - __m128 t2 = _mm_movehdup_ps(t1); - __m128 t3 = _mm_add_ps(t1, t2); - __m128 t4 = _mm_movehl_ps(t3, t3); - __m128 t5 = _mm_add_ss(t3, t4); - return _mm_cvtss_f32(t5); + __m256 sum_halves = _mm256_hadd_ps(a, a); + sum_halves = _mm256_hadd_ps(sum_halves, sum_halves); + __m128 lo = _mm256_castps256_ps128(sum_halves); + __m128 hi = _mm256_extractf128_ps(sum_halves, 1); + __m128 sum = _mm_add_ps(lo, hi); + return _mm_cvtss_f32(sum); } NPY_FINLINE double npyv_sum_f64(__m256d a) { - __m128d t1 = _mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a,1)); - __m128d t2 = _mm_unpackhi_pd(t1, t1); - __m128d t3 = _mm_add_sd(t2, t1); - return _mm_cvtsd_f64(t3); + __m256d sum_halves = _mm256_hadd_pd(a, a); + __m128d lo = _mm256_castpd256_pd128(sum_halves); + __m128d hi = _mm256_extractf128_pd(sum_halves, 1); + __m128d sum = _mm_add_pd(lo, hi); + return _mm_cvtsd_f64(sum); } #endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 4e04e9f3f4db..def5df093b45 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -122,10 +122,14 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * 1- split(256) /add /split(128) /add /hadd /hadd /extract * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract * 3- _mm512_reduce_add_ps/pd - * The first one is been widely used by many projects while the second one is used by Intel Compiler and here - * the reason why the second preferred by intel compiler maybe because the latency of hadd increased by (2-3) - * starting from Skylake-X which makes two extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more clarification. - * The third one is almost the same as the second one but only works for intel compiler/GCC 7.1/Clang 4. + * The first one is been widely used by many projects + * + * the second one is used by Intel Compiler, maybe because the + * latency of hadd increased by (2-3) starting from Skylake-X which makes two + * extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info. + * + * The third one is almost the same as the second one but only works for + * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 62dc0d8cf4b0..74c9539240b8 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -102,18 +102,23 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m128 a) { +#ifdef NPY_HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else __m128 t1 = _mm_movehl_ps(a, a); __m128 t2 = _mm_add_ps(a, t1); __m128 t3 = _mm_shuffle_ps(t2, t2, 1); __m128 t4 = _mm_add_ss(t2, t3); - return _mm_cvtss_f32(t4); + return _mm_cvtss_f32(t4); +#endif } - NPY_FINLINE double npyv_sum_f64(__m128d a) { - __m128 t0 = _mm_castpd_ps(a); - __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); - __m128d t2 = _mm_add_sd(a,t1); - return _mm_cvtsd_f64(t2); +#ifdef NPY_HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif } #endif // _NPY_SIMD_SSE_ARITHMETIC_H From 90602314902f68e6e0971bf05ce695ec71a52ba8 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 24 Aug 2020 11:35:06 +0800 Subject: [PATCH 07/27] rebase after split for a better review --- numpy/core/setup.py | 2 +- .../core/src/common/simd/avx512/arithmetic.h | 8 + numpy/core/src/multiarray/einsum.c.src | 10 + .../core/src/multiarray/einsum.dispatch.c.src | 4 +- numpy/core/src/multiarray/einsum_p.h | 48 - .../core/src/multiarray/einsum_sumprod.c.src | 1897 ----------------- numpy/core/src/multiarray/einsum_sumprod.h | 27 +- 7 files changed, 45 insertions(+), 1951 deletions(-) delete mode 100644 numpy/core/src/multiarray/einsum_p.h delete mode 100644 numpy/core/src/multiarray/einsum_sumprod.c.src diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a4a84397d9ee..43ba1e22e661 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -855,7 +855,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), - join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'einsum.dispatch.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index def5df093b45..94ff185fcf5a 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -133,6 +133,9 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) ***************************/ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { +#ifdef NPY_HAVE_AVX512F_REDUCE + return _mm512_reduce_add_ps(a); +#else __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); __m512 sum32 = _mm512_add_ps(a, h64); __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); @@ -142,9 +145,13 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); __m512 sum4 = _mm512_add_ps(sum8, h4); return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +#endif } NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { +#ifdef NPY_HAVE_AVX512F_REDUCE + return _mm512_reduce_add_pd(a); +#else __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); __m512d sum32 = 
_mm512_add_pd(a, h64); __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); @@ -152,6 +159,7 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a) __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); __m512d sum8 = _mm512_add_pd(sum16, h16); return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +#endif } #endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index cfbee0fe9a18..01ca3111eb1e 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,6 +27,16 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" +static sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) +{ + #ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, + (nop, type_num, itemsize, fixed_strides)) +} + /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index bfb5075e3594..1c692518e3a8 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -13,8 +13,8 @@ * NEON NEON_VFPV4 * VSX VSX2 */ -#include "einsum_p.h" - +#include "einsum_sumprod.h" +#include "einsum_debug.h" /**begin repeat * #name = byte, short, int, long, longlong, * ubyte, ushort, uint, ulong, ulonglong, diff --git a/numpy/core/src/multiarray/einsum_p.h b/numpy/core/src/multiarray/einsum_p.h deleted file mode 100644 index 6b50f01baa45..000000000000 --- a/numpy/core/src/multiarray/einsum_p.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _NPY_EINSUM_P_H_ -#define _NPY_EINSUM_P_H_ - -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "simd/simd.h" -#include "convert.h" -#include "common.h" -#include "ctors.h" - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) - -/********** PRINTF DEBUG TRACING **************/ -#define NPY_EINSUM_DBG_TRACING 0 - -#if NPY_EINSUM_DBG_TRACING -#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); -#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); -#else -#define NPY_EINSUM_DBG_PRINT(s) -#define NPY_EINSUM_DBG_PRINT1(s, p1) -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" -#endif - -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) - -#endif // _NPY_EINSUM_P_H_ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src deleted file mode 100644 index c58e742874d0..000000000000 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ /dev/null @@ -1,1897 +0,0 @@ -/* - * This file provides optimized sum of product implementations used internally - * by einsum. 
- * - * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) - * The University of British Columbia - * - * See LICENSE.txt for the license. - */ - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE - -#include -#include /* for NPY_NTYPES */ -#include - -#include "einsum_sumprod.h" -#include "einsum_debug.h" - - -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 -#else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include -#endif - -#if EINSUM_USE_SSE2 -#include -#endif - -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/**********************************************/ - -/**begin repeat - * #name = byte, short, int, long, longlong, - * ubyte, ushort, uint, ulong, ulonglong, - * half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_float, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble# - * #to = ,,,,, - * ,,,,, - * npy_float_to_half,,,, - * ,,# - * #from = ,,,,, - * ,,,,, - * npy_half_to_float,,,, - * ,,# - * #complex = 0*5, - * 0*5, - * 0*4, - * 1*3# - * #float32 = 0*5, - * 0*5, - * 0,1,0,0, - * 0*3# - * #float64 = 0*5, - * 0*5, - * 0,0,1,0, - * 0*3# - */ - -/**begin repeat1 - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ -static void -@name@_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data_out += stride_out; -# elif @nop@ == 2 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data_out += stride_out; -# elif @nop@ == 3 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + - ((@temptype@ *)data_out)[0]; - ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + - ((@temptype@ 
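/*
 * For readers new to .c.src files: each @name@/@type@/@to@/@from@ token in
 * the block above is substituted by NumPy's template preprocessor, emitting
 * one specialization per entry in the repeat lists. A sketch of what the
 * float, two-operand case expands to (illustrative, not the literal
 * generated text; @to@ and @from@ are empty for float):
 *
 *     static void
 *     float_sum_of_products_two(int nop, char **dataptr,
 *                               npy_intp const *strides, npy_intp count)
 *     {
 *         ...
 *         *(npy_float *)data_out = *(npy_float *)data0 *
 *                                  *(npy_float *)data1 +
 *                                  *(npy_float *)data_out;
 *         ...
 *     }
 */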
*)data_out)[1]; - data0 += stride0; - data_out += stride_out; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } -} - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_one(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data_out = (@type@ *)dataptr[1]; - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - case 0: - return; - } - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else /* complex */ - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 2 && !@complex@ - -static void -@name@_sum_of_products_contig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 
8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -/* Some extra specializations for the two operand case */ -static void -@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = _mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - 
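The goto/switch pairing running through all of these kernels is worth a gloss: the main loop retires eight elements per pass, and whatever remainder (0 to 7) is left falls through a reversed-case switch that sits ahead of the loop and is reached by goto. A stripped-down scalar sketch of the same control flow (illustrative only):

    static void
    sketch_sum_unrolled(const float *src, float *dst, npy_intp count)
    {
    finish_after_unrolled_loop:
        switch (count) {
            /* reversed cases fall through, touching exactly `count` tails */
            case 7: dst[6] += src[6];
            case 6: dst[5] += src[5];
            case 5: dst[4] += src[4];
            case 4: dst[3] += src[3];
            case 3: dst[2] += src[2];
            case 2: dst[1] += src[1];
            case 1: dst[0] += src[0];
            case 0: return;
        }
        while (count >= 8) {      /* counts of 8+ match no case above */
            count -= 8;
            for (int i = 0; i < 8; i++) {
                dst[i] += src[i];
            }
            src += 8;
            dst += 8;
        }
        goto finish_after_unrolled_loop;
    }

Placing the tail switch first keeps small counts, which dominate many einsum call patterns, on a short fall-through path.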
- /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } -} - -static void -@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = 
@to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
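The note above is easy to read past, so a concrete case helps: the vector path keeps four partial sums in SSE lanes and only combines them at the end, which is a different association than the strict left-to-right scalar loop and can round differently. A tiny self-contained demonstration with hand-picked values (hypothetical, chosen only to make the divergence visible):

    #include <stdio.h>

    int main(void)
    {
        float x[8] = {1e8f, 1.0f, 1.0f, 1.0f, -1e8f, 1.0f, 1.0f, 1.0f};
        float serial = 0.0f;
        for (int i = 0; i < 8; i++) {
            serial += x[i];          /* each 1.0f is absorbed into 1e8f */
        }
        float lane[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int i = 0; i < 8; i++) {
            lane[i % 4] += x[i];     /* what the SSE accumulator does */
        }
        float lanewise = (lane[0] + lane[1]) + (lane[2] + lane[3]);
        printf("serial=%g lanewise=%g\n", serial, lanewise);  /* 3 vs 6 */
        return 0;
    }

Both answers are valid floating-point sums of the same data; einsum simply does not promise a particular association here.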
- */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
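The two-shuffle reduction that follows each of these loops compresses four lane sums into one scalar: the first shuffle swaps adjacent lanes so the add produces pairwise sums, the second swaps the 64-bit halves so the add leaves the grand total in every lane, and _mm_store_ss writes out lane 0. Pulled out on its own, as a sketch using the same intrinsics:

    #include <xmmintrin.h>

    static float
    sketch_hsum_ps(__m128 v)
    {
        __m128 t;
        t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); /* (v1,v0,v3,v2) */
        v = _mm_add_ps(v, t);                              /* pairwise sums */
        t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2)); /* swap halves */
        v = _mm_add_ps(v, t);                              /* total in all lanes */
        return _mm_cvtss_f32(v);
    }

On SSE3 targets the same job is done by the _mm_hadd_ps pair shown in the sse/arithmetic.h hunk earlier in the series.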
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
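These stride0/outstride0 kernels rest on a small identity: when an operand never advances, sum_i(v * b[i]) equals v * sum_i(b[i]), so the hot loop accumulates b alone and the single multiply happens once at the end. A scalar sketch of the rewrite:

    static double
    sketch_scaled_sum(double v, const double *b, npy_intp n)
    {
        double accum = 0.0;
        for (npy_intp i = 0; i < n; i++) {
            accum += b[i];      /* no multiply in the loop body */
        }
        return v * accum;       /* one multiply instead of n */
    }

In exact arithmetic the two forms are identical; in floating point they can differ in the last bits, which is the same caveat the NOTE comments already make about reordering.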
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 3 && !@complex@ - -static void -@name@_sum_of_products_contig_three(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data2 = (@type@ *)dataptr[2]; - @type@ *data_out = (@type@ *)dataptr[3]; - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data2 += 8; - data_out += 8; - } - - /* Finish off the loop */ - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - if (count-- == 0) { - return; - } - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -} - -#else /* @nop@ > 3 || @complex */ - -static void 
-@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(@type@); - } -#else /* complex */ -# if @nop@ <= 3 -# define _SUMPROD_NOP @nop@ -# else -# define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += sizeof(@type@); - } -# undef _SUMPROD_NOP -#endif - } -} - -#endif /* functions for various @nop@ */ - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; -#else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
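The _mm_prefetch calls in these reduction loops ask for data well ahead of the current read position: data0 + 512 is pointer arithmetic on the element type, so for floats it points 2 KiB ahead, far enough that the cache line arrives before the loop reaches it. A minimal sketch of the pattern (issuing one hint per 16 elements is an illustrative choice, not what the generated code does):

    #include <xmmintrin.h>

    static float
    sketch_prefetched_sum(const float *data, npy_intp count)
    {
        float accum = 0.0f;
        for (npy_intp i = 0; i < count; i++) {
            if ((i & 15) == 0) {
                /* hint the line ~2 KiB ahead into all cache levels */
                _mm_prefetch((const char *)(data + i + 512), _MM_HINT_T0);
            }
            accum += data[i];
        }
        return accum;
    }

Prefetch hints past the end of the buffer are harmless; the hardware simply drops them.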
- */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. 
- */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#endif /* @nop@ == 1 */ - -static void -@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; -#else - @temptype@ accum = 0; -#endif - -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - accum += @from@(*(@type@ *)data0); - data0 += stride0; -# elif @nop@ == 2 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1); - data0 += stride0; - data1 += stride1; -# elif @nop@ == 3 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2); - data0 += stride0; - data1 += stride1; - data2 += stride2; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - accum += temp; - for (i = 0; i < nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - accum_re += ((@temptype@ *)data0)[0]; - accum_im += ((@temptype@ *)data0)[1]; - data0 += stride0; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - accum_re += re; - accum_im += im; - for (i = 0; i < _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } - -#if @complex@ -# if @nop@ <= 3 - ((@temptype@ *)dataptr[@nop@])[0] += accum_re; - ((@temptype@ *)dataptr[@nop@])[1] += accum_im; -# else - ((@temptype@ *)dataptr[nop])[0] += accum_re; - ((@temptype@ *)dataptr[nop])[1] += accum_im; -# endif -#else -# if @nop@ <= 3 - *((@type@ *)dataptr[@nop@]) = @to@(accum + - @from@(*((@type@ *)dataptr[@nop@]))); -# else - *((@type@ *)dataptr[nop]) = @to@(accum + - @from@(*((@type@ *)dataptr[nop]))); -# endif -#endif - -} - -/**end repeat1**/ - -/**end 
repeat**/ - - -/* Do OR of ANDs for the boolean type */ - -/**begin repeat - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ - -static void -bool_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - while (count--) { -#if @nop@ == 1 - *(npy_bool *)data_out = *(npy_bool *)data0 || - *(npy_bool *)data_out; - data0 += stride0; - data_out += stride_out; -#elif @nop@ == 2 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data_out += stride_out; -#elif @nop@ == 3 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } -} - -static void -bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; -#endif - -#if (@nop@ <= 3) -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat1 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -# if @nop@ == 1 - ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 2 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 3 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@] && - ((npy_bool *)data2)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# endif -/**end repeat1**/ - case 0: - return; - } -#endif - -/* Unroll the loop by 8 for fixed-size nop */ -#if (@nop@ <= 3) - while (count >= 8) { - count -= 8; -#else - while (count--) { -#endif - -# if @nop@ == 1 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 2 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 3 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@)) && - (*((npy_bool *)data2 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 
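For the boolean type, the "sum of products" above degenerates into an OR of ANDs: multiplication becomes logical AND and addition becomes logical OR, so a boolean einsum computes an any-of-all style result. The two-operand kernel in scalar sketch form:

    #include <numpy/npy_common.h>

    static void
    sketch_bool_sum_of_products_two(const npy_bool *a, const npy_bool *b,
                                    npy_bool *out, npy_intp count)
    {
        while (count--) {
            /* product -> AND, sum -> OR, accumulated into out */
            *out = (*a && *b) || *out;
            a++;
            b++;
            out++;
        }
    }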
8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data2 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(npy_bool); - } -# endif - } - - /* If the loop was unrolled, we need to finish it off */ -#if (@nop@ <= 3) - goto finish_after_unrolled_loop; -#endif -} - -static void -bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ - npy_bool accum = 0; - -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - while (count--) { -#if @nop@ == 1 - accum = *(npy_bool *)data0 || accum; - data0 += stride0; -#elif @nop@ == 2 - accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; - data0 += stride0; - data1 += stride1; -#elif @nop@ == 3 - accum = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || accum; - data0 += stride0; - data1 += stride1; - data2 += stride2; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - accum = temp || accum; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } - -# if @nop@ <= 3 - *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); -# else - *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); -# endif -} - -/**end repeat**/ - -/* These tables need to match up with the type enum */ -static sum_of_products_fn -_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ - &@name@_sum_of_products_contig_outstride0_one, -#else - NULL, -#endif -/**end repeat**/ -}; /* End of _contig_outstride0_unary_specialization_table */ - -static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 0, 0, 0, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_stride0_contig_outstride0_two, - &@name@_sum_of_products_stride0_contig_outcontig_two, - &@name@_sum_of_products_contig_stride0_outstride0_two, - &@name@_sum_of_products_contig_stride0_outcontig_two, - &@name@_sum_of_products_contig_contig_outstride0_two, -}, -#else - {NULL, NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _binary_specialization_table */ - -static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * 
float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_outstride0_any, - &@name@_sum_of_products_outstride0_one, - &@name@_sum_of_products_outstride0_two, - &@name@_sum_of_products_outstride0_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _outstride0_specialized_table */ - -static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_contig_any, - &@name@_sum_of_products_contig_one, - &@name@_sum_of_products_contig_two, - &@name@_sum_of_products_contig_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _allcontig_specialized_table */ - -static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_any, - &@name@_sum_of_products_one, - &@name@_sum_of_products_two, - &@name@_sum_of_products_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _unnspecialized_table */ - -NPY_VISIBILITY_HIDDEN sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides) -{ - int iop; - - if (type_num >= NPY_NTYPES) { - return NULL; - } - - /* contiguous reduction */ - if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { - sum_of_products_fn ret = - _contig_outstride0_unary_specialization_table[type_num]; - if (ret != NULL) { - return ret; - } - } - - /* nop of 2 has more specializations */ - if (nop == 2) { - /* Encode the zero/contiguous strides */ - int code; - code = (fixed_strides[0] == 0) ? 0 : - (fixed_strides[0] == itemsize) ? 2*2*1 : 8; - code += (fixed_strides[1] == 0) ? 0 : - (fixed_strides[1] == itemsize) ? 2*1 : 8; - code += (fixed_strides[2] == 0) ? 0 : - (fixed_strides[2] == itemsize) ? 1 : 8; - if (code >= 2 && code < 7) { - sum_of_products_fn ret = - _binary_specialization_table[type_num][code-2]; - if (ret != NULL) { - return ret; - } - } - } - - /* Inner loop with an output stride of 0 */ - if (fixed_strides[nop] == 0) { - return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* Check for all contiguous */ - for (iop = 0; iop < nop + 1; ++iop) { - if (fixed_strides[iop] != itemsize) { - break; - } - } - - /* Contiguous loop */ - if (iop == nop + 1) { - return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* None of the above specializations caught it, general loops */ - return _unspecialized_table[type_num][nop <= 3 ? 
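The stride encoding in get_sum_of_products_function above is compact enough to deserve a worked example. Each operand contributes 0 when its stride is zero, a weight of 4, 2, or 1 (operand 0, operand 1, output) when contiguous, and 8 otherwise, which pushes the code out of the accepted 2..6 window. For doubles (itemsize 8):

    fixed_strides = {0, 8, 8}   /* scalar * contiguous -> contiguous out */
    code = 0 + 2 + 1 = 3        /* _binary_specialization_table[...][3 - 2],
                                   i.e. stride0_contig_outcontig_two      */

The all-contiguous case {8, 8, 8} scores 4 + 2 + 1 = 7, deliberately outside the window, because it is picked up further down by the all-contiguous table instead.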
nop : 0]; -} diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index c6cf18ec6094..0a4e5bbd2718 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,12 +1,33 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "structmember.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #include +#include +#include +#include + +#include + +#include "simd/simd.h" +#include "convert.h" +#include "common.h" +#include "ctors.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#include + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "einsum.dispatch.h" +#endif typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); -NPY_VISIBILITY_HIDDEN sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides); +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, + (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif From 1990c13d5d45ab925555566c1f106d501e649ba3 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 25 Aug 2020 11:59:24 +0800 Subject: [PATCH 08/27] restructure headers --- .../core/src/multiarray/einsum.dispatch.c.src | 7 +++++++ numpy/core/src/multiarray/einsum_sumprod.h | 18 ------------------ 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 1c692518e3a8..4a95d59c507c 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -13,8 +13,15 @@ * NEON NEON_VFPV4 * VSX VSX2 */ + +#define _MULTIARRAYMODULE +#include +#include #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" + /**begin repeat * #name = byte, short, int, long, longlong, * ubyte, ushort, uint, ulong, ulonglong, diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 0a4e5bbd2718..5a863767e29d 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,25 +1,7 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include -#include -#include -#include - -#include - -#include "simd/simd.h" -#include "convert.h" -#include "common.h" -#include "ctors.h" #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) -#include #ifndef NPY_DISABLE_OPTIMIZATION #include "einsum.dispatch.h" From 7b756af0c709d2caa9d484f11f274c44c690399e Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 25 Aug 2020 14:59:01 +0800 Subject: [PATCH 09/27] replace begin/repeat blocks with for loops for readability --- .../core/src/multiarray/einsum.dispatch.c.src | 68 ++++++------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 4a95d59c507c..613c6026043b 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -262,23 +262,13 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4;
count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); - npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); + npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(a, b, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -367,22 +357,12 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(va_scalar, b, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -464,22 +444,12 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); - npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); - /**end repeat3**/ - /**begin repeat3 - * #i = 0, 1, 2, 3# - */ - npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); - /**end repeat3**/ + for (int i = 0; i < 4; i++) { + npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); + npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); + npyv_@sfx@ abc = npyv_muladd_@sfx@(a, vb_scalar, c); + npyv_@st@_@sfx@(data_out + vstep * i, abc); + } } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; From 4877e4062093e637f5b0715078fdeac19c211df5 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 26 Aug 2020 14:41:29 +0800 Subject: [PATCH 10/27] add ivdeps and handle header dependency --- benchmarks/benchmarks/bench_linalg.py | 2 +- .../core/src/multiarray/einsum.dispatch.c.src | 18 +++++++++++++----- numpy/core/src/multiarray/einsum_sumprod.h | 3 +++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 04889265b591..4ce14ac3ddf9 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -156,4 +156,4 @@ def time_einsum_contig_contig(self, dtype): # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): np.einsum("i->", self.one_dim_big, optimize=True) - 
np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file + np.einsum("i->", self.non_contigous_dim1, optimize=True) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index 613c6026043b..e7741bb6f509 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -1,12 +1,13 @@ /* - * This file contains the implementation of the 'einsum' function, - * which provides an einstein-summation operation. + * This file provides optimized sum of product implementations used internally + * by einsum. * * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) * The University of British Columbia * * See LICENSE.txt for the license. */ + /** * @targets $maxopt baseline * SSE2 (AVX2 FMA3) AVX512F @@ -14,13 +15,11 @@ * VSX VSX2 */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include #include #include "einsum_sumprod.h" #include "einsum_debug.h" -#include "simd/simd.h" -#include "common.h" /**begin repeat * #name = byte, short, int, long, longlong, @@ -262,6 +261,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); @@ -357,6 +359,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); @@ -444,6 +449,9 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { +#ifdef __GNUC__ +#pragma GCC ivdep +#endif for (int i = 0; i < 4; i++) { npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 5a863767e29d..5683b5b1851c 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,6 +1,9 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#include "simd/simd.h" +#include "common.h" + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) #ifndef NPY_DISABLE_OPTIMIZATION From 954e642a82d6138d22aead5d0ae608b8bd77cd48 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 27 Aug 2020 10:19:57 +0800 Subject: [PATCH 11/27] revert to faster simd code --- .../core/src/multiarray/einsum.dispatch.c.src | 81 ++++++++++++------- numpy/core/src/multiarray/einsum_sumprod.h | 8 +- 2 files changed, 56 insertions(+), 33 deletions(-) diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum.dispatch.c.src index e7741bb6f509..d73ec872b1fa 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum.dispatch.c.src @@ -20,6 +20,10 @@ #include #include "einsum_sumprod.h" #include "einsum_debug.h" +#include "simd/simd.h" +#include "common.h" + +#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) /**begin repeat * #name = byte, short, int, long, longlong, @@ -261,16 +265,23 @@ static void #if 
@unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); - npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(a, b, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -359,15 +370,22 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ b = npyv_@ld@_@sfx@(data1 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(va_scalar, b, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; @@ -449,15 +467,22 @@ static void #if @unroll_by@ == 4 const int vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { -#ifdef __GNUC__ -#pragma GCC ivdep -#endif - for (int i = 0; i < 4; i++) { - npyv_@sfx@ a = npyv_@ld@_@sfx@(data0 + vstep * i); - npyv_@sfx@ c = npyv_@ld@_@sfx@(data_out + vstep * i); - npyv_@sfx@ abc = npyv_muladd_@sfx@(a, vb_scalar, c); - npyv_@st@_@sfx@(data_out + vstep * i, abc); - } + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@); + /**end repeat3**/ + /**begin repeat3 + * #i = 0, 1, 2, 3# + */ + npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@); + /**end repeat3**/ } #elif @unroll_by@ == 2 const int vstepx2 = vstep * 2; diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 5683b5b1851c..59c374434613 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,10 +1,8 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#include "simd/simd.h" -#include "common.h" - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#include "npy_cpu_dispatch.h" +#include #ifndef NPY_DISABLE_OPTIMIZATION #include "einsum.dispatch.h" @@ -12,7 +10,7 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, 
npy_intp); -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT sum_of_products_fn einsum_get_sum_of_products_function, +NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN sum_of_products_fn einsum_get_sum_of_products_function, (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) #endif From 50c6b7ee1468be16d18e022546fb90d1ba9835ea Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 28 Aug 2020 11:39:25 +0800 Subject: [PATCH 12/27] changed to baseline solution --- numpy/core/setup.py | 2 +- numpy/core/src/multiarray/einsum.c.src | 10 ---------- .../{einsum.dispatch.c.src => einsum_sumprod.c.src} | 12 +++--------- numpy/core/src/multiarray/einsum_sumprod.h | 11 +++-------- 4 files changed, 7 insertions(+), 28 deletions(-) rename numpy/core/src/multiarray/{einsum.dispatch.c.src => einsum_sumprod.c.src} (99%) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 43ba1e22e661..a4a84397d9ee 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -855,7 +855,7 @@ def get_mathlib_info(*args): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), - join('src', 'multiarray', 'einsum.dispatch.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 01ca3111eb1e..cfbee0fe9a18 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,16 +27,6 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" -static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides) -{ - #ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" - #endif - NPY_CPU_DISPATCH_CALL(return einsum_get_sum_of_products_function, - (nop, type_num, itemsize, fixed_strides)) -} - /* * Parses the subscripts for one operand into an output of 'ndim' * labels. The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum.dispatch.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src similarity index 99% rename from numpy/core/src/multiarray/einsum.dispatch.c.src rename to numpy/core/src/multiarray/einsum_sumprod.c.src index d73ec872b1fa..8fff213bec2a 100644 --- a/numpy/core/src/multiarray/einsum.dispatch.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -8,13 +8,6 @@ * See LICENSE.txt for the license. 
*/ -/** - * @targets $maxopt baseline - * SSE2 (AVX2 FMA3) AVX512F - * NEON NEON_VFPV4 - * VSX VSX2 - */ - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include @@ -1481,8 +1474,9 @@ static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { /**end repeat**/ }; /* End of _unnspecialized_table */ -NPY_NO_EXPORT sum_of_products_fn NPY_CPU_DISPATCH_CURFX(einsum_get_sum_of_products_function) -(int nop, int type_num, npy_intp itemsize,npy_intp const *fixed_strides) +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides) { int iop; diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index 59c374434613..a9fdc733d182 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,16 +1,11 @@ #ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H #define _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#include "npy_cpu_dispatch.h" #include -#ifndef NPY_DISABLE_OPTIMIZATION - #include "einsum.dispatch.h" -#endif - typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); -NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN sum_of_products_fn einsum_get_sum_of_products_function, - (int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides)) - +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides); #endif From 23e28c0112fa5d6d50d1f55020cc630a8d144129 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 31 Aug 2020 10:45:32 +0800 Subject: [PATCH 13/27] remove redundant typedef --- numpy/core/src/multiarray/einsum_sumprod.c.src | 2 -- 1 file changed, 2 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 8fff213bec2a..67b40e0fb178 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -1294,8 +1294,6 @@ bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, /**end repeat**/ -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - /* These tables need to match up with the type enum */ static sum_of_products_fn _contig_outstride0_unary_specialization_table[NPY_NTYPES] = { From d298c8e63987f7d292380a34936d8374cf8c8f2b Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 10 Sep 2020 21:06:03 +0800 Subject: [PATCH 14/27] remove redundant intrinsics --- numpy/core/src/common/simd/avx2/arithmetic.h | 7 ------- numpy/core/src/common/simd/avx512/arithmetic.h | 3 --- numpy/core/src/common/simd/neon/arithmetic.h | 6 ------ numpy/core/src/common/simd/sse/arithmetic.h | 8 +------- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 21a1fe8818bc..f00d8e153fe4 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -62,13 +62,6 @@ #define npyv_mul_f32 _mm256_mul_ps #define npyv_mul_f64 _mm256_mul_pd -#ifdef NPY_HAVE_FMA3 - #define npyv_muladd_f32 _mm256_fmadd_ps - #define npyv_muladd_f64 _mm256_fmadd_pd -#else - #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) - #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) -#endif // saturated // TODO: after implment Packs intrins diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h 
b/numpy/core/src/common/simd/avx512/arithmetic.h index 68752bf86146..a783e98ae94e 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -103,9 +103,6 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_mul_f32 _mm512_mul_ps #define npyv_mul_f64 _mm512_mul_pd -#define npyv_muladd_f32 _mm512_fmadd_ps -#define npyv_muladd_f64 _mm512_fmadd_pd - // saturated // TODO: after implment Packs intrins diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 7234928de964..ff31311d5dcf 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -60,12 +60,6 @@ #define npyv_mul_f32 vmulq_f32 #define npyv_mul_f64 vmulq_f64 -#ifdef NPY_HAVE_NEON_VFPV4 - #define npyv_muladd_f32(A, B, C) vfmaq_f32(C, A, B) -#else - #define npyv_muladd_f32(A, B, C) vmlaq_f32(C, A, B) -#endif -#define npyv_muladd_f64(A, B, C) vfmaq_f64(C, A, B) /*************************** * Division ***************************/ diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 32d2ec560807..e1e158ff41df 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -82,13 +82,6 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) #define npyv_mul_f32 _mm_mul_ps #define npyv_mul_f64 _mm_mul_pd -#ifdef NPY_HAVE_FMA3 - #define npyv_muladd_f32 _mm_fmadd_ps - #define npyv_muladd_f64 _mm_fmadd_pd -#else - #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c) - #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c) -#endif // saturated // TODO: after implment Packs intrins @@ -113,6 +106,7 @@ NPY_FINLINE float npyv_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } + NPY_FINLINE double npyv_sum_f64(__m128d a) { #ifdef NPY_HAVE_SSE3 From 6dac52e03bb4859ad0fb86345c93df0d8635e739 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 11 Sep 2020 09:24:16 +0800 Subject: [PATCH 15/27] add blank lines --- numpy/core/src/common/simd/vsx/arithmetic.h | 3 --- numpy/core/src/multiarray/einsum.c.src | 1 + numpy/core/src/multiarray/einsum_sumprod.h | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 70e3c05d42ab..5454b2eef2fc 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -94,9 +94,6 @@ #define npyv_mul_f32 vec_mul #define npyv_mul_f64 vec_mul -#define npyv_muladd_f32 vec_madd -#define npyv_muladd_f64 vec_madd - /*************************** * Division ***************************/ diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index cfbee0fe9a18..6ad375f670a5 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -27,6 +27,7 @@ #include "einsum_sumprod.h" #include "einsum_debug.h" + /* * Parses the subscripts for one operand into an output of 'ndim' * labels. 
The resulting 'op_labels' array will have: diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index a9fdc733d182..c6cf18ec6094 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -8,4 +8,5 @@ typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); NPY_VISIBILITY_HIDDEN sum_of_products_fn get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides); + #endif From 985e5b26492df2df347bac29a0c2859339421e2c Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 14 Sep 2020 10:25:15 +0800 Subject: [PATCH 16/27] add format --- numpy/core/src/multiarray/einsum_sumprod.c.src | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 67b40e0fb178..4d1d3f5f63ab 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -7,7 +7,7 @@ * * See LICENSE.txt for the license. */ - + #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include @@ -293,7 +293,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -395,7 +395,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -492,7 +492,7 @@ static void #endif } /**end repeat2**/ -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -583,7 +583,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -662,7 +662,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -735,7 +735,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four scalars in case of: @@ -911,7 +911,7 @@ static void } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); -npyv_cleanup(); + npyv_cleanup(); #endif // NPYV check for @type@ /** * Unroll by four/eight scalars in case of: From 88c27475c2d60c9246945bf7ebf7fc61cd594a5f Mon Sep 17 00:00:00 2001 From: Chunlin Date: Mon, 14 Sep 2020 19:29:59 +0800 Subject: [PATCH 17/27] Update numpy/core/src/common/simd/avx512/arithmetic.h Co-authored-by: Eric Wieser --- .../core/src/common/simd/avx512/arithmetic.h | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index a783e98ae94e..39d93be257d3 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -128,36 +128,33 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) * The third one is almost the same as the second one but only works for * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC. 
***************************/ -NPY_FINLINE float npyv_sum_f32(npyv_f32 a) -{ #ifdef NPY_HAVE_AVX512F_REDUCE - return _mm512_reduce_add_ps(a); + #define npyv_sum_f32 _mm512_reduce_add_ps + #define npyv_sum_f64 _mm512_reduce_add_pd #else - __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 sum32 = _mm512_add_ps(a, h64); - __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); - __m512 sum16 = _mm512_add_ps(sum32, h32); - __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); - __m512 sum8 = _mm512_add_ps(sum16, h16); - __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); - __m512 sum4 = _mm512_add_ps(sum8, h4); - return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); + NPY_FINLINE float npyv_sum_f32(npyv_f32 a) + { + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); + } + NPY_FINLINE double npyv_sum_f64(npyv_f64 a) + { + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); + } #endif -} -NPY_FINLINE double npyv_sum_f64(npyv_f64 a) -{ -#ifdef NPY_HAVE_AVX512F_REDUCE - return _mm512_reduce_add_pd(a); -#else - __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); - __m512d sum32 = _mm512_add_pd(a, h64); - __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); - __m512d sum16 = _mm512_add_pd(sum32, h32); - __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); - __m512d sum8 = _mm512_add_pd(sum16, h16); - return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); -#endif -} /*************************** * FUSED From 54943e09f864417ce56c14268dfe00576a144929 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 15 Sep 2020 09:36:31 +0800 Subject: [PATCH 18/27] modify the int to npy_intp --- .../core/src/multiarray/einsum_sumprod.c.src | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 4d1d3f5f63ab..c5e3dc70e0a5 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -256,7 +256,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -277,7 +277,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -361,7 +361,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const 
npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -381,7 +381,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) { npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); @@ -458,7 +458,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -478,7 +478,7 @@ static void /**end repeat3**/ } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -556,7 +556,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -570,7 +570,7 @@ static void vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -638,7 +638,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -651,7 +651,7 @@ static void vaccum = npyv_add_@sfx@(b0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) { npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1); npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep); @@ -711,7 +711,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -724,7 +724,7 @@ static void vaccum = npyv_add_@sfx@(a0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); @@ -887,7 +887,7 @@ static void */ @cond@ { #if @unroll_by@ == 4 - const int vstepx4 = vstep * 4; + const npy_intp vstepx4 = vstep * 4; for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) { /**begin repeat3 * #i = 0, 1, 2, 3# @@ -900,7 +900,7 @@ static void vaccum = npyv_add_@sfx@(a0123, vaccum); } #elif @unroll_by@ == 2 - const int vstepx2 = vstep * 2; + const npy_intp vstepx2 = vstep * 2; for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) { npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0); npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep); From e993af2dca9b658cb08aa0111bc031a97dbe6430 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 11:27:02 +0800 Subject: [PATCH 19/27] split benchmark and define common macro --- benchmarks/benchmarks/bench_linalg.py | 35 ++++- 
numpy/core/src/multiarray/common.h | 4 + .../core/src/multiarray/einsum_sumprod.c.src | 129 +++++------------- 3 files changed, 69 insertions(+), 99 deletions(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 4ce14ac3ddf9..602a0cb6bb98 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -126,34 +126,55 @@ def setup(self, dtype): # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two def time_einsum_outer(self, dtype): np.einsum("i,j", self.one_dim, self.one_dim, optimize=True) - np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) # multiply(a, b):trigger sum_of_products_contig_two def time_einsum_multiply(self, dtype): np.einsum("..., ...", self.two_dim_small, self.three_dim , optimize=True) - np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two def time_einsum_sum_mul(self, dtype): np.einsum(",i...->", 300, self.three_dim_small, optimize=True) - np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two def time_einsum_sum_mul2(self, dtype): np.einsum("i...,->", self.three_dim_small, 300, optimize=True) - np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two def time_einsum_mul(self, dtype): np.einsum("i,->i", self.one_dim_big, 300, optimize=True) - np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) # trigger contig_contig_outstride0_two def time_einsum_contig_contig(self, dtype): np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True) - np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) # trigger sum_of_products_contig_outstride0_one def time_einsum_contig_outstride0(self, dtype): np.einsum("i->", self.one_dim_big, optimize=True) - np.einsum("i->", self.non_contigous_dim1, optimize=True) + + # outer(a,b): non_contigous arrays + def time_einsum_noncon_outer(self, dtype): + np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True) + + # multiply(a, b):non_contigous arrays + def time_einsum_noncon_multiply(self, dtype): + np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True) + + # sum and multiply:non_contigous arrays + def time_einsum_noncon_sum_mul(self, dtype): + np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True) + + # sum and multiply:non_contigous arrays + def time_einsum_noncon_sum_mul2(self, dtype): + np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True) + + # scalar mul: non_contigous arrays + def time_einsum_noncon_mul(self, dtype): + np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True) + + # contig_contig_outstride0_two: non_contigous arrays + def time_einsum_noncon_contig_contig(self, dtype): + np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True) + + # sum_of_products_contig_outstride0_one:non_contigous arrays + def time_einsum_noncon_contig_outstride0(self, dtype): + np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index ef9bc79da325..b36cbcae01a6 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,7 +205,11 @@ 
npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ +#ifdef NPY_HAVE_NEON + return 0; +#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; +#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index c5e3dc70e0a5..b69a2c15d906 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -17,6 +17,14 @@ #include "common.h" #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +/** + * Unroll by four/eight scalars in case of: + * - The SIMD width is higher than 128bit since we unroll by x2/x4 + * and that may lead to performance loss on small arrays. + * - To give the chance to the compiler to + * auto-vectorize in case of NPYV wasn't available. + */ +#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat * #name = byte, short, int, long, longlong, @@ -240,13 +248,8 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) && - EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif + EINSUM_IS_ALIGNED(data_out); const int vstep = npyv_nlanes_@sfx@; /**begin repeat2 @@ -290,19 +293,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -345,12 +344,7 @@ static void #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar); @@ -392,19 +386,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data1 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -442,12 +432,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar); @@ -489,19 +474,15 @@ static void npyv_@st@_@sfx@(data_out, abc0); npyv_@st@_@sfx@(data_out + vstep, abc1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data_out += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -540,12 +521,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -579,20 +555,16 @@ static void npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum); vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /**begin repeat2 * #i = 0, 1, 2, 3# @@ -622,12 +594,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data1); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -658,20 +625,16 @@ static void npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1); vaccum = npyv_add_@sfx@(b01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data1 += 4) { const @type@ b01 = @from@(data1[0]) + @from@(data1[1]); const @type@ b23 = @from@(data1[2]) + @from@(data1[3]); @@ -695,12 +658,7 @@ static void (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -731,20 +689,16 @@ static void npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); vaccum = npyv_add_@sfx@(a01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. - */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) for (; count >= 4; count -= 4, data0 += 4) { const @type@ a01 = @from@(data0[0]) + @from@(data0[1]); const @type@ a23 = @from@(data0[2]) + @from@(data0[3]); @@ -871,12 +825,7 @@ static void NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - #ifndef NPY_HAVE_NEON const int is_aligned = EINSUM_IS_ALIGNED(data0); - #else - // ARM/Neon don't have instructions for aligned memory access - const int is_aligned = 0; - #endif const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); @@ -907,20 +856,16 @@ static void npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); vaccum = npyv_add_@sfx@(a01, vaccum); } + #else + #error "Invalid unroll_by = @unroll_by@" #endif } /**end repeat2**/ accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #endif // NPYV check for @type@ -/** - * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the change to the compiler to - * auto-vectorize in case of NPYV wasn't available. 
- */ -#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128) + +#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) #if @complex@ for (; count > 4; count -= 4, data0 += 4*2) { const @temptype@ re01 = data0[0] + data0[2]; From 38f7382c3bc508b0ba96dd3b76e336b77f0dfca6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 11:59:15 +0800 Subject: [PATCH 20/27] avx2 test --- numpy/core/src/common/simd/simd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 2f39c8427b5d..4d20143534d4 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -14,7 +14,8 @@ #ifdef __cplusplus extern "C" { #endif - +#define NPY_HAVE_AVX2 +#include // lane type by intrin suffix typedef npy_uint8 npyv_lanetype_u8; typedef npy_int8 npyv_lanetype_s8; From c6c1e303d92e4769a811ea09d9eadc96ffef8119 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 18:05:34 +0800 Subject: [PATCH 21/27] explain for auto-vectorize part --- numpy/core/src/multiarray/common.h | 4 ---- numpy/core/src/multiarray/einsum_sumprod.c.src | 14 +++++++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b36cbcae01a6..ef9bc79da325 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,11 +205,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ -#ifdef NPY_HAVE_NEON - return 0; -#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; -#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index b69a2c15d906..072b4da2a5dc 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -16,11 +16,19 @@ #include "simd/simd.h" #include "common.h" -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 +#else + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#endif /** * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. + * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents + * the number of times the loop is unrolled, which may lead to + * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) + * if The SIMD width is higher than 128bit, The performance loss on remained + * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. * - To give the chance to the compiler to * auto-vectorize in case of NPYV wasn't available. 
*/ From f18ade4b08cf0db63d8897b86025ebaa516a2ed6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Sep 2020 18:12:38 +0800 Subject: [PATCH 22/27] add explanation --- numpy/core/src/multiarray/common.h | 4 ---- .../core/src/multiarray/einsum_sumprod.c.src | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b36cbcae01a6..ef9bc79da325 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -205,11 +205,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment) * This test is faster than a direct modulo. * Note alignment value of 0 is allowed and returns False. */ -#ifdef NPY_HAVE_NEON - return 0; -#else return ((npy_uintp)(p) & ((alignment) - 1)) == 0; -#endif } /* Get equivalent "uint" alignment given an itemsize, for use in copy code */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index b69a2c15d906..10c8f7a9fb10 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -16,6 +16,25 @@ #include "simd/simd.h" #include "common.h" +// ARM/Neon don't have instructions for aligned memory access +#ifdef NPY_HAVE_NEON + #define EINSUM_IS_ALIGNED(x) 0 +#else + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) +#endif + +/** + * Unroll by four/eight scalars in case of: + * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents + * the number of times the loop that unrolled, Eg: for float32, 2 simd loop + * is unrolled, for float64, 4 simd loop is unrolled, which may lead to + * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) + * if The SIMD width is higher than 128bit, The performance loss on remained + * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. + * - To give the chance to the compiler to + * auto-vectorize in case of NPYV wasn't available. + */ + #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) /** * Unroll by four/eight scalars in case of: From 33b7d2a129369539799e586162399c584b16a88f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sat, 19 Sep 2020 09:04:43 +0800 Subject: [PATCH 23/27] remove duplicated message --- numpy/core/src/multiarray/einsum_sumprod.c.src | 9 --------- 1 file changed, 9 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 10c8f7a9fb10..777b24c8b01a 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -34,15 +34,6 @@ * - To give the chance to the compiler to * auto-vectorize in case of NPYV wasn't available. */ - -#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH) -/** - * Unroll by four/eight scalars in case of: - * - The SIMD width is higher than 128bit since we unroll by x2/x4 - * and that may lead to performance loss on small arrays. - * - To give the chance to the compiler to - * auto-vectorize in case of NPYV wasn't available.
- */ #define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) /**begin repeat From 5a692ed2be497050fcad732cff682c49837eccb7 Mon Sep 17 00:00:00 2001 From: Chunlin Date: Tue, 29 Sep 2020 15:26:29 +0800 Subject: [PATCH 24/27] Update benchmarks/benchmarks/bench_linalg.py Co-authored-by: Eric Wieser --- benchmarks/benchmarks/bench_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 602a0cb6bb98..a72cccb5f7ed 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -177,4 +177,4 @@ def time_einsum_noncon_contig_contig(self, dtype): # sum_of_products_contig_outstride0_one:non_contigous arrays def time_einsum_noncon_contig_outstride0(self, dtype): - np.einsum("i->", self.non_contigous_dim1, optimize=True) \ No newline at end of file + np.einsum("i->", self.non_contigous_dim1, optimize=True) From 20d5cdae0c19345c3e6825394a686b48d8983acc Mon Sep 17 00:00:00 2001 From: Chunlin Date: Wed, 30 Sep 2020 10:31:00 +0800 Subject: [PATCH 25/27] Update numpy/core/src/multiarray/einsum_sumprod.c.src Co-authored-by: Eric Wieser --- .../core/src/multiarray/einsum_sumprod.c.src | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 777b24c8b01a..d347a69a1322 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -24,15 +24,17 @@ #endif /** - * Unroll by four/eight scalars in case of: - * - The main SIMD loop un-rolled by NPY_SIMD_WIDTH*(2|4), (2|4) represents - * the number of times the loop that unrolled, Eg: for float32, 2 simd loop - * is unrolled, for float64, 4 simd loop is unrolled, which may lead to - * un-vectorize the remained scalars in bytes range <=NPY_SIMD_WIDTH*(2|4) - * if The SIMD width is higher than 128bit, The performance loss on remained - * arrays is nonnegligible, so we choose to use the compiler auto-vectorize. - * - To give the chance to the compiler to - * auto-vectorize in case of NPYV wasn't available. + * This macro is used to enable a scalar loop which advances 4 elements at a + * time, which appears after a main SIMD loop gated by `CHK` that unrolls by + * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop + * that finishes up all the remaining scalars. The purpose of the unrolled loop + * is to enable auto-vectorization in cases when all of the following are true: + * + * - optimization is allowed + * - either: + * - we did not run the SIMD loop at all, due to NPV being disabled. + * - the SIMD loop was larger than 128bit, so there are likely to be many + * elements left to process. */ #define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128)) From 7ff73245031cfd1004082a657f83d4e9f3751b9d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 12 Oct 2020 09:47:14 +0800 Subject: [PATCH 26/27] fix typos --- numpy/core/src/common/npy_cpu_dispatch.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h index 274520852569..a0f82fa3da05 100644 --- a/numpy/core/src/common/npy_cpu_dispatch.h +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -17,7 +17,7 @@ * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispaٍtch__`. 
*/ /** - * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION', + * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION', * due the nature of command argument '--disable-optimization', * which is explicitly disabling the module ccompiler_opt. */ @@ -29,7 +29,7 @@ * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead, * since c99 supports bool variables which may lead to ambiguous errors. */ - // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token. + // backup 'bool' before including '_cpu_dispatch.h', since it may not defined as a compiler token. #define NPY__DISPATCH_DEFBOOL typedef bool npy__dispatch_bkbool; #endif @@ -134,10 +134,10 @@ * NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*)) * NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE]) * - * By assuming the provided config header drived from a dispatch-able source, + * By assuming the provided config header derived from a dispatch-able source, * that configured with "@targets baseline sse41 vsx3 asimdhp", * they supported by the compiler and enabled via '--cpu-dspatch', - * then the prototype declrations at the above example will equlivent to the follows: + * then the prototype declrations at the above example will equivalent to the follows: * * - x86: * void dispatch_me(const int*, int*); // baseline @@ -179,7 +179,7 @@ /** * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...) * - * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even + * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declaration even * if it was provided within the configration statments. */ #define NPY_CPU_DISPATCH_DECLARE_XB(...) \ @@ -206,7 +206,7 @@ * In order to call or to assign the pointer of it from outside the dispatch-able source, * you have to use this Macro as follows: * - * // bring the genreated config header of the dispatch-abel source + * // bring the generated config header of the dispatch-able source * #ifndef NPY_DISABLE_OPTIMIZATION * #include "dispatchable_source_name.dispatch.h" * #endif From 73f61c33775a394d72857298b932df5821acb454 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 13 Oct 2020 09:56:20 +0800 Subject: [PATCH 27/27] remove extra test --- numpy/core/src/common/simd/simd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 4d20143534d4..2f39c8427b5d 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -14,8 +14,7 @@ #ifdef __cplusplus extern "C" { #endif -#define NPY_HAVE_AVX2 -#include + // lane type by intrin suffix typedef npy_uint8 npyv_lanetype_u8; typedef npy_int8 npyv_lanetype_s8;
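
For reference, once the /**begin repeat ... end repeat**/ templates in einsum_sumprod.c.src are expanded, each kernel in this series becomes straight-line universal-intrinsic code. The sketch below approximates the float32 instance of the @unroll_by@ == 4 sum-of-products loop together with the scalar tail that EINSUM_UNROLL_4_SCALARS is aimed at. It is an illustration only, not the generated source: the function name sum_of_products_contig_two_sketch is invented, the guard is simplified to NPY_SIMD, and the unaligned npyv_load/store_f32 variants are used throughout, whereas the real kernels pick the aligned forms when EINSUM_IS_ALIGNED holds.

    #include <numpy/npy_common.h>
    #include "simd/simd.h"  /* npyv_* universal intrinsics used by this series */

    /* out[i] += a[i] * b[i] over `count` contiguous float32 elements */
    static void
    sum_of_products_contig_two_sketch(npy_float *data0, npy_float *data1,
                                      npy_float *data_out, npy_intp count)
    {
    #if NPY_SIMD
        const int vstep = npyv_nlanes_f32;   /* lanes per vector */
        const npy_intp vstepx4 = vstep * 4;  /* four vectors per iteration */
        /* main SIMD loop, unrolled by four vectors; loads, muladds and
         * stores are grouped as in the repeat3 expansion of PATCH 11 */
        for (; count >= vstepx4; count -= vstepx4,
               data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            npyv_f32 a0 = npyv_load_f32(data0);
            npyv_f32 a1 = npyv_load_f32(data0 + vstep);
            npyv_f32 a2 = npyv_load_f32(data0 + vstep*2);
            npyv_f32 a3 = npyv_load_f32(data0 + vstep*3);
            npyv_f32 b0 = npyv_load_f32(data1);
            npyv_f32 b1 = npyv_load_f32(data1 + vstep);
            npyv_f32 b2 = npyv_load_f32(data1 + vstep*2);
            npyv_f32 b3 = npyv_load_f32(data1 + vstep*3);
            npyv_f32 c0 = npyv_load_f32(data_out);
            npyv_f32 c1 = npyv_load_f32(data_out + vstep);
            npyv_f32 c2 = npyv_load_f32(data_out + vstep*2);
            npyv_f32 c3 = npyv_load_f32(data_out + vstep*3);
            /* npyv_muladd maps to a fused multiply-add (FMA3/VFPV4/VSX)
             * when the target has one, otherwise to a mul followed by add */
            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
            npyv_store_f32(data_out,           abc0);
            npyv_store_f32(data_out + vstep,   abc1);
            npyv_store_f32(data_out + vstep*2, abc2);
            npyv_store_f32(data_out + vstep*3, abc3);
        }
        npyv_cleanup();  /* no-op on most targets; clears AVX upper state where needed */
    #endif
        /* scalar tail; on targets wider than 128 bits this range can be
         * sizeable, which is what motivates the extra 4-scalar unrolling
         * gated behind EINSUM_UNROLL_4_SCALARS in the real kernels */
        for (; count > 0; --count, ++data0, ++data1, ++data_out) {
            *data_out += *data0 * *data1;
        }
    }

Grouping all the loads, then all the muladds, then all the stores, instead of keeping the rolled for (int i = 0; i < 4; i++) body under #pragma GCC ivdep, is the change PATCH 11 describes as a revert to faster SIMD code: the grouped form does not depend on the compiler proving the iterations independent.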