ENH: Improve the performance of einsum by using universal simd #17049
Closed
Changes from 1 commit
Commits (40, all by Qiyu8):
e26dcf7  new npyv intrinsics
47118fb  einsum dispatch and usimd process
ad0b3b4  update
55200fc  add float32 benchmark case
94cff77  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
4d7d94d  update
ae53e35  fix typos
2e713b0  add avx512 reduce sum comments
5e7cbd1  add non_contigous arrays, improve reduce the sum
80c0ed4  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
9060231  rebase after split for a better review
b0375dc  Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
1990c13  headers reconstruct
7b756af  use for loop replace begin repeat for readability
4877e40  add ivdeps and handle header dependency
168c6c9  Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
954e642  revert to faster simd code
50c6b7e  changed to baseline solution
23e28c0  remove redundant typedef
21f1c0b  update
a07455a  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
d298c8e  remove redundant intrinsics
6dac52e  add blank lines
985e5b2  add format
88c2747  Update numpy/core/src/common/simd/avx512/arithmetic.h
90026f9  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
54943e0  modify the int to npy_intp
e993af2  split benchmark and define common macro
38f7382  avx2 test
f351665  Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
c6c1e30  explain for auto-vectorize part
f18ade4  add explantion
33b7d2a  remove duplicated message
5a692ed  Update benchmarks/benchmarks/bench_linalg.py
20d5cda  Update numpy/core/src/multiarray/einsum_sumprod.c.src
83734bf  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
f8f7482  Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
1889738  Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
7ff7324  fix typos
73f61c3  remove extra test
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
split benchmark and define common macro
commit e993af2dca9b658cb08aa0111bc031a97dbe6430
numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -17,6 +17,14 @@
#include "common.h"

#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)

/**
 * Unroll by four/eight scalars in case of:
 * - The SIMD width is higher than 128bit since we unroll by x2/x4
 *   and that may lead to performance loss on small arrays.
 * - To give the chance to the compiler to
 *   auto-vectorize in case of NPYV wasn't available.
 */
#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))

/**begin repeat
 * #name = byte, short, int, long, longlong,
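To make the gate more concrete, here is a minimal, self-contained sketch (plain C, no NumPy templating) of the scalar fallback that EINSUM_UNROLL_4_SCALARS is meant to enable when NPYV is unavailable or when the SIMD width exceeds 128 bits; the function name sum_unroll4 is illustrative only and does not appear in the PR.

#include <stddef.h>

/* Hypothetical illustration of the unroll-by-four-scalars fallback.
 * In the PR this loop body is generated per dtype by the .c.src template
 * and is compiled only when EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@) holds. */
static float
sum_unroll4(const float *data, size_t count)
{
    float accum = 0.0f;
    /* Pairwise sums keep the dependency chain short and give the
     * compiler a clean pattern to auto-vectorize. */
    for (; count >= 4; count -= 4, data += 4) {
        const float a01 = data[0] + data[1];
        const float a23 = data[2] + data[3];
        accum += a01 + a23;
    }
    /* Scalar tail for the remaining 0-3 elements. */
    for (; count > 0; --count, ++data) {
        accum += *data;
    }
    return accum;
}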
@@ -240,13 +248,8 @@
                        (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                           EINSUM_IS_ALIGNED(data_out);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;

    /**begin repeat2
@@ -290,19 +293,15 @@
            npyv_@st@_@sfx@(data_out, abc0);
            npyv_@st@_@sfx@(data_out + vstep, abc1);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        /**begin repeat2
         * #i = 0, 1, 2, 3#
@@ -345,12 +344,7 @@
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);

@@ -392,19 +386,15 @@
            npyv_@st@_@sfx@(data_out, abc0);
            npyv_@st@_@sfx@(data_out + vstep, abc1);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
        /**begin repeat2
         * #i = 0, 1, 2, 3#
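Written out for the float32 instantiation (@sfx@ -> f32, @st@ -> storeu), the stride0 path above amounts to broadcasting the scalar operand once and then using fused multiply-add into the output. The following is a hedged reconstruction under stated assumptions (unaligned loads only, no unroll-by-two, illustrative function name, NPY_SIMD != 0), not the PR's exact template output.

#include "numpy/npy_common.h"  /* npy_intp */
#include "simd/simd.h"         /* NumPy universal intrinsics (npyv_*) */

/* Sketch: data_out[i] += a_scalar * data1[i], float32. */
static void
scalar_times_contig_f32(float a_scalar, const float *data1,
                        float *data_out, npy_intp count)
{
    const int vstep = npyv_nlanes_f32;
    const npyv_f32 va_scalar = npyv_setall_f32(a_scalar);  /* broadcast once */
    for (; count >= vstep; count -= vstep, data1 += vstep, data_out += vstep) {
        npyv_f32 b   = npyv_loadu_f32(data1);
        npyv_f32 abc = npyv_muladd_f32(va_scalar, b, npyv_loadu_f32(data_out));
        npyv_storeu_f32(data_out, abc);
    }
    npyv_cleanup();
    /* scalar tail */
    for (; count > 0; --count, ++data1, ++data_out) {
        *data_out += a_scalar * *data1;
    }
}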
@@ -442,12 +432,7 @@
                        (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);

@@ -489,19 +474,15 @@
            npyv_@st@_@sfx@(data_out, abc0);
            npyv_@st@_@sfx@(data_out + vstep, abc1);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
        /**begin repeat2
         * #i = 0, 1, 2, 3#
@@ -540,12 +521,7 @@
                        (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -579,20 +555,16 @@
            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
            vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    accum = npyv_sum_@sfx@(vaccum);
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        /**begin repeat2
         * #i = 0, 1, 2, 3#
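The accumulation above is the dot-product style kernel: two vectors per iteration feed a running vector accumulator via npyv_muladd, and a single horizontal npyv_sum runs once after the loop. The sketch below is a float32 reconstruction under stated assumptions (unaligned loads, no alignment specialization, illustrative function name, NPY_SIMD != 0 and npyv_sum_f32 available), not the exact template expansion.

#include "numpy/npy_common.h"  /* npy_intp */
#include "simd/simd.h"         /* NumPy universal intrinsics (npyv_*) */

/* Sketch: accum = sum(data0[i] * data1[i]), float32. */
static float
dot_npyv_f32(const float *data0, const float *data1, npy_intp count)
{
    const int vstep = npyv_nlanes_f32;
    npyv_f32 vaccum = npyv_zero_f32();
    /* Two vectors per iteration, mirroring the unroll_by = 2 path above. */
    for (; count >= vstep * 2;
           count -= vstep * 2, data0 += vstep * 2, data1 += vstep * 2) {
        npyv_f32 a0  = npyv_loadu_f32(data0);
        npyv_f32 b0  = npyv_loadu_f32(data1);
        npyv_f32 a1  = npyv_loadu_f32(data0 + vstep);
        npyv_f32 b1  = npyv_loadu_f32(data1 + vstep);
        npyv_f32 ab1 = npyv_muladd_f32(a1, b1, vaccum);
        vaccum = npyv_muladd_f32(a0, b0, ab1);
    }
    float accum = npyv_sum_f32(vaccum);  /* one horizontal reduction at the end */
    npyv_cleanup();
    /* scalar tail */
    for (; count > 0; --count, ++data0, ++data1) {
        accum += *data0 * *data1;
    }
    return accum;
}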
@@ -622,12 +594,7 @@
                        (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data1);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -658,20 +625,16 @@
            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
            vaccum = npyv_add_@sfx@(b01, vaccum);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    accum = npyv_sum_@sfx@(vaccum);
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data1 += 4) {
        const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
        const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
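The same shape repeats for the plain reduction (summing one contiguous operand): pairwise npyv_add into a vector accumulator, one npyv_sum at the end. A float32 sketch under the same assumptions as the previous block (unaligned loads, illustrative name):

#include "numpy/npy_common.h"
#include "simd/simd.h"

/* Sketch: accum = sum(data1[i]), float32. */
static float
sum_npyv_f32(const float *data1, npy_intp count)
{
    const int vstep = npyv_nlanes_f32;
    npyv_f32 vaccum = npyv_zero_f32();
    for (; count >= vstep * 2; count -= vstep * 2, data1 += vstep * 2) {
        npyv_f32 b0  = npyv_loadu_f32(data1);
        npyv_f32 b1  = npyv_loadu_f32(data1 + vstep);
        npyv_f32 b01 = npyv_add_f32(b0, b1);   /* pairwise add first */
        vaccum = npyv_add_f32(b01, vaccum);
    }
    float accum = npyv_sum_f32(vaccum);
    npyv_cleanup();
    for (; count > 0; --count, ++data1) {
        accum += *data1;
    }
    return accum;
}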
@@ -695,12 +658,7 @@
                        (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data0);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -731,20 +689,16 @@
            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
            vaccum = npyv_add_@sfx@(a01, vaccum);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    accum = npyv_sum_@sfx@(vaccum);
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
    for (; count >= 4; count -= 4, data0 += 4) {
        const @type@ a01 = @from@(data0[0]) + @from@(data0[1]);
        const @type@ a23 = @from@(data0[2]) + @from@(data0[3]);
@@ -871,12 +825,7 @@
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
    const int is_aligned = EINSUM_IS_ALIGNED(data0);
#else
    // ARM/Neon don't have instructions for aligned memory access
    const int is_aligned = 0;
#endif
    const int vstep = npyv_nlanes_@sfx@;
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -907,20 +856,16 @@
            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
            vaccum = npyv_add_@sfx@(a01, vaccum);
        }
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
    }
    /**end repeat2**/
    accum = npyv_sum_@sfx@(vaccum);
    npyv_cleanup();
#endif // NPYV check for @type@
    /**
     * Unroll by four/eight scalars in case of:
     * - The SIMD width is higher than 128bit since we unroll by x2/x4
     *   and that may lead to performance loss on small arrays.
     * - To give the change to the compiler to
     *   auto-vectorize in case of NPYV wasn't available.
     */
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
#if @complex@
    for (; count > 4; count -= 4, data0 += 4*2) {
        const @temptype@ re01 = data0[0] + data0[2];
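For the @complex@ branch the input is interleaved (re, im) pairs, so the scalar unroll folds four complex elements at a time, pairing real parts with real parts and imaginary parts with imaginary parts. A hypothetical standalone sketch (single precision, illustrative function name), not the PR's template code:

#include <stddef.h>

/* Sketch of the complex fallback: data0 holds interleaved re/im pairs. */
static void
sum_complex_unroll4(const float *data0, size_t count,
                    float *re_out, float *im_out)
{
    float re_accum = 0.0f, im_accum = 0.0f;
    /* Four complex elements (eight floats) per iteration. */
    for (; count >= 4; count -= 4, data0 += 4*2) {
        const float re01 = data0[0] + data0[2];
        const float re23 = data0[4] + data0[6];
        const float im01 = data0[1] + data0[3];
        const float im23 = data0[5] + data0[7];
        re_accum += re01 + re23;
        im_accum += im01 + im23;
    }
    /* Tail: remaining 0-3 complex elements. */
    for (; count > 0; --count, data0 += 2) {
        re_accum += data0[0];
        im_accum += data0[1];
    }
    *re_out = re_accum;
    *im_out = im_accum;
}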