Closed
Changes from 1 commit
40 commits
e26dcf7
new npyv intrinsics
Qiyu8 Aug 11, 2020
47118fb
einsum dispatch and usimd process
Qiyu8 Aug 11, 2020
ad0b3b4
update
Qiyu8 Aug 11, 2020
55200fc
add float32 benchmark case
Qiyu8 Aug 11, 2020
94cff77
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 12, 2020
4d7d94d
update
Qiyu8 Aug 12, 2020
ae53e35
fix typos
Qiyu8 Aug 12, 2020
2e713b0
add avx512 reduce sum comments
Qiyu8 Aug 13, 2020
5e7cbd1
add non_contigous arrays ,improve reduce the sum
Qiyu8 Aug 20, 2020
80c0ed4
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 24, 2020
9060231
rebase after split for a better review
Qiyu8 Aug 24, 2020
b0375dc
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 25, 2020
1990c13
headers reconstruct
Qiyu8 Aug 25, 2020
7b756af
use for loop replace begin repeat for readability
Qiyu8 Aug 25, 2020
4877e40
add ivdeps and handle header dependency
Qiyu8 Aug 26, 2020
168c6c9
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 26, 2020
954e642
revert to faster simd code
Qiyu8 Aug 27, 2020
50c6b7e
changed to baseline solution
Qiyu8 Aug 28, 2020
23e28c0
remove redundant typedef
Qiyu8 Aug 31, 2020
21f1c0b
update
Qiyu8 Sep 1, 2020
a07455a
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 10, 2020
d298c8e
remove redundant intrinsics
Qiyu8 Sep 10, 2020
6dac52e
add blank lines
Qiyu8 Sep 11, 2020
985e5b2
add format
Qiyu8 Sep 14, 2020
88c2747
Update numpy/core/src/common/simd/avx512/arithmetic.h
Qiyu8 Sep 14, 2020
90026f9
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 15, 2020
54943e0
modify the int to npy_intp
Qiyu8 Sep 15, 2020
e993af2
split benchmark and define common macro
Qiyu8 Sep 18, 2020
38f7382
avx2 test
Qiyu8 Sep 18, 2020
f351665
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Sep 18, 2020
c6c1e30
explain for auto-vectorize part
Qiyu8 Sep 18, 2020
f18ade4
add explantion
Qiyu8 Sep 18, 2020
33b7d2a
remove duplicated message
Qiyu8 Sep 19, 2020
5a692ed
Update benchmarks/benchmarks/bench_linalg.py
Qiyu8 Sep 29, 2020
20d5cda
Update numpy/core/src/multiarray/einsum_sumprod.c.src
Qiyu8 Sep 30, 2020
83734bf
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
f8f7482
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
1889738
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 12, 2020
7ff7324
fix typos
Qiyu8 Oct 12, 2020
73f61c3
remove extra test
Qiyu8 Oct 13, 2020
split benchmark and define common macro
Qiyu8 committed Sep 18, 2020
commit e993af2dca9b658cb08aa0111bc031a97dbe6430
35 changes: 28 additions & 7 deletions benchmarks/benchmarks/bench_linalg.py
@@ -126,34 +126,55 @@ def setup(self, dtype):
# outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
def time_einsum_outer(self, dtype):
np.einsum("i,j", self.one_dim, self.one_dim, optimize=True)
np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True)

# multiply(a, b):trigger sum_of_products_contig_two
def time_einsum_multiply(self, dtype):
np.einsum("..., ...", self.two_dim_small, self.three_dim , optimize=True)
np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True)

# sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two
def time_einsum_sum_mul(self, dtype):
np.einsum(",i...->", 300, self.three_dim_small, optimize=True)
np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True)

# sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two
def time_einsum_sum_mul2(self, dtype):
np.einsum("i...,->", self.three_dim_small, 300, optimize=True)
np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True)

# scalar mul: trigger sum_of_products_stride0_contig_outcontig_two
def time_einsum_mul(self, dtype):
np.einsum("i,->i", self.one_dim_big, 300, optimize=True)
np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True)

# trigger contig_contig_outstride0_two
def time_einsum_contig_contig(self, dtype):
np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True)
np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True)

# trigger sum_of_products_contig_outstride0_one
def time_einsum_contig_outstride0(self, dtype):
np.einsum("i->", self.one_dim_big, optimize=True)
np.einsum("i->", self.non_contigous_dim1, optimize=True)

# outer(a,b): non_contigous arrays
def time_einsum_noncon_outer(self, dtype):
np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True)

# multiply(a, b):non_contigous arrays
def time_einsum_noncon_multiply(self, dtype):
np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3 , optimize=True)

# sum and multiply:non_contigous arrays
def time_einsum_noncon_sum_mul(self, dtype):
np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True)

# sum and multiply:non_contigous arrays
def time_einsum_noncon_sum_mul2(self, dtype):
np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True)

# scalar mul: non_contigous arrays
def time_einsum_noncon_mul(self, dtype):
np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True)

# contig_contig_outstride0_two: non_contigous arrays
def time_einsum_noncon_contig_contig(self, dtype):
np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True)

# sum_of_products_contig_outstride0_one:non_contigous arrays
def time_einsum_noncon_contig_outstride0(self, dtype):
np.einsum("i->", self.non_contigous_dim1, optimize=True)
4 changes: 4 additions & 0 deletions numpy/core/src/multiarray/common.h
@@ -205,7 +205,11 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
* This test is faster than a direct modulo.
* Note alignment value of 0 is allowed and returns False.
*/
#ifdef NPY_HAVE_NEON
return 0;
#else
return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
#endif
}

/* Get equivalent "uint" alignment given an itemsize, for use in copy code */
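A note on the common.h hunk above: npy_is_aligned tests alignment with a bitmask instead of a modulo, and the new NPY_HAVE_NEON branch simply reports every pointer as unaligned so that callers such as EINSUM_IS_ALIGNED take the unaligned load/store path (ARM/NEON has no dedicated aligned-access instructions to benefit from). A minimal standalone sketch of that test, assuming a power-of-two alignment and using a hypothetical main() purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the power-of-two alignment test: "p % alignment == 0" rewritten
 * as a bitmask, which avoids the division. An alignment of 0 turns the mask
 * into ~0, so only a NULL pointer passes, matching the "returns False" note. */
static int is_aligned(const void *p, uintptr_t alignment)
{
#ifdef NPY_HAVE_NEON   /* stands in for the real config macro; not defined here */
    return 0;          /* NEON: always take the unaligned load/store path */
#else
    return ((uintptr_t)p & (alignment - 1)) == 0;
#endif
}

int main(void)
{
    double buf[4];
    /* buf and buf + 1 are 8 bytes apart, so at most one of them is 16-byte aligned */
    printf("%d %d\n", is_aligned(buf, 16), is_aligned(buf + 1, 16));
    return 0;
}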
129 changes: 37 additions & 92 deletions numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -17,6 +17,14 @@
#include "common.h"

#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
/**
* Unroll by four/eight scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
@eric-wieser eric-wieser Sep 18, 2020


I think what I'm missing here is the connection between 2, 4, and 128.

* and that may lead to performance loss on small arrays.
* - To give the chance to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
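To make the "2, 4, and 128" connection explicit: NPY_SIMD is the width of the enabled SIMD extension in bits, so a 128-bit register holds four float32 lanes, and the vector loops below consume two or four registers per iteration. When NPYV is unavailable, or when the vector width is above 128 bits (where the x2/x4 vector unroll may lose performance on small arrays), EINSUM_UNROLL_4_SCALARS enables an unroll-by-four scalar loop instead and leaves its vectorization to the compiler. A minimal sketch of that fallback shape for the contiguous two-operand kernel, in plain C with illustrative names (the real code is generated from the @type@/@from@/@to@ template below):

#include <stddef.h>

/* Unroll-by-4 scalar fallback gated by EINSUM_UNROLL_4_SCALARS: four
 * independent products per iteration give the compiler room to
 * auto-vectorize, and a remainder loop handles the last count % 4 elements. */
static void
sum_of_products_contig_two_sketch(const float *data0, const float *data1,
                                  float *data_out, ptrdiff_t count)
{
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        data_out[0] += data0[0] * data1[0];
        data_out[1] += data0[1] * data1[1];
        data_out[2] += data0[2] * data1[2];
        data_out[3] += data0[3] * data1[3];
    }
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        *data_out += *data0 * *data1;
    }
}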

/**begin repeat
* #name = byte, short, int, long, longlong,
@@ -240,13 +248,8 @@ static void
(int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
EINSUM_IS_ALIGNED(data_out);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
EINSUM_IS_ALIGNED(data_out);
const int vstep = npyv_nlanes_@sfx@;

/**begin repeat2
@@ -290,19 +293,15 @@ static void
npyv_@st@_@sfx@(data_out, abc0);
npyv_@st@_@sfx@(data_out + vstep, abc1);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
/**begin repeat2
* #i = 0, 1, 2, 3#
@@ -345,12 +344,7 @@ static void

#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);

@@ -392,19 +386,15 @@ static void
npyv_@st@_@sfx@(data_out, abc0);
npyv_@st@_@sfx@(data_out + vstep, abc1);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
/**begin repeat2
* #i = 0, 1, 2, 3#
@@ -442,12 +432,7 @@ static void
(int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);

@@ -489,19 +474,15 @@ static void
npyv_@st@_@sfx@(data_out, abc0);
npyv_@st@_@sfx@(data_out + vstep, abc1);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
/**begin repeat2
* #i = 0, 1, 2, 3#
@@ -540,12 +521,7 @@ static void
(int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -579,20 +555,16 @@ static void
npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
accum = npyv_sum_@sfx@(vaccum);
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
/**begin repeat2
* #i = 0, 1, 2, 3#
@@ -622,12 +594,7 @@ static void
(int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data1);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -658,20 +625,16 @@ static void
npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
vaccum = npyv_add_@sfx@(b01, vaccum);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
accum = npyv_sum_@sfx@(vaccum);
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data1 += 4) {
const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
@@ -695,12 +658,7 @@ static void
(int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data0);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -731,20 +689,16 @@ static void
npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
vaccum = npyv_add_@sfx@(a01, vaccum);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
accum = npyv_sum_@sfx@(vaccum);
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
for (; count >= 4; count -= 4, data0 += 4) {
const @type@ a01 = @from@(data0[0]) + @from@(data0[1]);
const @type@ a23 = @from@(data0[2]) + @from@(data0[3]);
@@ -871,12 +825,7 @@ static void
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
#ifndef NPY_HAVE_NEON
const int is_aligned = EINSUM_IS_ALIGNED(data0);
#else
// ARM/Neon don't have instructions for aligned memory access
const int is_aligned = 0;
#endif
const int vstep = npyv_nlanes_@sfx@;
npyv_@sfx@ vaccum = npyv_zero_@sfx@();

@@ -907,20 +856,16 @@ static void
npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
vaccum = npyv_add_@sfx@(a01, vaccum);
}
#else
#error "Invalid unroll_by = @unroll_by@"
#endif
}
/**end repeat2**/
accum = npyv_sum_@sfx@(vaccum);
npyv_cleanup();
#endif // NPYV check for @type@
/**
* Unroll by four/eight scalars in case of:
* - The SIMD width is higher than 128bit since we unroll by x2/x4
* and that may lead to performance loss on small arrays.
* - To give the change to the compiler to
* auto-vectorize in case of NPYV wasn't available.
*/
#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)

#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
#if @complex@
for (; count > 4; count -= 4, data0 += 4*2) {
const @temptype@ re01 = data0[0] + data0[2];
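For orientation at the end of this diff: the sum_of_products_*_outstride0_* kernels above all share one shape. The vector loop consumes two vectors per iteration and folds them into a single accumulator via fused multiply-adds, npyv_sum then collapses that accumulator to a scalar, and a scalar loop finishes whatever the vector loop left over. A minimal scalar model of that shape, with illustrative names and plain C standing in for the npyv intrinsics (each array element here plays the role of a whole vector of npyv_nlanes lanes):

#include <stddef.h>

/* Scalar model of the reduce pattern: the running "accum" mirrors vaccum,
 * the two chained multiply-adds per iteration mirror the two npyv_muladd
 * calls, the value of accum after the first loop corresponds to the result
 * of npyv_sum, and the second loop is the scalar tail. */
static float
sum_of_products_outstride0_two_sketch(const float *data0, const float *data1,
                                      ptrdiff_t count)
{
    float accum = 0.0f;

    for (; count >= 2; count -= 2, data0 += 2, data1 += 2) {
        float ab1 = data0[1] * data1[1] + accum;  /* npyv_muladd(a1, b1, vaccum) */
        accum     = data0[0] * data1[0] + ab1;    /* npyv_muladd(a0, b0, ab1)    */
    }

    for (; count > 0; --count, ++data0, ++data1) {
        accum += *data0 * *data1;                 /* scalar remainder */
    }
    return accum;
}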