SIMD: Optimize the performance of np.packbits in AVX2/AVX512F/VSX. #17102
@@ -13,7 +13,12 @@
 #include "alloc.h"
 #include "ctors.h"
 #include "common.h"
+#include "simd/simd.h"

+typedef enum {
+    PACK_ORDER_LITTLE = 0,
+    PACK_ORDER_BIG
+} PACK_ORDER;

 /*
  * Returns -1 if the array is monotonic decreasing,
@@ -1465,35 +1470,20 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
     Py_RETURN_NONE;
 }

-#if defined NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#endif

-#ifdef NPY_HAVE_NEON
-    typedef npy_uint64 uint64_unaligned __attribute__((aligned(16)));
-    static NPY_INLINE int32_t
-    sign_mask(uint8x16_t input)
-    {
-        int8x8_t m0 = vcreate_s8(0x0706050403020100ULL);
-        uint8x16_t v0 = vshlq_u8(vshrq_n_u8(input, 7), vcombine_s8(m0, m0));
-        uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
-        return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
-    }
-#endif
 /*
  * This function packs boolean values in the input array into the bits of a
  * byte array. Truth values are determined as usual: 0 is false, everything
  * else is true.
  */
-static NPY_INLINE void
+static NPY_GCC_OPT_3 NPY_INLINE void
 pack_inner(const char *inptr,
            npy_intp element_size,   /* in bytes */
            npy_intp n_in,
            npy_intp in_stride,
            char *outptr,
            npy_intp n_out,
            npy_intp out_stride,
-           char order)
+           PACK_ORDER order)
 {
     /*
      * Loop through the elements of inptr.
@@ -1505,62 +1495,64 @@ pack_inner(const char *inptr,
     npy_intp index = 0;
     int remain = n_in % 8;              /* uneven bits */

-#if defined NPY_HAVE_SSE2_INTRINSICS && defined HAVE__M_FROM_INT64
+#if NPY_SIMD
     if (in_stride == 1 && element_size == 1 && n_out > 2) {
-        __m128i zero = _mm_setzero_si128();
+        npyv_u8 v_zero = npyv_zero_u8();
         /* don't handle non-full 8-byte remainder */
         npy_intp vn_out = n_out - (remain ? 1 : 0);
-        vn_out -= (vn_out & 1);
-        for (index = 0; index < vn_out; index += 2) {
-            unsigned int r;
-            npy_uint64 a = *(npy_uint64*)inptr;
-            npy_uint64 b = *(npy_uint64*)(inptr + 8);
-            if (order == 'b') {
-                a = npy_bswap8(a);
-                b = npy_bswap8(b);
+        const int vstep = npyv_nlanes_u64;
+        const int vstepx4 = vstep * 4;
+        const int isAligned = npy_is_aligned(outptr, sizeof(npy_uint64));
+        vn_out -= (vn_out & (vstep - 1));
+        for (; index <= vn_out - vstepx4; index += vstepx4, inptr += npyv_nlanes_u8 * 4) {
+            npyv_u8 v0 = npyv_load_u8((const npy_uint8*)inptr);
+            npyv_u8 v1 = npyv_load_u8((const npy_uint8*)inptr + npyv_nlanes_u8 * 1);
+            npyv_u8 v2 = npyv_load_u8((const npy_uint8*)inptr + npyv_nlanes_u8 * 2);
+            npyv_u8 v3 = npyv_load_u8((const npy_uint8*)inptr + npyv_nlanes_u8 * 3);
+            if (order == PACK_ORDER_BIG) {
+                v0 = npyv_rev64_u8(v0);
+                v1 = npyv_rev64_u8(v1);
+                v2 = npyv_rev64_u8(v2);
+                v3 = npyv_rev64_u8(v3);
             }
-            /* note x86 can load unaligned */
-            __m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a));
-            /* false -> 0x00 and true -> 0xFF (there is no cmpneq) */
-            v = _mm_cmpeq_epi8(v, zero);
-            v = _mm_cmpeq_epi8(v, zero);
-            /* extract msb of 16 bytes and pack it into 16 bit */
-            r = _mm_movemask_epi8(v);
-            /* store result */
-            memcpy(outptr, &r, 1);
-            outptr += out_stride;
-            memcpy(outptr, (char*)&r + 1, 1);
-            outptr += out_stride;
-            inptr += 16;
-        }
-    }
-#elif defined NPY_HAVE_NEON
-    if (in_stride == 1 && element_size == 1 && n_out > 2) {
-        /* don't handle non-full 8-byte remainder */
-        npy_intp vn_out = n_out - (remain ? 1 : 0);
-        vn_out -= (vn_out & 1);
-        for (index = 0; index < vn_out; index += 2) {
-            unsigned int r;
-            npy_uint64 a = *((uint64_unaligned*)inptr);
-            npy_uint64 b = *((uint64_unaligned*)(inptr + 8));
-            if (order == 'b') {
-                a = npy_bswap8(a);
-                b = npy_bswap8(b);
+            npy_uint64 bb[4];
+            bb[0] = npyv_tobits_b8(npyv_cmpneq_u8(v0, v_zero));
+            bb[1] = npyv_tobits_b8(npyv_cmpneq_u8(v1, v_zero));
+            bb[2] = npyv_tobits_b8(npyv_cmpneq_u8(v2, v_zero));
+            bb[3] = npyv_tobits_b8(npyv_cmpneq_u8(v3, v_zero));
+            if(out_stride == 1 &&
+               (!NPY_STRONG_ALIGNMENT || isAligned)) {
+                npy_uint64 *ptr64 = (npy_uint64*)outptr;
+            #if NPY_SIMD_WIDTH == 16
+                npy_uint64 bcomp = bb[0] | (bb[1] << 16) | (bb[2] << 32) | (bb[3] << 48);
+                ptr64[0] = bcomp;
+            #elif NPY_SIMD_WIDTH == 32
+                ptr64[0] = bb[0] | (bb[1] << 32);
+                ptr64[1] = bb[2] | (bb[3] << 32);
+            #else
+                ptr64[0] = bb[0]; ptr64[1] = bb[1];
+                ptr64[2] = bb[2]; ptr64[3] = bb[3];
+            #endif
+                outptr += vstepx4;
+            } else {
+                for(int i = 0; i < 4; i++) {
+                    for (int j = 0; j < vstep; j++) {
+                        memcpy(outptr, (char*)&bb[i] + j, 1);
    [inline review thread on this line]
    "I know the compiler is gonna optimize it but why do we need …"
    "we have …"
    "non-contiguous/partial memory access intrinsics for "u8, s8, u16, s16" will be useful for other SIMD kernels but not this one. I just suggested storing each byte via dereferencing the output ptr."
+                        outptr += out_stride;
+                    }
+                }
+            }
+        }
+        for (; index < vn_out; index += vstep, inptr += npyv_nlanes_u8) {
+            npyv_u8 va = npyv_load_u8((const npy_uint8*)inptr);
+            if (order == PACK_ORDER_BIG) {
+                va = npyv_rev64_u8(va);
+            }
+            npy_uint64 bb = npyv_tobits_b8(npyv_cmpneq_u8(va, v_zero));
+            for (int i = 0; i < vstep; ++i) {
+                memcpy(outptr, (char*)&bb + i, 1);
+                outptr += out_stride;
+            }
-            uint64x2_t v = vcombine_u64(vcreate_u64(a), vcreate_u64(b));
-            uint64x2_t zero = vdupq_n_u64(0);
-            /* false -> 0x00 and true -> 0xFF */
-            v = vreinterpretq_u64_u8(vmvnq_u8(vceqq_u8(vreinterpretq_u8_u64(v), vreinterpretq_u8_u64(zero))));
-            /* extract msb of 16 bytes and pack it into 16 bit */
-            uint8x16_t input = vreinterpretq_u8_u64(v);
-            r = sign_mask(input);
-            /* store result */
-            memcpy(outptr, &r, 1);
-            outptr += out_stride;
-            memcpy(outptr, (char*)&r + 1, 1);
-            outptr += out_stride;
-            inptr += 16;
         }
     }
 #endif
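The heart of the new path is a byte-to-bit compaction: every boolean byte is mapped to 0x00/0xFF and the most-significant bits of the vector are gathered into an integer mask, so eight input bytes collapse into one packed output byte (npyv_cmpneq_u8 plus npyv_tobits_b8 above, _mm_movemask_epi8 in the removed SSE2 code). Below is a minimal standalone sketch of that idea, written against raw SSE2 rather than NumPy's npyv_ wrappers; it assumes little-endian bit order (bit 0 of the output is the first input byte), while the big-endian order additionally reverses each 8-byte group first, as the npyv_rev64_u8 calls do.

    #include <emmintrin.h>
    #include <stdint.h>

    /* Pack 16 boolean bytes into 2 bytes; bit 0 of out[0] is (in[0] != 0). */
    static void pack16_bits_little(const uint8_t *in, uint8_t out[2])
    {
        __m128i zero = _mm_setzero_si128();
        __m128i v = _mm_loadu_si128((const __m128i*)in);
        /* first compare: a byte becomes 0xFF where the input byte is zero */
        v = _mm_cmpeq_epi8(v, zero);
        /* second compare flips that: 0xFF where the input byte was nonzero
         * (SSE2 has no integer compare-not-equal, hence the double compare) */
        v = _mm_cmpeq_epi8(v, zero);
        /* gather the MSB of each of the 16 bytes into one 16-bit mask */
        unsigned r = (unsigned)_mm_movemask_epi8(v);
        out[0] = (uint8_t)(r & 0xff);
        out[1] = (uint8_t)(r >> 8);
    }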
@@ -1571,14 +1563,11 @@ pack_inner(const char *inptr,
     /* Don't reset index. Just handle remainder of above block */
     for (; index < n_out; index++) {
         unsigned char build = 0;
-        int i, maxi;
-        npy_intp j;

-        maxi = (index == n_out - 1) ? remain : 8;
-        if (order == 'b') {
-            for (i = 0; i < maxi; i++) {
+        int maxi = (index == n_out - 1) ? remain : 8;
+        if (order == PACK_ORDER_BIG) {
+            for (int i = 0; i < maxi; i++) {
                 build <<= 1;
-                for (j = 0; j < element_size; j++) {
+                for (npy_intp j = 0; j < element_size; j++) {
                     build |= (inptr[j] != 0);
                 }
                 inptr += in_stride;
@@ -1589,9 +1578,9 @@ pack_inner(const char *inptr,
         }
         else
         {
-            for (i = 0; i < maxi; i++) {
+            for (int i = 0; i < maxi; i++) {
                 build >>= 1;
-                for (j = 0; j < element_size; j++) {
+                for (npy_intp j = 0; j < element_size; j++) {
                     build |= (inptr[j] != 0) ? 128 : 0;
                 }
                 inptr += in_stride;
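The scalar remainder loops above handle the final, possibly partial byte one bit at a time. A self-contained sketch of the big-endian branch, assuming one-byte boolean elements and unit strides (the real loop also walks element_size and in_stride); the function name is illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    /* Pack n <= 8 boolean bytes into one byte, first element in the MSB.
     * A partial byte is left-aligned, matching np.packbits' "big" bit order:
     * e.g. {1, 1, 0} packs to 0b11100000 == 224. */
    static uint8_t pack_tail_big(const uint8_t *in, size_t n)
    {
        uint8_t build = 0;
        for (size_t i = 0; i < n; i++) {
            build <<= 1;
            build |= (in[i] != 0);
        }
        return (uint8_t)(build << (8 - n));
    }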
@@ -1685,13 +1674,13 @@ pack_bits(PyObject *input, int axis, char order)
         Py_XDECREF(ot);
         goto fail;
     }

+    const PACK_ORDER ordere = order == 'b' ? PACK_ORDER_BIG : PACK_ORDER_LITTLE;
     NPY_BEGIN_THREADS_THRESHOLDED(PyArray_DIM(out, axis));
     while (PyArray_ITER_NOTDONE(it)) {
         pack_inner(PyArray_ITER_DATA(it), PyArray_ITEMSIZE(new),
                    PyArray_DIM(new, axis), PyArray_STRIDE(new, axis),
                    PyArray_ITER_DATA(ot), PyArray_DIM(out, axis),
-                   PyArray_STRIDE(out, axis), order);
+                   PyArray_STRIDE(out, axis), ordere);
         PyArray_ITER_NEXT(it);
         PyArray_ITER_NEXT(ot);
     }
[review thread on NPY_STRONG_ALIGNMENT]
"I just noticed this addition. Is this the same idea as NPY_CPU_HAVE_UNALIGNED_ACCESS, which we already use in a handful of places? I am a bit worried about duplicating this logic, especially since this seems to assume that all CPUs have unaligned access, instead of the opposite of assuming that only x86 and amd64 always support unaligned access?"

"Thanks for pointing this out. In this case only armv7 needs aligned access, so the macro assumes that other CPUs support unaligned access; NPY_CPU_HAVE_UNALIGNED_ACCESS and NPY_STRONG_ALIGNMENT are like two sides of the same coin. You are right about the duplicated logic; I will integrate the two in a follow-up PR."
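To make the trade-off concrete, here is a small illustration (not NumPy code) of the guarded wide-store pattern the diff uses for the out_stride == 1 fast path: on strict-alignment targets such as armv7 the 64-bit store is only taken when the destination is suitably aligned, otherwise the bytes are written one at a time. STRONG_ALIGNMENT below is a stand-in for NPY_STRONG_ALIGNMENT and would normally come from the build configuration.

    #include <stdint.h>

    #ifndef STRONG_ALIGNMENT       /* stand-in for NPY_STRONG_ALIGNMENT */
    #define STRONG_ALIGNMENT 0     /* 0: the CPU tolerates unaligned 64-bit stores */
    #endif

    /* Store a 64-bit value at dst, which may not be 8-byte aligned. */
    static void store_u64_guarded(uint8_t *dst, uint64_t value)
    {
        if (!STRONG_ALIGNMENT || ((uintptr_t)dst % sizeof(uint64_t)) == 0) {
            /* one wide store; when STRONG_ALIGNMENT is 0 this relies on the
             * target tolerating a possibly unaligned access, as the PR does */
            *(uint64_t *)dst = value;
        }
        else {
            /* byte-wise fallback, same native byte order as the wide store */
            const uint8_t *src = (const uint8_t *)&value;
            for (int i = 0; i < 8; i++) {
                dst[i] = src[i];
            }
        }
    }

On builds where STRONG_ALIGNMENT is 0 the check compiles away entirely, so the extra branch only costs anything on strict-alignment targets.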