Thanks to visit codestin.com
Credit goes to github.com

Skip to content

ENH: Vectorize INT_FastClip operation using AVX2 #9037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 96 additions & 9 deletions numpy/core/src/multiarray/arraytypes.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@
#include "_datetime.h"
#include "arrayobject.h"
#include "alloc.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#endif
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
Expand Down Expand Up @@ -3699,6 +3706,7 @@ static void
* npy_long, npy_ulong, npy_longlong, npy_ulonglong,
* npy_half, npy_float, npy_double, npy_longdouble,
* npy_datetime, npy_timedelta#
* #isint = 1*10, 0*7#
* #isfloat = 0*11, 1*4, 0*2#
* #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
* #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
Expand All @@ -3709,31 +3717,108 @@ static void
{
npy_intp i;
@type@ max_val = 0, min_val = 0;

// The following portion provides an AVX2-vectorized optimization for INT_fastclip.
#if HAVE_ATTRIBUTE_TARGET_AVX2 && @isint@ && NPY_BITSOF_@type@ == 32

/* Compute how many trailing elements do not fill a whole vector. Each 256-bit
AVX2 register holds eight 32-bit integers, so the remainder of ni modulo 8 is
handled by a scalar tail loop. */
int unaligned = ni%8;
// Declaration of vector registers using AVX2
__m128i vec_max,vec_min;
__m256i vec_max_256,vec_min_256,vec_array;
if (max != NULL) {
max_val = *max;
// Broadcast the 32-bit maximum value into every lane of the vector register
vec_max = _mm_set1_epi32(max_val);
vec_max_256 = _mm256_broadcastd_epi32(vec_max);
}
if (min != NULL) {
min_val = *min;
// Broadcast the 32-bit minimum value into every lane of the vector register
vec_min = _mm_set1_epi32(min_val);
vec_min_256 = _mm256_broadcastd_epi32(vec_min);
}

/* A parallel comparison is first made between the loaded data and
vec_max_256: a vectorized min operation replaces any element greater than
the max value with the max value, and leaves the rest unchanged. The result
of that min operation is then fed into a vectorized max operation against
vec_min_256: any element smaller than the min value is replaced by the min
value, otherwise the data is kept. Afterwards, the unaligned tail portion is
handled by simply looping over it, just as the previous algorithm did. In a
simple benchmark, this algorithm gave around a 40-50% performance boost.
*/

if (max == NULL) {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_max_epi32(vec_array,vec_min_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]<min_val) {
out[i*8+j] = min_val;
}
}
}
else if (min == NULL) {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_min_epi32(vec_array,vec_max_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]>max_val) {
out[i*8+j] = max_val;
}
}
}
else {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_max_epi32(vec_array,vec_min_256);
vec_array = _mm256_min_epi32(vec_array,vec_max_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]<min_val) {
out[i*8+j] = min_val;
}
else if (in[i*8+j]>max_val) {
out[i*8+j] = max_val;
}
}
}

/**** The optimization portion ends here. ***/

#else
if (max != NULL) {
max_val = *max;
#if @isfloat@
/* NaNs result in no clipping, so optimize the case away */
#if @isfloat@
/* NaNs result in no clipping, so optimize the case away */
if (@isnan@(max_val)) {
if (min == NULL) {
memmove(out, in, ni * sizeof(@type@));
return;
}
max = NULL;
}
#endif
#endif
}
if (min != NULL) {
min_val = *min;
#if @isfloat@
#if @isfloat@
if (@isnan@(min_val)) {
if (max == NULL) {
memmove(out, in, ni * sizeof(@type@));
return;
}
min = NULL;
}
#endif
#endif
}
if (max == NULL) {
for (i = 0; i < ni; i++) {
Expand All @@ -3757,11 +3842,11 @@ static void
}
else {
/*
* Visual Studio 2015 loop vectorizer handles NaN in an unexpected
* manner, see: https://github.com/numpy/numpy/issues/7601
*/
* Visual Studio 2015 loop vectorizer handles NaN in an unexpected
* manner, see: https://github.com/numpy/numpy/issues/7601
*/
#if (_MSC_VER == 1900)
#pragma loop( no_vector )
#pragma loop( no_vector )
#endif
for (i = 0; i < ni; i++) {
if (@lt@(in[i], min_val)) {
Expand All @@ -3775,9 +3860,11 @@ static void
}
}
}
#endif
}
/**end repeat**/


#undef _LESS_THAN
#undef _GREATER_THAN
#undef _HALF_LESS_THAN
Expand Down