diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 49d6ae1d2288..41a938cc3653 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -23,6 +23,13 @@ #include "_datetime.h" #include "arrayobject.h" #include "alloc.h" +#if defined(_MSC_VER) + /* Microsoft C/C++-compatible compiler */ + #include +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + /* GCC-compatible compiler, targeting x86/x86-64 */ + #include +#endif #ifdef NPY_HAVE_SSE2_INTRINSICS #include #endif @@ -3699,6 +3706,7 @@ static void * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# + * #isint = 1*10, 0*7# * #isfloat = 0*11, 1*4, 0*2# * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2# * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5# @@ -3709,11 +3717,88 @@ static void { npy_intp i; @type@ max_val = 0, min_val = 0; + +// The following portion provides Optimization for INT_fastclip using avx2 extension. + #if HAVE_ATTRIBUTE_TARGET_AVX2 && @isint@ && NPY_BITSOF_@type@ == 32 + +/*It looks at the total length of the input array and finds out how many elements are left over + when it is split into 256-bit chunks. 
Because the avx2 registers used here are 256 bits in size.*/ + int unaligned = ni%8; +// Declaration of vector registers using avx2 + __m128i vec_max,vec_min; + __m256i vec_max_256,vec_min_256,vec_array; + if (max != NULL) { + max_val = *max; +//Setting up vector registers with all values as the 32-bit maximum value + vec_max = _mm_set1_epi32(max_val); + vec_max_256 = _mm256_broadcastd_epi32(vec_max); + } + if (min != NULL) { + min_val = *min; +//Setting up vector registers with all values as the 32-bit minimum value + vec_min = _mm_set1_epi32(min_val); + vec_min_256 = _mm256_broadcastd_epi32(vec_min); + } + +/* A parallel comparison is made between the loaded data and vec_max_256 first, + doing a vectorized min operation between the data and the max value. If the data is + greater than the max value, it gets replaced by the max value, otherwise with + the data. We then use the register where the result of the above min comparison is stored + and do a max operation between the data and the min value. If the data is less than + the min value, it gets replaced by the min value, otherwise with the data. After that + we take care of the unaligned portion by simply looping over it, like the previous + algorithm was doing. For a simple benchmark, this algorithm gave around 40-50% + performance boost. +*/ + + if (max == NULL) { + for(i=0;imax_val) { + out[i*8+j] = max_val; + } + } + } + else { + for(i=0;imax_val) { + out[i*8+j] = max_val; + } + } + } + +/**** The optimization portion ends here. 
***/ + #else if (max != NULL) { max_val = *max; -#if @isfloat@ - /* NaNs result in no clipping, so optimize the case away */ + #if @isfloat@ + /* NaNs result in no clipping, so optimize the case away */ if (@isnan@(max_val)) { if (min == NULL) { memmove(out, in, ni * sizeof(@type@)); @@ -3721,11 +3806,11 @@ static void } max = NULL; } -#endif + #endif } if (min != NULL) { min_val = *min; -#if @isfloat@ + #if @isfloat@ if (@isnan@(min_val)) { if (max == NULL) { memmove(out, in, ni * sizeof(@type@)); @@ -3733,7 +3818,7 @@ static void } min = NULL; } -#endif + #endif } if (max == NULL) { for (i = 0; i < ni; i++) { @@ -3757,11 +3842,11 @@ static void } else { /* - * Visual Studio 2015 loop vectorizer handles NaN in an unexpected - * manner, see: https://github.com/numpy/numpy/issues/7601 - */ + * Visual Studio 2015 loop vectorizer handles NaN in an unexpected + * manner, see: https://github.com/numpy/numpy/issues/7601 + */ #if (_MSC_VER == 1900) - #pragma loop( no_vector ) + #pragma loop( no_vector ) #endif for (i = 0; i < ni; i++) { if (@lt@(in[i], min_val)) { @@ -3775,9 +3860,11 @@ static void } } } + #endif } /**end repeat**/ + #undef _LESS_THAN #undef _GREATER_THAN #undef _HALF_LESS_THAN