From 4edb8d832faf7300ae5482d8c4b6bb1d6329d6b4 Mon Sep 17 00:00:00 2001 From: ysingh7 Date: Mon, 1 May 2017 11:27:27 -0700 Subject: [PATCH 1/5] Added algorithm to vectorize INT_FastClip operation using AVX2 --- numpy/core/src/multiarray/arraytypes.c.src | 174 +++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 49d6ae1d2288..6f161c5397d2 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -23,6 +23,7 @@ #include "_datetime.h" #include "arrayobject.h" #include "alloc.h" +#include #ifdef NPY_HAVE_SSE2_INTRINSICS #include #endif @@ -3687,6 +3688,177 @@ static void #define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b)) #define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a)) +/* The following function provides Optimization for INT_fastclip using avx2 extension. +First it checks if AVX2 attribute is present or not.If AVX2 support is present, then it +breaks down the Algorithm into two parts. +1. First it looks at total length of the input array and find out how many bytes are unaligned + with respect to 256 bytes. Because avx2 registers used here are 256 bytes in size. +2. Then for the portion it is aligned with 256 bytes, it loops over the data and loads into one of the vector registers. +3. It loads vector registers vec_max_256 as vector of maximum value and vec_min_max as vector of minimum values, each of 4 Bytes. +4. Parallel comparision is made between loaded data and vec_max_256 first and do a vectorized min operation betwwen data and max value. If data is greater than max value , it gets replaced by max value, otherwise with data. +5. We then use the register where result of above min comparison is tored and do a max operation between the data and min value. If data is lesser than the min value , it gets replaced by min value, otherwise with data. +6. 
After that we take care of the unaligned portion by simply looping over it, like the previous algorithm was doing. +7. For a simple benchmark, this algorithm gave around 40-50% performance boost. +*/ + + + +#if HAVE_ATTRIBUTE_TARGET_AVX2 +static void +INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out) +{ + npy_intp i; + npy_int max_val = 0, min_val = 0; + int unaligned = ni%8; + __m128i vec_max,vec_min; + __m256i vec_max_256,vec_min_256,vec_array; + + if (max != NULL) { + max_val = *max; + vec_max = _mm_set1_epi32(max_val); + vec_max_256 = _mm256_broadcastd_epi32(vec_max); + } + if (min != NULL) { + min_val = *min; + vec_min = _mm_set1_epi32(min_val); + vec_min_256 = _mm256_broadcastd_epi32(vec_min); + + } + if (max == NULL) { + for(i=0;imax_val) { + out[i*8+j] = max_val; + } + } + } + else { + for(i=0;imax_val) { + out[i*8+j] = max_val; + } + + } + } +} +/**begin repeat + * + * #name = BOOL, + * BYTE, UBYTE, SHORT, USHORT,UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * DATETIME, TIMEDELTA# + * #type = npy_bool, + * npy_byte, npy_ubyte, npy_short, npy_ushort,npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_datetime, npy_timedelta# + * #isfloat = 0*10, 1*4, 0*2# + * #isnan = nop*10, npy_half_isnan, npy_isnan*3, nop*2# + * #lt = _LESS_THAN*10, _HALF_LESS_THAN, _LESS_THAN*5# + * #gt = _GREATER_THAN*10, _HALF_GREATER_THAN, _GREATER_THAN*5# + */ +static void +@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out) +{ + npy_intp i; + @type@ max_val = 0, min_val = 0; + + if (max != NULL) { + max_val = *max; +#if @isfloat@ + /* NaNs result in no clipping, so optimize the case away */ + if (@isnan@(max_val)) { + if (min == NULL) { + memmove(out, in, ni * sizeof(@type@)); + return; + } + max = NULL; + } +#endif + } + if (min != NULL) { + min_val = *min; +#if @isfloat@ + if (@isnan@(min_val)) { + if (max == 
NULL) { + memmove(out, in, ni * sizeof(@type@)); + return; + } + min = NULL; + } +#endif + } + if (max == NULL) { + for (i = 0; i < ni; i++) { + if (@lt@(in[i], min_val)) { + out[i] = min_val; + } + else { + out[i] = in[i]; + } + } + } + else if (min == NULL) { + for (i = 0; i < ni; i++) { + if (@gt@(in[i], max_val)) { + out[i] = max_val; + } + else { + out[i] = in[i]; + } + } + } + else { + /* + * Visual Studio 2015 loop vectorizer handles NaN in an unexpected + * manner, see: https://github.com/numpy/numpy/issues/7601 + */ + #if (_MSC_VER == 1900) + #pragma loop( no_vector ) + #endif + for (i = 0; i < ni; i++) { + if (@lt@(in[i], min_val)) { + out[i] = min_val; + } + else if (@gt@(in[i], max_val)) { + out[i] = max_val; + } + else { + out[i] = in[i]; + } + } + } +} + +/**end repeat**/ +#else /**begin repeat * * #name = BOOL, @@ -3777,6 +3949,8 @@ static void } } /**end repeat**/ +#endif + #undef _LESS_THAN #undef _GREATER_THAN From 9ffbb310fe035f78e3f2fc5fdf3572d30fae79b8 Mon Sep 17 00:00:00 2001 From: ysingh7 Date: Mon, 1 May 2017 11:52:28 -0700 Subject: [PATCH 2/5] Adding conditional inclusion of files for Windows,gcc built --- numpy/core/src/multiarray/arraytypes.c.src | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 6f161c5397d2..13e4a06c2936 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -23,7 +23,12 @@ #include "_datetime.h" #include "arrayobject.h" #include "alloc.h" -#include +#if defined(_MSC_VER) + /* Microsoft C/C++-compatible compiler */ + #include +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + /* GCC-compatible compiler, targeting x86/x86-64 */ + #include #ifdef NPY_HAVE_SSE2_INTRINSICS #include #endif From c1afedc739ea76822a68f9c5022c7d1f991d1f53 Mon Sep 17 00:00:00 2001 From: ysingh7 Date: Mon, 1 May 2017 12:01:04 -0700 Subject: [PATCH 3/5] Adding 
conditional inclusion of files for Windows,gcc built --- numpy/core/src/multiarray/arraytypes.c.src | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 13e4a06c2936..e208b53c5845 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -29,6 +29,7 @@ #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) /* GCC-compatible compiler, targeting x86/x86-64 */ #include +#endif #ifdef NPY_HAVE_SSE2_INTRINSICS #include #endif From dcec35516f927bdc39eb657ee15be2fe89aa1e84 Mon Sep 17 00:00:00 2001 From: ysingh7 Date: Mon, 1 May 2017 14:49:30 -0700 Subject: [PATCH 4/5] Added check for integer size = 32 and Moved the check HAVE_ATTRIBUTE_TARGET_AVX2 inside the function --- numpy/core/src/multiarray/arraytypes.c.src | 144 ++++----------------- 1 file changed, 23 insertions(+), 121 deletions(-) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index e208b53c5845..11ef68d3f00b 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -3707,14 +3707,30 @@ breaks down the Algorithm into two parts. 7. For a simple benchmark, this algorithm gave around 40-50% performance boost. 
*/ - - -#if HAVE_ATTRIBUTE_TARGET_AVX2 +/**begin repeat + * + * #name = BOOL, + * BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * DATETIME, TIMEDELTA# + * #type = npy_bool, + * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_datetime, npy_timedelta# + * #isfloat = 0*11, 1*4, 0*2# + * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2# + * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5# + * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5# + */ static void -INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out) +@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out) { npy_intp i; - npy_int max_val = 0, min_val = 0; + @type@ max_val = 0, min_val = 0; + +#if HAVE_ATTRIBUTE_TARGET_AVX2 && @type@==npy_int && NPY_BITSOF_@type@ == 32 int unaligned = ni%8; __m128i vec_max,vec_min; __m256i vec_max_256,vec_min_256,vec_array; @@ -3771,122 +3787,7 @@ INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out) } } - } -} -/**begin repeat - * - * #name = BOOL, - * BYTE, UBYTE, SHORT, USHORT,UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * DATETIME, TIMEDELTA# - * #type = npy_bool, - * npy_byte, npy_ubyte, npy_short, npy_ushort,npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_datetime, npy_timedelta# - * #isfloat = 0*10, 1*4, 0*2# - * #isnan = nop*10, npy_half_isnan, npy_isnan*3, nop*2# - * #lt = _LESS_THAN*10, _HALF_LESS_THAN, _LESS_THAN*5# - * #gt = _GREATER_THAN*10, _HALF_GREATER_THAN, _GREATER_THAN*5# - */ -static void -@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out) -{ - npy_intp i; - @type@ max_val = 0, min_val = 0; - - if (max != NULL) { - max_val = 
*max; -#if @isfloat@ - /* NaNs result in no clipping, so optimize the case away */ - if (@isnan@(max_val)) { - if (min == NULL) { - memmove(out, in, ni * sizeof(@type@)); - return; - } - max = NULL; - } -#endif - } - if (min != NULL) { - min_val = *min; -#if @isfloat@ - if (@isnan@(min_val)) { - if (max == NULL) { - memmove(out, in, ni * sizeof(@type@)); - return; - } - min = NULL; - } -#endif - } - if (max == NULL) { - for (i = 0; i < ni; i++) { - if (@lt@(in[i], min_val)) { - out[i] = min_val; - } - else { - out[i] = in[i]; - } - } - } - else if (min == NULL) { - for (i = 0; i < ni; i++) { - if (@gt@(in[i], max_val)) { - out[i] = max_val; - } - else { - out[i] = in[i]; - } - } - } - else { - /* - * Visual Studio 2015 loop vectorizer handles NaN in an unexpected - * manner, see: https://github.com/numpy/numpy/issues/7601 - */ - #if (_MSC_VER == 1900) - #pragma loop( no_vector ) - #endif - for (i = 0; i < ni; i++) { - if (@lt@(in[i], min_val)) { - out[i] = min_val; - } - else if (@gt@(in[i], max_val)) { - out[i] = max_val; - } - else { - out[i] = in[i]; - } - } - } -} - -/**end repeat**/ #else -/**begin repeat - * - * #name = BOOL, - * BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * DATETIME, TIMEDELTA# - * #type = npy_bool, - * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_datetime, npy_timedelta# - * #isfloat = 0*11, 1*4, 0*2# - * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2# - * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5# - * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5# - */ -static void -@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out) -{ - npy_intp i; - @type@ max_val = 0, min_val = 0; if (max != NULL) { max_val = *max; @@ -3953,9 +3854,10 @@ static void } } } +#endif } /**end repeat**/ -#endif 
+//#endif #undef _LESS_THAN From b118a2e9751d57adc496d368d095e52a740cc4e6 Mon Sep 17 00:00:00 2001 From: ysingh7 Date: Mon, 1 May 2017 17:47:52 -0700 Subject: [PATCH 5/5] Improved the indentation and added isint check --- numpy/core/src/multiarray/arraytypes.c.src | 139 +++++++++++---------- 1 file changed, 72 insertions(+), 67 deletions(-) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 11ef68d3f00b..41a938cc3653 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -3694,19 +3694,6 @@ static void #define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b)) #define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a)) -/* The following function provides Optimization for INT_fastclip using avx2 extension. -First it checks if AVX2 attribute is present or not.If AVX2 support is present, then it -breaks down the Algorithm into two parts. -1. First it looks at total length of the input array and find out how many bytes are unaligned - with respect to 256 bytes. Because avx2 registers used here are 256 bytes in size. -2. Then for the portion it is aligned with 256 bytes, it loops over the data and loads into one of the vector registers. -3. It loads vector registers vec_max_256 as vector of maximum value and vec_min_max as vector of minimum values, each of 4 Bytes. -4. Parallel comparision is made between loaded data and vec_max_256 first and do a vectorized min operation betwwen data and max value. If data is greater than max value , it gets replaced by max value, otherwise with data. -5. We then use the register where result of above min comparison is tored and do a max operation between the data and min value. If data is lesser than the min value , it gets replaced by min value, otherwise with data. -6. After that we take care of the unaligned portion by simply looping over it, like the previous algorithm was doing. -7. 
For a simple benchmark, this algorithm gave around 40-50% performance boost. -*/ - /**begin repeat * * #name = BOOL, @@ -3719,6 +3706,7 @@ breaks down the Algorithm into two parts. * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble, * npy_datetime, npy_timedelta# + * #isint = 1*10, 0*7# * #isfloat = 0*11, 1*4, 0*2# * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2# * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5# @@ -3729,70 +3717,88 @@ static void { npy_intp i; @type@ max_val = 0, min_val = 0; - -#if HAVE_ATTRIBUTE_TARGET_AVX2 && @type@==npy_int && NPY_BITSOF_@type@ == 32 - int unaligned = ni%8; - __m128i vec_max,vec_min; - __m256i vec_max_256,vec_min_256,vec_array; - + +// The following portion provides an optimization for INT_fastclip using the avx2 extension. + #if HAVE_ATTRIBUTE_TARGET_AVX2 && @isint@ && NPY_BITSOF_@type@ == 32 + +/*It looks at the total length of the input array and finds out how many elements are left over + with respect to 256 bits, because the avx2 registers used here are 256 bits in size.*/ + int unaligned = ni%8; +// Declaration of vector registers using avx2 + __m128i vec_max,vec_min; + __m256i vec_max_256,vec_min_256,vec_array; if (max != NULL) { - max_val = *max; - vec_max = _mm_set1_epi32(max_val); - vec_max_256 = _mm256_broadcastd_epi32(vec_max); + max_val = *max; +//Setting up vector registers with every lane holding the 32-bit maximum value + vec_max = _mm_set1_epi32(max_val); + vec_max_256 = _mm256_broadcastd_epi32(vec_max); } if (min != NULL) { min_val = *min; +//Setting up vector registers with every lane holding the 32-bit minimum value vec_min = _mm_set1_epi32(min_val); vec_min_256 = _mm256_broadcastd_epi32(vec_min); - } + +/* A parallel comparison is made between the loaded data and vec_max_256 first, + doing a vectorized min operation between data and max value. 
If data is + greater than max value , it gets replaced by max value, otherwise with + data.We then use the register where result of above min comparison is stored + and do a max operation between the data and min value. If data is lesser than + the min value , it gets replaced by min value, otherwise with data.After that + we take care of the unaligned portion by simply looping over it, like the previous + algorithm was doing. For a simple benchmark, this algorithm gave around 40-50% + performance boost. +*/ + if (max == NULL) { - for(i=0;imax_val) { + for(i=0;imax_val) { out[i*8+j] = max_val; } - } - } - else { - for(i=0;imax_val) { - out[i*8+j] = max_val; - } + out[i*8+j] = min_val; + } + else if (in[i*8+j]>max_val) { + out[i*8+j] = max_val; + } + } + } - } -#else +/**** The optimization portion ends here. ***/ + #else if (max != NULL) { max_val = *max; -#if @isfloat@ - /* NaNs result in no clipping, so optimize the case away */ + #if @isfloat@ + /* NaNs result in no clipping, so optimize the case away */ if (@isnan@(max_val)) { if (min == NULL) { memmove(out, in, ni * sizeof(@type@)); @@ -3800,11 +3806,11 @@ static void } max = NULL; } -#endif + #endif } if (min != NULL) { min_val = *min; -#if @isfloat@ + #if @isfloat@ if (@isnan@(min_val)) { if (max == NULL) { memmove(out, in, ni * sizeof(@type@)); @@ -3812,7 +3818,7 @@ static void } min = NULL; } -#endif + #endif } if (max == NULL) { for (i = 0; i < ni; i++) { @@ -3836,11 +3842,11 @@ static void } else { /* - * Visual Studio 2015 loop vectorizer handles NaN in an unexpected - * manner, see: https://github.com/numpy/numpy/issues/7601 - */ + * Visual Studio 2015 loop vectorizer handles NaN in an unexpected + * manner, see: https://github.com/numpy/numpy/issues/7601 + */ #if (_MSC_VER == 1900) - #pragma loop( no_vector ) + #pragma loop( no_vector ) #endif for (i = 0; i < ni; i++) { if (@lt@(in[i], min_val)) { @@ -3854,10 +3860,9 @@ static void } } } -#endif + #endif } /**end repeat**/ -//#endif #undef _LESS_THAN