Thanks to visit codestin.com
Credit goes to github.com

Skip to content

ENH: Vectorize INT_FastClip operation using AVX2 #9037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 96 additions & 9 deletions numpy/core/src/multiarray/arraytypes.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@
#include "_datetime.h"
#include "arrayobject.h"
#include "alloc.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#endif
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
Expand Down Expand Up @@ -3699,6 +3706,7 @@ static void
* npy_long, npy_ulong, npy_longlong, npy_ulonglong,
* npy_half, npy_float, npy_double, npy_longdouble,
* npy_datetime, npy_timedelta#
* #isint = 1*10, 0*7#
* #isfloat = 0*11, 1*4, 0*2#
* #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
* #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
Expand All @@ -3709,31 +3717,108 @@ static void
{
npy_intp i;
@type@ max_val = 0, min_val = 0;

// The following portion provides an AVX2-vectorized optimization for INT_fastclip.
#if HAVE_ATTRIBUTE_TARGET_AVX2 && @isint@ && NPY_BITSOF_@type@ == 32

/* Compute how many trailing elements do not fill a whole vector. Each 256-bit
AVX2 register holds eight 32-bit integers, so the remainder of ni modulo 8 is
handled by a scalar tail loop. */
int unaligned = ni%8;
// Declaration of vector registers using AVX2
__m128i vec_max,vec_min;
__m256i vec_max_256,vec_min_256,vec_array;
if (max != NULL) {
max_val = *max;
// Broadcast the 32-bit maximum value into every lane of the vector register
vec_max = _mm_set1_epi32(max_val);
vec_max_256 = _mm256_broadcastd_epi32(vec_max);
}
if (min != NULL) {
min_val = *min;
// Broadcast the 32-bit minimum value into every lane of the vector register
vec_min = _mm_set1_epi32(min_val);
vec_min_256 = _mm256_broadcastd_epi32(vec_min);
}

/* A parallel comparison is first made between the loaded data and
vec_max_256: a vectorized min operation replaces any element greater than
the max value with the max value, and leaves the rest unchanged. The result
of that min operation is then fed into a vectorized max operation against
vec_min_256: any element smaller than the min value is replaced by the min
value, otherwise the data is kept. Afterwards, the unaligned tail portion is
handled by simply looping over it, just as the previous algorithm did. In a
simple benchmark, this algorithm gave around a 40-50% performance boost.
*/

if (max == NULL) {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_max_epi32(vec_array,vec_min_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]<min_val) {
out[i*8+j] = min_val;
}
}
}
else if (min == NULL) {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_min_epi32(vec_array,vec_max_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]>max_val) {
out[i*8+j] = max_val;
}
}
}
else {
for(i=0;i<ni/8;i++){
vec_array = _mm256_loadu_si256(in+(i*8));
vec_array = _mm256_max_epi32(vec_array,vec_min_256);
vec_array = _mm256_min_epi32(vec_array,vec_max_256);
_mm256_storeu_si256(out+(i*8),vec_array);
}
for(npy_int j=0;j<unaligned;j++){
if (in[i*8+j]<min_val) {
out[i*8+j] = min_val;
}
else if (in[i*8+j]>max_val) {
out[i*8+j] = max_val;
}
}
}

/**** The optimization portion ends here. ***/

#else
if (max != NULL) {
max_val = *max;
#if @isfloat@
/* NaNs result in no clipping, so optimize the case away */
#if @isfloat@
/* NaNs result in no clipping, so optimize the case away */
if (@isnan@(max_val)) {
if (min == NULL) {
memmove(out, in, ni * sizeof(@type@));
return;
}
max = NULL;
}
#endif
#endif
}
if (min != NULL) {
min_val = *min;
#if @isfloat@
#if @isfloat@
if (@isnan@(min_val)) {
if (max == NULL) {
memmove(out, in, ni * sizeof(@type@));
return;
}
min = NULL;
}
#endif
#endif
}
if (max == NULL) {
for (i = 0; i < ni; i++) {
Expand All @@ -3757,11 +3842,11 @@ static void
}
else {
/*
* Visual Studio 2015 loop vectorizer handles NaN in an unexpected
* manner, see: https://github.com/numpy/numpy/issues/7601
*/
* Visual Studio 2015 loop vectorizer handles NaN in an unexpected
* manner, see: https://github.com/numpy/numpy/issues/7601
*/
#if (_MSC_VER == 1900)
#pragma loop( no_vector )
#pragma loop( no_vector )
#endif
for (i = 0; i < ni; i++) {
if (@lt@(in[i], min_val)) {
Expand All @@ -3775,9 +3860,11 @@ static void
}
}
}
#endif
}
/**end repeat**/


#undef _LESS_THAN
#undef _GREATER_THAN
#undef _HALF_LESS_THAN
Expand Down