From 4edb8d832faf7300ae5482d8c4b6bb1d6329d6b4 Mon Sep 17 00:00:00 2001
From: ysingh7 <yashnitkkr7@gmail.com>
Date: Mon, 1 May 2017 11:27:27 -0700
Subject: [PATCH 1/5] Added algorithm to vectorize INT_FastClip operation using
 AVX2

---
 numpy/core/src/multiarray/arraytypes.c.src | 174 +++++++++++++++++++++
 1 file changed, 174 insertions(+)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 49d6ae1d2288..6f161c5397d2 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -23,6 +23,7 @@
 #include "_datetime.h"
 #include "arrayobject.h"
 #include "alloc.h"
+#include <x86intrin.h>
 #ifdef NPY_HAVE_SSE2_INTRINSICS
 #include <emmintrin.h>
 #endif
@@ -3687,6 +3688,177 @@ static void
 #define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b))
 #define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a))
 
+/* The following function provides Optimization for INT_fastclip using avx2 extension.
+First it checks if AVX2 attribute is present or not.If AVX2 support is present, then it
+breaks down the Algorithm into two parts.
+1. First it looks at total length of the input array and find out how many bytes are unaligned
+   with respect to 256 bytes. Because avx2 registers used here are 256 bytes in size.
+2. Then for the portion it is aligned with 256 bytes, it loops over the data and loads into one of the vector registers.
+3. It loads vector registers vec_max_256 as vector of maximum value and vec_min_max as vector of minimum values, each of 4 Bytes.
+4. Parallel comparision is made between loaded data and vec_max_256 first and do a vectorized min operation betwwen data and max value. If data is greater than max value , it gets replaced by max value, otherwise with data.
+5. We then use the register where result of above min comparison is tored and do a max operation between the data and min value. If data is lesser than the min value , it gets replaced by min value, otherwise with data. 
+6. After that we take care of the unaligned portion by simply looping over it, like the previous algorithm was doing.
+7. For a simple benchmark, this algorithm gave around 40-50% performance boost.
+*/
+
+
+ 
+#if HAVE_ATTRIBUTE_TARGET_AVX2
+static void
+INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out)
+{
+    npy_intp i;
+    npy_int max_val = 0, min_val = 0;
+    int unaligned = ni%8;
+    __m128i vec_max,vec_min;
+    __m256i vec_max_256,vec_min_256,vec_array;
+
+    if (max != NULL) {
+       max_val = *max;
+       vec_max = _mm_set1_epi32(max_val);
+       vec_max_256 = _mm256_broadcastd_epi32(vec_max);
+    }
+    if (min != NULL) {
+        min_val = *min;
+        vec_min = _mm_set1_epi32(min_val);
+        vec_min_256 = _mm256_broadcastd_epi32(vec_min);
+	
+    }
+    if (max == NULL) {
+	for(i=0;i<ni/8;i++){
+           vec_array = _mm256_loadu_si256(in+(i*8));
+           vec_array = _mm256_max_epi32(vec_array,vec_min_256);
+           _mm256_storeu_si256(out+(i*8),vec_array);
+       }
+       for(npy_int j=0;j<unaligned;j++){
+             if (in[i*8+j]<min_val) {
+                out[i*8+j]   = min_val;
+            }
+	}
+    	
+    }
+    else if (min == NULL) {
+	for(i=0;i<ni/8;i++){
+       vec_array = _mm256_loadu_si256(in+(i*8));
+       vec_array = _mm256_min_epi32(vec_array,vec_max_256);
+       _mm256_storeu_si256(out+(i*8),vec_array);
+   }
+       for(npy_int j=0;j<unaligned;j++){
+             if (in[i*8+j]>max_val) {
+                out[i*8+j]   = max_val;
+            }
+	}
+   }
+   else {
+  	for(i=0;i<ni/8;i++){
+           vec_array = _mm256_loadu_si256(in+(i*8));
+           vec_array = _mm256_max_epi32(vec_array,vec_min_256);
+           vec_array = _mm256_min_epi32(vec_array,vec_max_256);
+           _mm256_storeu_si256(out+(i*8),vec_array);
+       }
+       for(npy_int j=0;j<unaligned;j++){
+            if (in[i*8+j]<min_val) {
+                    out[i*8+j]   = min_val;
+                }
+                else if (in[i*8+j]>max_val) {
+                    out[i*8+j]   = max_val;
+                }
+
+       }
+    }
+}
+/**begin repeat
+ *
+ * #name = BOOL,
+ *         BYTE, UBYTE, SHORT, USHORT,UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ *         npy_byte, npy_ubyte, npy_short, npy_ushort,npy_uint,
+ *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_half, npy_float, npy_double, npy_longdouble,
+ *         npy_datetime, npy_timedelta#
+ * #isfloat = 0*10, 1*4, 0*2#
+ * #isnan = nop*10, npy_half_isnan, npy_isnan*3, nop*2#
+ * #lt = _LESS_THAN*10, _HALF_LESS_THAN, _LESS_THAN*5#
+ * #gt = _GREATER_THAN*10, _HALF_GREATER_THAN, _GREATER_THAN*5#
+ */
+static void
+@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
+{
+    npy_intp i;
+    @type@ max_val = 0, min_val = 0;
+
+    if (max != NULL) {
+        max_val = *max;
+#if @isfloat@
+        /* NaNs result in no clipping, so optimize the case away */
+        if (@isnan@(max_val)) {
+            if (min == NULL) {
+                memmove(out, in, ni * sizeof(@type@));
+                return;
+            }
+            max = NULL;
+        }
+#endif
+    }
+    if (min != NULL) {
+        min_val = *min;
+#if @isfloat@
+        if (@isnan@(min_val)) {
+            if (max == NULL) {
+                memmove(out, in, ni * sizeof(@type@));
+                return;
+            }
+            min = NULL;
+        }
+#endif
+    }
+    if (max == NULL) {
+        for (i = 0; i < ni; i++) {
+            if (@lt@(in[i], min_val)) {
+                out[i] = min_val;
+            }
+            else {
+                out[i] = in[i];
+            }
+        }
+    }
+    else if (min == NULL) {
+        for (i = 0; i < ni; i++) {
+            if (@gt@(in[i], max_val)) {
+                out[i] = max_val;
+            }
+            else {
+                out[i] = in[i];
+            }
+        }
+    }
+    else {
+        /*
+         * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
+         * manner, see: https://github.com/numpy/numpy/issues/7601
+         */
+        #if (_MSC_VER == 1900)
+        #pragma loop( no_vector )
+        #endif
+        for (i = 0; i < ni; i++) {
+            if (@lt@(in[i], min_val)) {
+                out[i]   = min_val;
+            }
+            else if (@gt@(in[i], max_val)) {
+                out[i]   = max_val;
+            }
+            else {
+                out[i] = in[i];
+            }
+        }
+    }
+}
+
+/**end repeat**/
+#else
 /**begin repeat
  *
  * #name = BOOL,
@@ -3777,6 +3949,8 @@ static void
     }
 }
 /**end repeat**/
+#endif
+
 
 #undef _LESS_THAN
 #undef _GREATER_THAN

From 9ffbb310fe035f78e3f2fc5fdf3572d30fae79b8 Mon Sep 17 00:00:00 2001
From: ysingh7 <yashnitkkr7@gmail.com>
Date: Mon, 1 May 2017 11:52:28 -0700
Subject: [PATCH 2/5] Adding conditional inclusion of files for Windows,gcc
 built

---
 numpy/core/src/multiarray/arraytypes.c.src | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 6f161c5397d2..13e4a06c2936 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -23,7 +23,12 @@
 #include "_datetime.h"
 #include "arrayobject.h"
 #include "alloc.h"
-#include <x86intrin.h>
+#if defined(_MSC_VER)
+     /* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+     /* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
 #ifdef NPY_HAVE_SSE2_INTRINSICS
 #include <emmintrin.h>
 #endif

From c1afedc739ea76822a68f9c5022c7d1f991d1f53 Mon Sep 17 00:00:00 2001
From: ysingh7 <yashnitkkr7@gmail.com>
Date: Mon, 1 May 2017 12:01:04 -0700
Subject: [PATCH 3/5] Adding conditional inclusion of files for Windows,gcc
 built

---
 numpy/core/src/multiarray/arraytypes.c.src | 1 +
 1 file changed, 1 insertion(+)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 13e4a06c2936..e208b53c5845 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -29,6 +29,7 @@
 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
      /* GCC-compatible compiler, targeting x86/x86-64 */
      #include <x86intrin.h>
+#endif
 #ifdef NPY_HAVE_SSE2_INTRINSICS
 #include <emmintrin.h>
 #endif

From dcec35516f927bdc39eb657ee15be2fe89aa1e84 Mon Sep 17 00:00:00 2001
From: ysingh7 <yashnitkkr7@gmail.com>
Date: Mon, 1 May 2017 14:49:30 -0700
Subject: [PATCH 4/5] Added check for integer size = 32 and Moved the check
 HAVE_ATTRIBUTE_TARGET_AVX2 inside the function

---
 numpy/core/src/multiarray/arraytypes.c.src | 144 ++++-----------------
 1 file changed, 23 insertions(+), 121 deletions(-)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index e208b53c5845..11ef68d3f00b 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3707,14 +3707,30 @@ breaks down the Algorithm into two parts.
 7. For a simple benchmark, this algorithm gave around 40-50% performance boost.
 */
 
-
- 
-#if HAVE_ATTRIBUTE_TARGET_AVX2
+/**begin repeat
+ *
+ * #name = BOOL,
+ *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_half, npy_float, npy_double, npy_longdouble,
+ *         npy_datetime, npy_timedelta#
+ * #isfloat = 0*11, 1*4, 0*2#
+ * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
+ * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
+ * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5#
+ */
 static void
-INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out)
+@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
 {
     npy_intp i;
-    npy_int max_val = 0, min_val = 0;
+    @type@ max_val = 0, min_val = 0;
+
+#if HAVE_ATTRIBUTE_TARGET_AVX2 && @type@==npy_int && NPY_BITSOF_@type@ == 32 
     int unaligned = ni%8;
     __m128i vec_max,vec_min;
     __m256i vec_max_256,vec_min_256,vec_array;
@@ -3771,122 +3787,7 @@ INT_fastclip(npy_int *in, npy_intp ni, npy_int *min, npy_int *max, npy_int *out)
                 }
 
        }
-    }
-}
-/**begin repeat
- *
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT,UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         DATETIME, TIMEDELTA#
- * #type = npy_bool,
- *         npy_byte, npy_ubyte, npy_short, npy_ushort,npy_uint,
- *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_datetime, npy_timedelta#
- * #isfloat = 0*10, 1*4, 0*2#
- * #isnan = nop*10, npy_half_isnan, npy_isnan*3, nop*2#
- * #lt = _LESS_THAN*10, _HALF_LESS_THAN, _LESS_THAN*5#
- * #gt = _GREATER_THAN*10, _HALF_GREATER_THAN, _GREATER_THAN*5#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
-    npy_intp i;
-    @type@ max_val = 0, min_val = 0;
-
-    if (max != NULL) {
-        max_val = *max;
-#if @isfloat@
-        /* NaNs result in no clipping, so optimize the case away */
-        if (@isnan@(max_val)) {
-            if (min == NULL) {
-                memmove(out, in, ni * sizeof(@type@));
-                return;
-            }
-            max = NULL;
-        }
-#endif
-    }
-    if (min != NULL) {
-        min_val = *min;
-#if @isfloat@
-        if (@isnan@(min_val)) {
-            if (max == NULL) {
-                memmove(out, in, ni * sizeof(@type@));
-                return;
-            }
-            min = NULL;
-        }
-#endif
-    }
-    if (max == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (@lt@(in[i], min_val)) {
-                out[i] = min_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else if (min == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (@gt@(in[i], max_val)) {
-                out[i] = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else {
-        /*
-         * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
-         * manner, see: https://github.com/numpy/numpy/issues/7601
-         */
-        #if (_MSC_VER == 1900)
-        #pragma loop( no_vector )
-        #endif
-        for (i = 0; i < ni; i++) {
-            if (@lt@(in[i], min_val)) {
-                out[i]   = min_val;
-            }
-            else if (@gt@(in[i], max_val)) {
-                out[i]   = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-}
-
-/**end repeat**/
 #else
-/**begin repeat
- *
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         DATETIME, TIMEDELTA#
- * #type = npy_bool,
- *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_datetime, npy_timedelta#
- * #isfloat = 0*11, 1*4, 0*2#
- * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
- * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
- * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
-    npy_intp i;
-    @type@ max_val = 0, min_val = 0;
 
     if (max != NULL) {
         max_val = *max;
@@ -3953,9 +3854,10 @@ static void
             }
         }
     }
+#endif
 }
 /**end repeat**/
-#endif
+//#endif
 
 
 #undef _LESS_THAN

From b118a2e9751d57adc496d368d095e52a740cc4e6 Mon Sep 17 00:00:00 2001
From: ysingh7 <yashnitkkr7@gmail.com>
Date: Mon, 1 May 2017 17:47:52 -0700
Subject: [PATCH 5/5] Improved the indentation and added isint check

---
 numpy/core/src/multiarray/arraytypes.c.src | 139 +++++++++++----------
 1 file changed, 72 insertions(+), 67 deletions(-)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 11ef68d3f00b..41a938cc3653 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3694,19 +3694,6 @@ static void
 #define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b))
 #define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a))
 
-/* The following function provides Optimization for INT_fastclip using avx2 extension.
-First it checks if AVX2 attribute is present or not.If AVX2 support is present, then it
-breaks down the Algorithm into two parts.
-1. First it looks at total length of the input array and find out how many bytes are unaligned
-   with respect to 256 bytes. Because avx2 registers used here are 256 bytes in size.
-2. Then for the portion it is aligned with 256 bytes, it loops over the data and loads into one of the vector registers.
-3. It loads vector registers vec_max_256 as vector of maximum value and vec_min_max as vector of minimum values, each of 4 Bytes.
-4. Parallel comparision is made between loaded data and vec_max_256 first and do a vectorized min operation betwwen data and max value. If data is greater than max value , it gets replaced by max value, otherwise with data.
-5. We then use the register where result of above min comparison is tored and do a max operation between the data and min value. If data is lesser than the min value , it gets replaced by min value, otherwise with data. 
-6. After that we take care of the unaligned portion by simply looping over it, like the previous algorithm was doing.
-7. For a simple benchmark, this algorithm gave around 40-50% performance boost.
-*/
-
 /**begin repeat
  *
  * #name = BOOL,
@@ -3719,6 +3706,7 @@ breaks down the Algorithm into two parts.
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
  *         npy_half, npy_float, npy_double, npy_longdouble,
  *         npy_datetime, npy_timedelta#
+ * #isint = 1*10, 0*7#
  * #isfloat = 0*11, 1*4, 0*2#
  * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
  * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
@@ -3729,70 +3717,88 @@ static void
 {
     npy_intp i;
     @type@ max_val = 0, min_val = 0;
-
-#if HAVE_ATTRIBUTE_TARGET_AVX2 && @type@==npy_int && NPY_BITSOF_@type@ == 32 
-    int unaligned = ni%8;
-    __m128i vec_max,vec_min;
-    __m256i vec_max_256,vec_min_256,vec_array;
-
+    
+/* The following section provides an AVX2-optimized path for fastclip on 32-bit integer types. */
+    #if HAVE_ATTRIBUTE_TARGET_AVX2 && @isint@ && NPY_BITSOF_@type@ == 32
+
+/* Number of trailing elements that do not fill a whole 256-bit AVX2 register
+   (8 x 32-bit integers per register); they are handled by a scalar loop. */
+        int unaligned = ni%8;
+/* Declaration of the vector registers used by the AVX2 path */
+        __m128i vec_max,vec_min;
+        __m256i vec_max_256,vec_min_256,vec_array;
     if (max != NULL) {
-       max_val = *max;
-       vec_max = _mm_set1_epi32(max_val);
-       vec_max_256 = _mm256_broadcastd_epi32(vec_max);
+        max_val = *max;
+/* Fill every 32-bit lane of the vector register with the maximum value */
+        vec_max = _mm_set1_epi32(max_val);
+        vec_max_256 = _mm256_broadcastd_epi32(vec_max);
     }
     if (min != NULL) {
         min_val = *min;
+/* Fill every 32-bit lane of the vector register with the minimum value */
         vec_min = _mm_set1_epi32(min_val);
         vec_min_256 = _mm256_broadcastd_epi32(vec_min);
-	
     }
+
+/* Each iteration loads 8 elements into a vector register. If a minimum is
+   given, a vectorized max operation against vec_min_256 replaces every element
+   smaller than the minimum with the minimum. If a maximum is given, a
+   vectorized min operation against vec_max_256 replaces every element greater
+   than the maximum with the maximum. When both bounds are given, the max
+   operation is applied first and the min operation is applied to its result.
+   The trailing elements that do not fill a whole register are then handled by
+   a simple scalar loop, like the previous algorithm did. In a simple
+   benchmark, this algorithm gave around a 40-50% performance boost.
+*/
+
     if (max == NULL) {
-	for(i=0;i<ni/8;i++){
-           vec_array = _mm256_loadu_si256(in+(i*8));
-           vec_array = _mm256_max_epi32(vec_array,vec_min_256);
-           _mm256_storeu_si256(out+(i*8),vec_array);
-       }
-       for(npy_int j=0;j<unaligned;j++){
-             if (in[i*8+j]<min_val) {
+        for(i=0;i<ni/8;i++){
+            vec_array = _mm256_loadu_si256(in+(i*8));
+            vec_array = _mm256_max_epi32(vec_array,vec_min_256);
+            _mm256_storeu_si256(out+(i*8),vec_array);
+        }
+        for(npy_int j=0;j<unaligned;j++){
+            if (in[i*8+j]<min_val) {
                 out[i*8+j]   = min_val;
             }
-	}
-    	
+        }
     }
     else if (min == NULL) {
-	for(i=0;i<ni/8;i++){
-       vec_array = _mm256_loadu_si256(in+(i*8));
-       vec_array = _mm256_min_epi32(vec_array,vec_max_256);
-       _mm256_storeu_si256(out+(i*8),vec_array);
-   }
-       for(npy_int j=0;j<unaligned;j++){
-             if (in[i*8+j]>max_val) {
+        for(i=0;i<ni/8;i++){
+            vec_array = _mm256_loadu_si256(in+(i*8));
+            vec_array = _mm256_min_epi32(vec_array,vec_max_256);
+            _mm256_storeu_si256(out+(i*8),vec_array);
+        }
+        for(npy_int j=0;j<unaligned;j++){
+            if (in[i*8+j]>max_val) {
                 out[i*8+j]   = max_val;
             }
-	}
-   }
-   else {
-  	for(i=0;i<ni/8;i++){
-           vec_array = _mm256_loadu_si256(in+(i*8));
-           vec_array = _mm256_max_epi32(vec_array,vec_min_256);
-           vec_array = _mm256_min_epi32(vec_array,vec_max_256);
-           _mm256_storeu_si256(out+(i*8),vec_array);
-       }
-       for(npy_int j=0;j<unaligned;j++){
+        }
+    }
+    else {
+        for(i=0;i<ni/8;i++){
+            vec_array = _mm256_loadu_si256(in+(i*8));
+            vec_array = _mm256_max_epi32(vec_array,vec_min_256);
+            vec_array = _mm256_min_epi32(vec_array,vec_max_256);
+            _mm256_storeu_si256(out+(i*8),vec_array);
+        }
+        for(npy_int j=0;j<unaligned;j++){
             if (in[i*8+j]<min_val) {
-                    out[i*8+j]   = min_val;
-                }
-                else if (in[i*8+j]>max_val) {
-                    out[i*8+j]   = max_val;
-                }
+                out[i*8+j]   = min_val;
+            }
+            else if (in[i*8+j]>max_val) {
+                out[i*8+j]   = max_val;
+            }
+        }
+    }
 
-       }
-#else
+/**** The optimization portion ends here. ***/
 
+    #else
     if (max != NULL) {
         max_val = *max;
-#if @isfloat@
-        /* NaNs result in no clipping, so optimize the case away */
+        #if @isfloat@
+            /* NaNs result in no clipping, so optimize the case away */
         if (@isnan@(max_val)) {
             if (min == NULL) {
                 memmove(out, in, ni * sizeof(@type@));
@@ -3800,11 +3806,11 @@ static void
             }
             max = NULL;
         }
-#endif
+        #endif
     }
     if (min != NULL) {
         min_val = *min;
-#if @isfloat@
+        #if @isfloat@
         if (@isnan@(min_val)) {
             if (max == NULL) {
                 memmove(out, in, ni * sizeof(@type@));
@@ -3812,7 +3818,7 @@ static void
             }
             min = NULL;
         }
-#endif
+        #endif
     }
     if (max == NULL) {
         for (i = 0; i < ni; i++) {
@@ -3836,11 +3842,11 @@ static void
     }
     else {
         /*
-         * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
-         * manner, see: https://github.com/numpy/numpy/issues/7601
-         */
+        * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
+        * manner, see: https://github.com/numpy/numpy/issues/7601
+        */
         #if (_MSC_VER == 1900)
-        #pragma loop( no_vector )
+            #pragma loop( no_vector )
         #endif
         for (i = 0; i < ni; i++) {
             if (@lt@(in[i], min_val)) {
@@ -3854,10 +3860,9 @@ static void
             }
         }
     }
-#endif
+    #endif
 }
 /**end repeat**/
-//#endif
 
 
 #undef _LESS_THAN