diff --git a/doc/release/1.8.0-notes.rst b/doc/release/1.8.0-notes.rst index 624707219d03..127226054437 100644 --- a/doc/release/1.8.0-notes.rst +++ b/doc/release/1.8.0-notes.rst @@ -149,17 +149,18 @@ advantage of compiler builtins to avoid expensive calls to libc. This improves performance of these operations by about a factor of two on gnu libc systems. -Performance improvements to `sqrt` and `abs` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The `sqrt` and `abs` functions for unit stride elementary operations have been +Performance improvements to base math, `sqrt`, `absolute` and `minimum/maximum` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The base math (add, subtract, divide, multiply), `sqrt`, `absolute` and +`minimum/maximum` functions for unit stride elementary operations have been improved to make use of SSE2 CPU SIMD instructions. This improves performance of these operations up to 4x/2x for float32/float64 depending on the location of the data in the CPU caches. The performance gain is greatest for in-place operations. In order to use the improved functions the SSE2 instruction set must be enabled at compile time. It is enabled by default on x86_64 systems. On x86_32 with a -capable CPU it must be enabled by passing the appropriate flag to CFLAGS build -variable (-msse2 with gcc). +capable CPU it must be enabled by passing the appropriate flag to the CFLAGS +build variable (-msse2 with gcc). Changes ======= diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index c0287b8c8fb3..068ecde7c2ea 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1322,6 +1322,9 @@ NPY_NO_EXPORT void *((@type@ *)iop1) = io1; } else { + if (run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { + return; + } BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; @@ -1418,6 +1421,9 @@ NPY_NO_EXPORT void { /* */ if (IS_BINARY_REDUCE) { + if (run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { + return; + } BINARY_REDUCE_LOOP(@type@) { const @type@ in2 = *(@type@ *)ip2; io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? 
io1 : in2; @@ -1488,6 +1494,11 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { + char * margs[] = {args[0], args[0], args[1]}; + npy_intp msteps[] = {steps[0], steps[0], steps[1]}; + if (run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) { + return; + } UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; *((@type@ *)op1) = in1*in1; @@ -1497,6 +1508,12 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { + @type@ one = 1.@c@; + char * margs[] = {(char*)&one, args[0], args[1]}; + npy_intp msteps[] = {0, steps[0], steps[1]}; + if (run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) { + return; + } UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; *((@type@ *)op1) = 1/in1; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 916473a0b648..746943097e68 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -17,9 +17,14 @@ #include "lowlevel_strided_loops.h" #include "npy_config.h" +/* for NO_FLOATING_POINT_SUPPORT */ +#include "numpy/ufuncobject.h" #include #include +int PyUFunc_getfperr(void); +void PyUFunc_clearfperr(void); + /* * stride is equal to element size and input and destination are equal or * don't overlap within one register @@ -29,21 +34,41 @@ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0)))) +#define IS_BLOCKABLE_REDUCE(esize, vsize) \ + (steps[1] == (esize) && abs(args[1] - args[0]) >= (vsize)) + +#define IS_BLOCKABLE_BINARY(esize, vsize) \ + (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize)) && \ + (abs(args[2] - args[0]) >= (vsize) || abs(args[2] - args[0]) == 0) && \ + (abs(args[2] - args[1]) >= (vsize) || abs(args[2] - args[1]) >= 0)) + +#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ + (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + ((abs(args[2] - args[1]) >= (vsize)) || (abs(args[2] - args[1]) == 0)) && \ + abs(args[2] - args[0]) >= (esize)) + +#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ + (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ + ((abs(args[2] - args[0]) >= (vsize)) || (abs(args[2] - args[0]) == 0)) && \ + abs(args[2] - args[1]) >= (esize)) /* align var to alignment */ -#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ +#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ alignment, n);\ for(i = 0; i < peel; i++) -#define UNARY_LOOP_BLOCKED(type, vsize)\ +#define LOOP_BLOCKED(type, vsize)\ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ i += (vsize / sizeof(type))) -#define UNARY_LOOP_BLOCKED_END\ +#define LOOP_BLOCKED_END\ for (; i < n; i++) - /* * Dispatcher functions * decide whether the operation can be vectorized and run it @@ -58,28 +83,80 @@ */ /**begin repeat1 - * #func = sqrt, absolute# + * #func = sqrt, absolute, minimum, maximum# + * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE, IS_BLOCKABLE_REDUCE# + * #name = unary, unary, unary_reduce, unary_reduce# */ -#if @vector@ +#if 
@vector@ && defined HAVE_EMMINTRIN_H /* prototypes */ static void -sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n); +sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n); #endif static NPY_INLINE int -run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) { #if @vector@ && defined HAVE_EMMINTRIN_H - if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) { + if (@check@(sizeof(@type@), 16)) { sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); return 1; } #endif return 0; } + +/**end repeat1**/ + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ + +#if @vector@ && defined HAVE_EMMINTRIN_H + +/* prototypes */ +static void +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); + +#endif + +static NPY_INLINE int +run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if @vector@ && defined HAVE_EMMINTRIN_H + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), 16)) { + sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 16)) { + sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), 16)) { + sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#endif + return 0; +} + /**end repeat1**/ /**end repeat**/ @@ -89,6 +166,34 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) * Vectorized operations */ +#ifdef HAVE_EMMINTRIN_H +#include + +/**begin repeat +* horizontal reductions on a vector +* # VOP = min, max# +*/ + +static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v) +{ + npy_float r; + __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */ + __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */ + tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */ + _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... 
*/ + return r; +} + +static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) +{ + npy_double r; + __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */ + _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ + return r; +} + +/**end repeat**/ + /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# @@ -97,40 +202,160 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) * #vtype = __m128, __m128d# * #vpre = _mm, _mm# * #vsuf = ps, pd# + * #nan = NPY_NANF, NPY_NAN# */ -#ifdef HAVE_EMMINTRIN_H -#include +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ static void -sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], 16) && npy_is_aligned(&ip2[i], 16)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + + +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + + +void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = 
@vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ + +static void +sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { /* align output to 16 bytes */ - UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { op[i] = @scalarf@(ip[i]); } assert(npy_is_aligned(&op[i], 16)); if (npy_is_aligned(&ip[i], 16)) { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ d = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } } else { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } } - UNARY_LOOP_BLOCKED_END { + LOOP_BLOCKED_END { op[i] = @scalarf@(ip[i]); } } static void -sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) +sse2_absolute_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { /* * get 0x7FFFFFFF mask (everything but signbit set) @@ -140,34 +365,87 @@ sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); /* align output to 16 bytes */ - UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ op[i] = tmp + 0; } assert(npy_is_aligned(&op[i], 16)); if (npy_is_aligned(&ip[i], 16)) { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } else { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } - UNARY_LOOP_BLOCKED_END { + LOOP_BLOCKED_END { const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ op[i] = tmp + 0; } } + +/**begin repeat1 + * #kind = maximum, minimum# + * #VOP = max, min# + * #OP = >=, <=# + **/ +/* arguments swapped as unary reduce has the swapped compared to unary */ +static void +sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) { + *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i]; + } + assert(npy_is_aligned(&ip[i], 16)); + if (i + 2 * 16 / sizeof(@type@) <= n) { + /* load the first elements */ + @vtype@ c = @vpre@_load_@vsuf@((@type@*)&ip[i]); +#ifdef NO_FLOATING_POINT_SUPPORT + @vtype@ cnan = @vpre@_cmpneq_@vsuf@(c, c); +#else + /* minps/minpd will set invalid flag if nan is encountered */ + PyUFunc_clearfperr(); #endif + i += 16 / sizeof(@type@); + LOOP_BLOCKED(@type@, 16) { + @vtype@ v = @vpre@_load_@vsuf@((@type@*)&ip[i]); + c = @vpre@_@VOP@_@vsuf@(c, v); +#ifdef NO_FLOATING_POINT_SUPPORT + /* check for nan, breaking the loop makes non nan case slow */ + cnan = @vpre@_or_@vsuf@(@vpre@_cmpneq_@vsuf@(v, v), cnan); + } + + if (@vpre@_movemask_@vsuf@(cnan)) { + *op = @nan@; + return; + } +#else + } +#endif + { + @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c); + if (PyUFunc_getfperr() & UFUNC_FPE_INVALID) + *op = @nan@; + else + *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp; + } + } + LOOP_BLOCKED_END { + *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? 
*op : ip[i]; + } +} +/**end repeat1**/ /**end repeat**/ +#endif /* HAVE_EMMINTRIN_H */ + #endif diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index 201861279557..952a89999472 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -2,6 +2,7 @@ import sys from numpy.testing import * +from numpy.testing.utils import gen_alignment_data import numpy as np types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc, @@ -44,6 +45,37 @@ def test_type_create(self, level=1): assert_equal(a,b) +class TestBaseMath(TestCase): + def test_blocked(self): + #test alignments offsets for simd instructions + for dt in [np.float32, np.float64]: + for out, inp1, inp2, msg in gen_alignment_data(dtype=dt, + type='binary', + max_size=12): + exp1 = np.ones_like(inp1) + inp1[...] = np.ones_like(inp1) + inp2[...] = np.zeros_like(inp2) + assert_almost_equal(np.add(inp1, inp2), exp1, err_msg=msg) + assert_almost_equal(np.add(inp1, 1), exp1 + 1, err_msg=msg) + assert_almost_equal(np.add(1, inp2), exp1, err_msg=msg) + + np.add(inp1, inp2, out=out) + assert_almost_equal(out, exp1, err_msg=msg) + + inp2[...] += np.arange(inp2.size, dtype=dt) + 1 + assert_almost_equal(np.square(inp2), + np.multiply(inp2, inp2), err_msg=msg) + assert_almost_equal(np.reciprocal(inp2), + np.divide(1, inp2), err_msg=msg) + + inp1[...] = np.ones_like(inp1) + inp2[...] = np.zeros_like(inp2) + np.add(inp1, 1, out=out) + assert_almost_equal(out, exp1 + 1, err_msg=msg) + np.add(1, inp2, out=out) + assert_almost_equal(out, exp1, err_msg=msg) + + class TestPower(TestCase): def test_small_types(self): for t in [np.int8, np.int16, np.float16]: diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 6bbb15e6bffd..c58a0d3f5c6d 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -687,6 +687,27 @@ def test_sign(self): np.seterr(**olderr) +class TestMinMax(TestCase): + def test_minmax_blocked(self): + "simd tests on max/min" + for dt in [np.float32, np.float64]: + for out, inp, msg in gen_alignment_data(dtype=dt, type='unary', + max_size=17): + for i in range(inp.size): + inp[:] = np.arange(inp.size, dtype=dt) + inp[i] = np.nan + self.assertTrue(np.isnan(inp.max()), + msg=repr(inp) + '\n' + msg) + self.assertTrue(np.isnan(inp.min()), + msg=repr(inp) + '\n' + msg) + + inp[i] = 1e10 + assert_equal(inp.max(), 1e10, err_msg=msg) + inp[i] = -1e10 + assert_equal(inp.min(), -1e10, err_msg=msg) + + + class TestAbsolute(TestCase): def test_abs_blocked(self): "simd tests on abs"
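
For reference, here is a minimal standalone sketch of the blocked-loop pattern that the templated sse2_binary_@kind@_@TYPE@ routines expand to, written out by hand for the float/add case. The peel arithmetic is simplified (the real code uses npy_aligned_block_offset/npy_blocked_end and dispatches through IS_BLOCKABLE_BINARY, with an extra ip1 == ip2 specialization), and it assumes the pointers are at least element-aligned, as the dispatcher's checks guarantee:

    /*
     * Illustrative only: hand-expanded float/add version of the blocked
     * SSE2 loop.  Assumes op is at least 4-byte aligned (the real
     * IS_BLOCKABLE_* checks ensure this before dispatching here).
     */
    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    static void
    add_float_unit_stride(float *op, const float *ip1, const float *ip2,
                          size_t n)
    {
        size_t i = 0;
        /* peel: scalar iterations until the output is 16-byte aligned */
        for (; i < n && ((uintptr_t)&op[i] % 16) != 0; i++) {
            op[i] = ip1[i] + ip2[i];
        }
        /* main loop: four floats per 128-bit register, aligned stores */
        if (((uintptr_t)&ip1[i] % 16) == 0 && ((uintptr_t)&ip2[i] % 16) == 0) {
            for (; i + 4 <= n; i += 4) {
                __m128 a = _mm_load_ps(&ip1[i]);    /* aligned loads */
                __m128 b = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(a, b));
            }
        }
        else {
            for (; i + 4 <= n; i += 4) {
                __m128 a = _mm_loadu_ps(&ip1[i]);   /* unaligned loads */
                __m128 b = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(a, b));
            }
        }
        /* tail: elements that do not fill a complete vector */
        for (; i < n; i++) {
            op[i] = ip1[i] + ip2[i];
        }
    }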
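
The maximum/minimum reduction relies on the fact that maxps/minps never return NaN but do raise the floating-point invalid flag when they compare against one, which is why the vector body is bracketed by PyUFunc_clearfperr()/PyUFunc_getfperr(). A rough standalone sketch of the same idea, using the C99 <fenv.h> flag interface in place of the ufunc error helpers and with the alignment peel omitted:

    /*
     * Sketch of the NaN handling in the max reduction: _mm_max_ps() never
     * returns NaN, but it sets FE_INVALID when an operand is NaN, so one
     * flag check after the loop propagates NaN the same way the scalar
     * BINARY_REDUCE_LOOP does.  Assumes n >= 4 and ip 16-byte aligned;
     * the real code peels until that holds.
     */
    #include <emmintrin.h>
    #include <fenv.h>
    #include <math.h>
    #include <stddef.h>

    static float
    max_reduce_float(const float *ip, size_t n)
    {
        size_t i;
        float r;
        __m128 c, tmp, m;

        feclearexcept(FE_INVALID);
        c = _mm_load_ps(&ip[0]);
        for (i = 4; i + 4 <= n; i += 4) {
            /* raises FE_INVALID if a NaN is compared */
            c = _mm_max_ps(c, _mm_load_ps(&ip[i]));
        }
        /* horizontal max of the four lanes, as in sse2_horizontal_max___m128 */
        tmp = _mm_movehl_ps(c, c);                           /* [c2 c3 c2 c3]     */
        m   = _mm_max_ps(c, tmp);                            /* [m02 m13 ...]     */
        tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1)); /* broadcast m13     */
        _mm_store_ss(&r, _mm_max_ps(tmp, m));                /* lane 0 = max all  */

        if (fetestexcept(FE_INVALID)) {
            r = NAN;                      /* some input was NaN */
        }
        for (; i < n; i++) {              /* scalar tail, NaN-propagating */
            r = (r >= ip[i] || isnan(r)) ? r : ip[i];
        }
        return r;
    }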
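
Finally, the sse2_absolute_@TYPE@ loop builds its mask by broadcasting -0.0, which has only the sign bit set, so andnot clears just that bit in every lane and (like the scalar branch's "tmp + 0") maps -0.0 to +0.0. A plain-C illustration of the float case, vector body only:

    /*
     * Sketch of the sign-mask trick from sse2_absolute_FLOAT: -0.0f is
     * 0x80000000, so andnot(mask, a) keeps the 0x7FFFFFFF bits of a,
     * giving |a| and turning -0.0 into +0.0.  The alignment peel is
     * omitted; unaligned loads/stores are used throughout for brevity.
     */
    #include <emmintrin.h>
    #include <stddef.h>

    static void
    abs_float_blocked(float *op, const float *ip, size_t n)
    {
        const __m128 mask = _mm_set1_ps(-0.0f);   /* sign bit set in each lane */
        size_t i;
        for (i = 0; i + 4 <= n; i += 4) {
            __m128 a = _mm_loadu_ps(&ip[i]);
            _mm_storeu_ps(&op[i], _mm_andnot_ps(mask, a));
        }
        for (; i < n; i++) {                      /* scalar tail */
            op[i] = ip[i] > 0 ? ip[i] : -ip[i];
            op[i] += 0;                           /* clear -0.0, as in the C loop */
        }
    }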