diff --git a/doc/release/1.8.0-notes.rst b/doc/release/1.8.0-notes.rst index 624707219d03..127226054437 100644 --- a/doc/release/1.8.0-notes.rst +++ b/doc/release/1.8.0-notes.rst @@ -149,17 +149,18 @@ advantage of compiler builtins to avoid expensive calls to libc. This improves performance of these operations by about a factor of two on gnu libc systems. -Performance improvements to `sqrt` and `abs` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The `sqrt` and `abs` functions for unit stride elementary operations have been +Performance improvements to base math, `sqrt`, `absolute` and `minimum/maximum` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The base math (add, subtract, divide, multiply), `sqrt`, `absolute` and +`minimum/maximum` functions for unit stride elementary operations have been improved to make use of SSE2 CPU SIMD instructions. This improves performance of these operations up to 4x/2x for float32/float64 depending on the location of the data in the CPU caches. The performance gain is greatest for in-place operations. In order to use the improved functions the SSE2 instruction set must be enabled at compile time. It is enabled by default on x86_64 systems. On x86_32 with a -capable CPU it must be enabled by passing the appropriate flag to CFLAGS build -variable (-msse2 with gcc). +capable CPU it must be enabled by passing the appropriate flag to the CFLAGS +build variable (-msse2 with gcc). Changes ======= diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index c0287b8c8fb3..068ecde7c2ea 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1322,6 +1322,9 @@ NPY_NO_EXPORT void *((@type@ *)iop1) = io1; } else { + if (run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { + return; + } BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; @@ -1418,6 +1421,9 @@ NPY_NO_EXPORT void { /* */ if (IS_BINARY_REDUCE) { + if (run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { + return; + } BINARY_REDUCE_LOOP(@type@) { const @type@ in2 = *(@type@ *)ip2; io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? 
io1 : in2; @@ -1488,6 +1494,11 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { + char * margs[] = {args[0], args[0], args[1]}; + npy_intp msteps[] = {steps[0], steps[0], steps[1]}; + if (run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) { + return; + } UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; *((@type@ *)op1) = in1*in1; @@ -1497,6 +1508,12 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { + @type@ one = 1.@c@; + char * margs[] = {(char*)&one, args[0], args[1]}; + npy_intp msteps[] = {0, steps[0], steps[1]}; + if (run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) { + return; + } UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; *((@type@ *)op1) = 1/in1; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 916473a0b648..746943097e68 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -17,9 +17,14 @@ #include "lowlevel_strided_loops.h" #include "npy_config.h" +/* for NO_FLOATING_POINT_SUPPORT */ +#include "numpy/ufuncobject.h" #include #include +int PyUFunc_getfperr(void); +void PyUFunc_clearfperr(void); + /* * stride is equal to element size and input and destination are equal or * don't overlap within one register @@ -29,21 +34,41 @@ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0)))) +#define IS_BLOCKABLE_REDUCE(esize, vsize) \ + (steps[1] == (esize) && abs(args[1] - args[0]) >= (vsize)) + +#define IS_BLOCKABLE_BINARY(esize, vsize) \ + (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize)) && \ + (abs(args[2] - args[0]) >= (vsize) || abs(args[2] - args[0]) == 0) && \ + (abs(args[2] - args[1]) >= (vsize) || abs(args[2] - args[1]) >= 0)) + +#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ + (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + ((abs(args[2] - args[1]) >= (vsize)) || (abs(args[2] - args[1]) == 0)) && \ + abs(args[2] - args[0]) >= (esize)) + +#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ + (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ + ((abs(args[2] - args[0]) >= (vsize)) || (abs(args[2] - args[0]) == 0)) && \ + abs(args[2] - args[1]) >= (esize)) /* align var to alignment */ -#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ +#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ alignment, n);\ for(i = 0; i < peel; i++) -#define UNARY_LOOP_BLOCKED(type, vsize)\ +#define LOOP_BLOCKED(type, vsize)\ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ i += (vsize / sizeof(type))) -#define UNARY_LOOP_BLOCKED_END\ +#define LOOP_BLOCKED_END\ for (; i < n; i++) - /* * Dispatcher functions * decide whether the operation can be vectorized and run it @@ -58,28 +83,80 @@ */ /**begin repeat1 - * #func = sqrt, absolute# + * #func = sqrt, absolute, minimum, maximum# + * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE, IS_BLOCKABLE_REDUCE# + * #name = unary, unary, unary_reduce, unary_reduce# */ -#if @vector@ +#if 
@vector@ && defined HAVE_EMMINTRIN_H /* prototypes */ static void -sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n); +sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n); #endif static NPY_INLINE int -run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) { #if @vector@ && defined HAVE_EMMINTRIN_H - if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) { + if (@check@(sizeof(@type@), 16)) { sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); return 1; } #endif return 0; } + +/**end repeat1**/ + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ + +#if @vector@ && defined HAVE_EMMINTRIN_H + +/* prototypes */ +static void +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); + +#endif + +static NPY_INLINE int +run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if @vector@ && defined HAVE_EMMINTRIN_H + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), 16)) { + sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 16)) { + sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), 16)) { + sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#endif + return 0; +} + /**end repeat1**/ /**end repeat**/ @@ -89,6 +166,34 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) * Vectorized operations */ +#ifdef HAVE_EMMINTRIN_H +#include + +/**begin repeat +* horizontal reductions on a vector +* # VOP = min, max# +*/ + +static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v) +{ + npy_float r; + __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */ + __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */ + tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */ + _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... 
*/ + return r; +} + +static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) +{ + npy_double r; + __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */ + _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ + return r; +} + +/**end repeat**/ + /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# @@ -97,40 +202,160 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) * #vtype = __m128, __m128d# * #vpre = _mm, _mm# * #vsuf = ps, pd# + * #nan = NPY_NANF, NPY_NAN# */ -#ifdef HAVE_EMMINTRIN_H -#include +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ static void -sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], 16) && npy_is_aligned(&ip2[i], 16)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + + +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + + +void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], 16)) { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = 
@vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ + +static void +sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { /* align output to 16 bytes */ - UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { op[i] = @scalarf@(ip[i]); } assert(npy_is_aligned(&op[i], 16)); if (npy_is_aligned(&ip[i], 16)) { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ d = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } } else { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } } - UNARY_LOOP_BLOCKED_END { + LOOP_BLOCKED_END { op[i] = @scalarf@(ip[i]); } } static void -sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) +sse2_absolute_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { /* * get 0x7FFFFFFF mask (everything but signbit set) @@ -140,34 +365,87 @@ sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); /* align output to 16 bytes */ - UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ op[i] = tmp + 0; } assert(npy_is_aligned(&op[i], 16)); if (npy_is_aligned(&ip[i], 16)) { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } else { - UNARY_LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, 16) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } - UNARY_LOOP_BLOCKED_END { + LOOP_BLOCKED_END { const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ op[i] = tmp + 0; } } + +/**begin repeat1 + * #kind = maximum, minimum# + * #VOP = max, min# + * #OP = >=, <=# + **/ +/* arguments swapped as unary reduce has the swapped compared to unary */ +static void +sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) { + *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i]; + } + assert(npy_is_aligned(&ip[i], 16)); + if (i + 2 * 16 / sizeof(@type@) <= n) { + /* load the first elements */ + @vtype@ c = @vpre@_load_@vsuf@((@type@*)&ip[i]); +#ifdef NO_FLOATING_POINT_SUPPORT + @vtype@ cnan = @vpre@_cmpneq_@vsuf@(c, c); +#else + /* minps/minpd will set invalid flag if nan is encountered */ + PyUFunc_clearfperr(); #endif + i += 16 / sizeof(@type@); + LOOP_BLOCKED(@type@, 16) { + @vtype@ v = @vpre@_load_@vsuf@((@type@*)&ip[i]); + c = @vpre@_@VOP@_@vsuf@(c, v); +#ifdef NO_FLOATING_POINT_SUPPORT + /* check for nan, breaking the loop makes non nan case slow */ + cnan = @vpre@_or_@vsuf@(@vpre@_cmpneq_@vsuf@(v, v), cnan); + } + + if (@vpre@_movemask_@vsuf@(cnan)) { + *op = @nan@; + return; + } +#else + } +#endif + { + @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c); + if (PyUFunc_getfperr() & UFUNC_FPE_INVALID) + *op = @nan@; + else + *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp; + } + } + LOOP_BLOCKED_END { + *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? 
*op : ip[i]; + } +} +/**end repeat1**/ /**end repeat**/ +#endif /* HAVE_EMMINTRIN_H */ + #endif diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index 201861279557..952a89999472 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -2,6 +2,7 @@ import sys from numpy.testing import * +from numpy.testing.utils import gen_alignment_data import numpy as np types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc, @@ -44,6 +45,37 @@ def test_type_create(self, level=1): assert_equal(a,b) +class TestBaseMath(TestCase): + def test_blocked(self): + #test alignments offsets for simd instructions + for dt in [np.float32, np.float64]: + for out, inp1, inp2, msg in gen_alignment_data(dtype=dt, + type='binary', + max_size=12): + exp1 = np.ones_like(inp1) + inp1[...] = np.ones_like(inp1) + inp2[...] = np.zeros_like(inp2) + assert_almost_equal(np.add(inp1, inp2), exp1, err_msg=msg) + assert_almost_equal(np.add(inp1, 1), exp1 + 1, err_msg=msg) + assert_almost_equal(np.add(1, inp2), exp1, err_msg=msg) + + np.add(inp1, inp2, out=out) + assert_almost_equal(out, exp1, err_msg=msg) + + inp2[...] += np.arange(inp2.size, dtype=dt) + 1 + assert_almost_equal(np.square(inp2), + np.multiply(inp2, inp2), err_msg=msg) + assert_almost_equal(np.reciprocal(inp2), + np.divide(1, inp2), err_msg=msg) + + inp1[...] = np.ones_like(inp1) + inp2[...] = np.zeros_like(inp2) + np.add(inp1, 1, out=out) + assert_almost_equal(out, exp1 + 1, err_msg=msg) + np.add(1, inp2, out=out) + assert_almost_equal(out, exp1, err_msg=msg) + + class TestPower(TestCase): def test_small_types(self): for t in [np.int8, np.int16, np.float16]: diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 6bbb15e6bffd..c58a0d3f5c6d 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -687,6 +687,27 @@ def test_sign(self): np.seterr(**olderr) +class TestMinMax(TestCase): + def test_minmax_blocked(self): + "simd tests on max/min" + for dt in [np.float32, np.float64]: + for out, inp, msg in gen_alignment_data(dtype=dt, type='unary', + max_size=17): + for i in range(inp.size): + inp[:] = np.arange(inp.size, dtype=dt) + inp[i] = np.nan + self.assertTrue(np.isnan(inp.max()), + msg=repr(inp) + '\n' + msg) + self.assertTrue(np.isnan(inp.min()), + msg=repr(inp) + '\n' + msg) + + inp[i] = 1e10 + assert_equal(inp.max(), 1e10, err_msg=msg) + inp[i] = -1e10 + assert_equal(inp.min(), -1e10, err_msg=msg) + + + class TestAbsolute(TestCase): def test_abs_blocked(self): "simd tests on abs"
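
For reference, here is a minimal standalone sketch of the blocked-loop pattern that the templated sse2_binary_@kind@_@TYPE@ routines expand to, written out by hand for the float/add case. The peel arithmetic is simplified (the real code uses npy_aligned_block_offset/npy_blocked_end and dispatches through IS_BLOCKABLE_BINARY, with an extra ip1 == ip2 specialization), and it assumes the pointers are at least element-aligned, as the dispatcher's checks guarantee:

    /*
     * Illustrative only: hand-expanded float/add version of the blocked
     * SSE2 loop.  Assumes op is at least 4-byte aligned (the real
     * IS_BLOCKABLE_* checks ensure this before dispatching here).
     */
    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    static void
    add_float_unit_stride(float *op, const float *ip1, const float *ip2,
                          size_t n)
    {
        size_t i = 0;
        /* peel: scalar iterations until the output is 16-byte aligned */
        for (; i < n && ((uintptr_t)&op[i] % 16) != 0; i++) {
            op[i] = ip1[i] + ip2[i];
        }
        /* main loop: four floats per 128-bit register, aligned stores */
        if (((uintptr_t)&ip1[i] % 16) == 0 && ((uintptr_t)&ip2[i] % 16) == 0) {
            for (; i + 4 <= n; i += 4) {
                __m128 a = _mm_load_ps(&ip1[i]);    /* aligned loads */
                __m128 b = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(a, b));
            }
        }
        else {
            for (; i + 4 <= n; i += 4) {
                __m128 a = _mm_loadu_ps(&ip1[i]);   /* unaligned loads */
                __m128 b = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(a, b));
            }
        }
        /* tail: elements that do not fill a complete vector */
        for (; i < n; i++) {
            op[i] = ip1[i] + ip2[i];
        }
    }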
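
The maximum/minimum reduction relies on the fact that maxps/minps never return NaN but do raise the floating-point invalid flag when they compare against one, which is why the vector body is bracketed by PyUFunc_clearfperr()/PyUFunc_getfperr(). A rough standalone sketch of the same idea, using the C99 <fenv.h> flag interface in place of the ufunc error helpers and with the alignment peel omitted:

    /*
     * Sketch of the NaN handling in the max reduction: _mm_max_ps() never
     * returns NaN, but it sets FE_INVALID when an operand is NaN, so one
     * flag check after the loop propagates NaN the same way the scalar
     * BINARY_REDUCE_LOOP does.  Assumes n >= 4 and ip 16-byte aligned;
     * the real code peels until that holds.
     */
    #include <emmintrin.h>
    #include <fenv.h>
    #include <math.h>
    #include <stddef.h>

    static float
    max_reduce_float(const float *ip, size_t n)
    {
        size_t i;
        float r;
        __m128 c, tmp, m;

        feclearexcept(FE_INVALID);
        c = _mm_load_ps(&ip[0]);
        for (i = 4; i + 4 <= n; i += 4) {
            /* raises FE_INVALID if a NaN is compared */
            c = _mm_max_ps(c, _mm_load_ps(&ip[i]));
        }
        /* horizontal max of the four lanes, as in sse2_horizontal_max___m128 */
        tmp = _mm_movehl_ps(c, c);                           /* [c2 c3 c2 c3]     */
        m   = _mm_max_ps(c, tmp);                            /* [m02 m13 ...]     */
        tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1)); /* broadcast m13     */
        _mm_store_ss(&r, _mm_max_ps(tmp, m));                /* lane 0 = max all  */

        if (fetestexcept(FE_INVALID)) {
            r = NAN;                      /* some input was NaN */
        }
        for (; i < n; i++) {              /* scalar tail, NaN-propagating */
            r = (r >= ip[i] || isnan(r)) ? r : ip[i];
        }
        return r;
    }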
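
Finally, the sse2_absolute_@TYPE@ loop builds its mask by broadcasting -0.0, which has only the sign bit set, so andnot clears just that bit in every lane and (like the scalar branch's "tmp + 0") maps -0.0 to +0.0. A plain-C illustration of the float case, vector body only:

    /*
     * Sketch of the sign-mask trick from sse2_absolute_FLOAT: -0.0f is
     * 0x80000000, so andnot(mask, a) keeps the 0x7FFFFFFF bits of a,
     * giving |a| and turning -0.0 into +0.0.  The alignment peel is
     * omitted; unaligned loads/stores are used throughout for brevity.
     */
    #include <emmintrin.h>
    #include <stddef.h>

    static void
    abs_float_blocked(float *op, const float *ip, size_t n)
    {
        const __m128 mask = _mm_set1_ps(-0.0f);   /* sign bit set in each lane */
        size_t i;
        for (i = 0; i + 4 <= n; i += 4) {
            __m128 a = _mm_loadu_ps(&ip[i]);
            _mm_storeu_ps(&op[i], _mm_andnot_ps(mask, a));
        }
        for (; i < n; i++) {                      /* scalar tail */
            op[i] = ip[i] > 0 ? ip[i] : -ip[i];
            op[i] += 0;                           /* clear -0.0, as in the C loop */
        }
    }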