From c37fb93043e4526a9f8569a2646084aa9e8e31cc Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 24 May 2019 14:37:48 -0700 Subject: [PATCH 1/8] ENH: use AVX for sqrt, square, reciprocal and absolute value --- numpy/core/code_generators/generate_umath.py | 8 +- numpy/core/src/umath/loops.c.src | 56 ++- numpy/core/src/umath/loops.h.src | 13 + numpy/core/src/umath/simd.inc.src | 404 +++++++++++++++++-- numpy/core/tests/test_umath.py | 90 ++++- 5 files changed, 520 insertions(+), 51 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 6729fe197c88..1c87a148da8e 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -358,14 +358,14 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.square'), None, - TD(ints+inexact, simd=[('avx2', ints)]), + TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]), TD(O, f='Py_square'), ), 'reciprocal': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.reciprocal'), None, - TD(ints+inexact, simd=[('avx2', ints)]), + TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]), TD(O, f='Py_reciprocal'), ), # This is no longer used as numpy.ones_like, however it is @@ -395,7 +395,7 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.absolute'), 'PyUFunc_AbsoluteTypeResolver', - TD(bints+flts+timedeltaonly), + TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(cmplx, out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), @@ -762,7 +762,7 @@ def english_upper(s): docstrings.get('numpy.core.umath.sqrt'), None, TD('e', f='sqrt', astype={'e':'f'}), - TD(inexactvec), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg' + cmplx, f='sqrt'), TD(P, f='sqrt'), ), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5443223ab52e..ad2ec9e4a0b6 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1656,6 +1656,60 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS# */ +/**begin repeat1 + * #TYPE = FLOAT, DOUBLE# + * #type = npy_float, npy_double# + * #typesub = f, # + */ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = npy_sqrt@typesub@(in1); + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ tmp = in1 > 0 ? in1 : -in1; + /* add 0 to clear -0.0 */ + *((@type@ *)op1) = tmp + 0; + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = in1*in1; + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = 1.0f/in1; + } + } +} + +/**end repeat1**/ + /**begin repeat1 * #func = exp, log# * #scalarf = npy_expf, npy_logf# @@ -1706,8 +1760,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY } /**end repeat1**/ - - /**end repeat**/ /**begin repeat diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 5070ab38b918..fe1b1145d7d1 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -175,6 +175,19 @@ NPY_NO_EXPORT void */ NPY_NO_EXPORT void @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #isa = avx512f, fma# + */ + +/**begin repeat2 + * #func = sqrt, absolute, square, reciprocal# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**end repeat2**/ +/**end repeat1**/ /**end repeat**/ /**begin repeat diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 88e5e1f1bb86..c0dc53dd8757 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -138,6 +138,37 @@ abs_ptrdiff(char *a, char *b) /* prototypes */ +/**begin repeat1 + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + */ + +/**begin repeat2 + * #func = sqrt, absolute, square, reciprocal# + */ + +#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_INLINE void +@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride); +#endif + +static NPY_INLINE int +run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) { + @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); + return 1; + } + else + return 0; +#endif + return 0; +} + +/**end repeat2**/ +/**end repeat1**/ + /**begin repeat1 * #func = exp, log# */ @@ -1144,47 +1175,94 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_full_load_mask(void) +fma_get_full_load_mask_ps(void) { return _mm256_set1_ps(-1.0); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_get_full_load_mask_pd(void) +{ + return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem) +fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes) { float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float* addr = maskint + total_elem - num_lanes; + float* addr = maskint + num_lanes - num_elem; return _mm256_loadu_ps(addr); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes) +{ + npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1}; + npy_int* addr = maskint + 2*num_lanes - 2*num_elem; + return _mm256_loadu_si256((__m256i*) addr); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_gather(__m256 src, - npy_float* addr, - __m256i vindex, - __m256 mask) +fma_masked_gather_ps(__m256 src, + npy_float* addr, + __m256i vindex, + __m256 mask) { return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_masked_gather_pd(__m256d src, + npy_double* addr, + __m128i vindex, + __m256d mask) +{ + return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_load(__m256 mask, npy_float* addr) +fma_masked_load_ps(__m256 mask, npy_float* addr) { return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_masked_load_pd(__m256i mask, npy_double* addr) +{ + return _mm256_maskload_pd(addr, mask); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask) +fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) { return _mm256_blendv_ps(x, val, mask); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) +{ + return _mm256_blendv_pd(x, val, mask); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 fma_blend(__m256 x, __m256 y, __m256 ymask) { return _mm256_blendv_ps(x, y, ymask); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 +fma_invert_mask_ps(__m256 ymask) +{ + return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_invert_mask_pd(__m256i ymask) +{ + return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp) { @@ -1290,48 +1368,109 @@ fma_scalef_ps(__m256 poly, __m256 quadrant) } } +/**begin repeat + * #vsub = ps, pd# + * #vtype = __m256, __m256d# + */ +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_abs_@vsub@(@vtype@ x) +{ + return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_reciprocal_@vsub@(@vtype@ x) +{ + return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x); +} +/**end repeat**/ #endif #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_full_load_mask(void) +avx512_get_full_load_mask_ps(void) { return 0xFFFF; } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_full_load_mask_pd(void) +{ + return 0xFF; +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) +avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) { return (0x0001 << num_elem) - 0x0001; } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) +{ + return (0x01 << num_elem) - 0x01; +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_gather(__m512 src, - npy_float* addr, - __m512i vindex, - __mmask16 kmask) +avx512_masked_gather_ps(__m512 src, + npy_float* addr, + __m512i vindex, + __mmask16 kmask) { return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_masked_gather_pd(__m512d src, + npy_double* addr, + __m256i vindex, + __mmask8 kmask) +{ + return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_load(__mmask16 mask, npy_float* addr) +avx512_masked_load_ps(__mmask16 mask, npy_float* addr) { return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +{ + return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask) +avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) { return _mm512_mask_blend_ps(mask, x, val); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) +{ + return _mm512_mask_blend_pd(mask, x, val); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_blend(__m512 x, __m512 y, __mmask16 ymask) { return _mm512_mask_mov_ps(x, ymask, y); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_invert_mask_ps(__mmask16 ymask) +{ + return _mm512_knot(ymask); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_invert_mask_pd(__mmask8 ymask) +{ + return _mm512_knot(ymask); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp) { @@ -1361,6 +1500,22 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant) { return _mm512_scalef_ps(poly, quadrant); } +/**begin repeat + * #vsub = ps, pd# + * #vtype= __m512, __m512d# + */ +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_abs_@vsub@(@vtype@ x) +{ + return _mm512_abs_@vsub@(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_reciprocal_@vsub@(@vtype@ x) +{ + return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); +} +/**end repeat**/ #endif /**begin repeat @@ -1438,9 +1593,175 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ sin = @fmadd@(sin, x, x); return sin; } + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +@isa@_sqrt_ps(@vtype@ x) +{ + return _mm@vsize@_sqrt_ps(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d +@isa@_sqrt_pd(@vtype@d x) +{ + return _mm@vsize@_sqrt_pd(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +@isa@_square_ps(@vtype@ x) +{ + return _mm@vsize@_mul_ps(x,x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d +@isa@_square_pd(@vtype@d x) +{ + return _mm@vsize@_mul_pd(x,x); +} + #endif /**end repeat**/ + +/**begin repeat + * #ISA = FMA, AVX512F# + * #isa = fma, avx512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #mask = __m256, __mmask16# + * #vsub = , _mask# + * #vtype = __m256, __m512# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# + * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# + */ + +/**begin repeat1 + * #func = sqrt, absolute, square, reciprocal# + * #vectorf = sqrt, abs, square, reciprocal# + */ + +#if defined @CHK@ +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_@func@_FLOAT(npy_float* op, + npy_float* ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/sizeof(npy_float); + const npy_int num_lanes = @BYTES@/sizeof(npy_float); + npy_intp num_remaining_elements = array_size; + @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); + @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask); + npy_int indexarr[16]; + for (npy_int ii = 0; ii < 16; ii++) { + indexarr[ii] = ii*stride; + } + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); + inv_load_mask = @isa@_invert_mask_ps(load_mask); + } + @vtype@ x; + if (stride == 1) { + x = @isa@_masked_load_ps(load_mask, ip); + /* + * Replace masked elements with 1.0f to avoid divide by zero fp + * exception in reciprocal + */ + x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask); + } + else { + x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); + } + @vtype@ out = @isa@_@vectorf@_ps(x); + @masked_store@(op, @cvtps_epi32@(load_mask), out); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} +#endif +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #ISA = FMA, AVX512F# + * #isa = fma, avx512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #mask = __m256i, __mmask8# + * #vsub = , _mask# + * #vtype = __m256d, __m512d# + * #vindextype = __m128i, __m256i# + * #vindexsize = 128, 256# + * #vindexload = _mm_loadu_si128, _mm256_loadu_si256# + * #cvtps_epi32 = _mm256_cvtpd_epi32, # + * #castmask = _mm256_castsi256_pd, # + * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd# + * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# + */ + +/**begin repeat1 + * #func = sqrt, absolute, square, reciprocal# + * #vectorf = sqrt, abs, square, reciprocal# + */ + +#if defined @CHK@ +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_@func@_DOUBLE(npy_double* op, + npy_double* ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/sizeof(npy_double); + const npy_int num_lanes = @BYTES@/sizeof(npy_double); + npy_intp num_remaining_elements = array_size; + @mask@ load_mask = @isa@_get_full_load_mask_pd(); + @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask); + @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f); + npy_int indexarr[8]; + for (npy_int ii = 0; ii < 8; ii++) { + indexarr[ii] = ii*stride; + } + @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements, + num_lanes); + inv_load_mask = @isa@_invert_mask_pd(load_mask); + } + @vtype@ x; + if (stride == 1) { + x = @isa@_masked_load_pd(load_mask, ip); + /* + * Replace masked elements with 1.0f to avoid divide by zero fp + * exception in reciprocal + */ + x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask)); + } + else { + x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask)); + } + @vtype@ out = @isa@_@vectorf@_pd(x); + @masked_store@(op, load_mask, out); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} +#endif +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #ISA = FMA, AVX512F# * #isa = fma, avx512# @@ -1460,7 +1781,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# */ - /* * Vectorized approximate sine/cosine algorithms: The following code is a * vectorized version of the algorithm presented here: @@ -1519,7 +1839,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ quadrant, reduced_x, reduced_x2, cos, sin; @vtype@i iquadrant; @mask@ nan_mask, glibc_mask, sine_mask, negate_mask; - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; npy_int indexarr[16]; for (npy_int ii = 0; ii < 16; ii++) { @@ -1530,16 +1850,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, num_lanes); } @vtype@ x; if (stride == 1) { - x = @isa@_masked_load(load_mask, ip); + x = @isa@_masked_load_ps(load_mask, ip); } else { - x = @isa@_masked_gather(zero_f, ip, vindex, load_mask); + x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask); } /* @@ -1551,7 +1871,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void glibc_mask = @isa@_in_range_mask(x, large_number,-large_number); glibc_mask = @and_masks@(load_mask, glibc_mask); nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); - x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask)); + x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask)); npy_int iglibc_mask = @mask_to_int@(glibc_mask); if (iglibc_mask != @full_mask@) { @@ -1584,7 +1904,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void /* multiply by -1 for appropriate elements */ negate_mask = @isa@_should_negate(iquadrant, twos, twos); cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask); - cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); + cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); @masked_store@(op, @cvtps_epi32@(load_mask), cos); } @@ -1662,27 +1982,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask; - @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes); - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); } @vtype@ x; if (stride == 1) { - x = @isa@_masked_load(load_mask, ip); + x = @isa@_masked_load_ps(load_mask, ip); } else { - x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); - x = @isa@_set_masked_lanes(x, zeros_f, nan_mask); + x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask); xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); @@ -1690,7 +2010,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void overflow_mask = @or_masks@(overflow_mask, @xor_masks@(xmax_mask, inf_mask)); - x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@( + x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@( @or_masks@(nan_mask, xmin_mask), xmax_mask)); quadrant = _mm@vsize@_mul_ps(x, log2e); @@ -1723,9 +2043,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void * elem < xmin; return 0.0f * elem = +/- nan, return nan */ - poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); - poly = @isa@_set_masked_lanes(poly, inf, xmax_mask); - poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask); + poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); + poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask); + poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask); @masked_store@(op, @cvtps_epi32@(load_mask), poly); @@ -1790,24 +2110,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ poly, num_poly, denom_poly, exponent; @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; - @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes); + @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); @mask@ divide_by_zero_mask = invalid_mask; - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); } @vtype@ x_in; if (stride == 1) { - x_in = @isa@_masked_load(load_mask, ip); + x_in = @isa@_masked_load_ps(load_mask, ip); } else { - x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); @@ -1818,7 +2138,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @and_masks@(zero_mask, load_mask)); invalid_mask = @or_masks@(invalid_mask, negx_mask); - @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask); + @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask); /* set x = normalized mantissa */ exponent = @isa@_get_exponent(x); diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index ef48fed05593..294a9b1fd41a 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -694,8 +694,92 @@ def test_sincos_values(self): assert_raises(FloatingPointError, np.cos, np.float32(-np.inf)) assert_raises(FloatingPointError, np.cos, np.float32(np.inf)) + def test_sqrt_values(self): + with np.errstate(all='ignore'): + x = [np.nan, np.nan, np.inf, np.nan, 0.] + y = [np.nan, -np.nan, np.inf, -np.inf, 0.] + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.sqrt(yf), xf) + + with np.errstate(invalid='raise'): + for dt in ['f', 'd', 'g']: + assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt)) + + def test_abs_values(self): + x = [np.nan, np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0] + y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0] + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.abs(yf), xf) + + def test_square_values(self): + x = [np.nan, np.nan, np.inf, np.inf] + y = [np.nan, -np.nan, np.inf, -np.inf] + with np.errstate(all='ignore'): + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.square(yf), xf) + + with np.errstate(over='raise'): + assert_raises(FloatingPointError, np.square, np.array(1E32, dtype='f')) + assert_raises(FloatingPointError, np.square, np.array(1E200, dtype='d')) -class TestSIMDFloat32(object): + def test_reciprocal_values(self): + with np.errstate(all='ignore'): + x = [np.nan, np.nan, 0.0, -0.0, np.inf, -np.inf] + y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.] + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.reciprocal(yf), xf) + + with np.errstate(divide='raise'): + for dt in ['f', 'd', 'g']: + assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt)) + +# func : [maxulperror, low, high] +avx_ufuncs = {'sqrt' :[1, 0., 100.], + 'absolute' :[0, -100., 100.], + 'reciprocal' :[1, 1., 100.], + 'square' :[1, -100., 100.]} + +class TestAVXUfuncs(object): + def test_avx_based_ufunc(self): + strides = np.array([-4,-3,-2,-1,1,2,3,4]) + np.random.seed(42) + for func, prop in avx_ufuncs.items(): + maxulperr = prop[0] + minval = prop[1] + maxval = prop[2] + # various array sizes to ensure masking in AVX is tested + for size in range(1,32): + myfunc = getattr(np, func) + x_f32 = np.float32(np.random.uniform(low=minval, high=maxval, + size=size)) + x_f64 = np.float64(x_f32) + x_f128 = np.float128(x_f32) + y_true128 = myfunc(x_f128) + if maxulperr == 0: + assert_equal(myfunc(x_f32), np.float32(y_true128)) + assert_equal(myfunc(x_f64), np.float64(y_true128)) + else: + assert_array_max_ulp(myfunc(x_f32), np.float32(y_true128), + maxulp=maxulperr) + assert_array_max_ulp(myfunc(x_f64), np.float64(y_true128), + maxulp=maxulperr) + # various strides to test gather instruction + if size > 1: + y_true32 = myfunc(x_f32) + y_true64 = myfunc(x_f64) + for jj in strides: + assert_equal(myfunc(x_f64[::jj]), y_true64[::jj]) + assert_equal(myfunc(x_f32[::jj]), y_true32[::jj]) + +class TestAVXFloat32Transcendental(object): def test_exp_float32(self): np.random.seed(42) x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000)) @@ -722,8 +806,8 @@ def test_sincos_float32(self): def test_strided_float32(self): np.random.seed(42) - strides = np.random.randint(low=-100, high=100, size=100) - sizes = np.random.randint(low=1, high=2000, size=100) + strides = np.array([-4,-3,-2,-1,1,2,3,4]) + sizes = np.arange(2,100) for ii in sizes: x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii)) exp_true = np.exp(x_f32) From d874804aeaf31049d258bb8f2943ad8ca7e68f0f Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 2 Jul 2019 15:19:49 -0700 Subject: [PATCH 2/8] BUG: fixing multiple CI failures (1) Workaround for bug in clang6: added missing GCC attribute to the prototype of ISA_sqrt_TYPE function which otherwise leads to a weird build failure in clang6 (gcc and clang7.0 doesnt have this issue) (2) Changed np.float128 to np.longdouble in tests: NumPy in windows doesn't support the np.float128 dtype (3) GCC 4.8/5.0 doesn't support _mm512_abs_ps/pd intrinsic --- numpy/core/src/umath/simd.inc.src | 23 +++++++++++++---------- numpy/core/tests/test_umath.py | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index c0dc53dd8757..23f94893d17c 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -148,7 +148,7 @@ abs_ptrdiff(char *a, char *b) */ #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE void +static NPY_INLINE NPY_GCC_TARGET_@ISA@ void @ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride); #endif @@ -1501,13 +1501,16 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant) return _mm512_scalef_ps(poly, quadrant); } /**begin repeat - * #vsub = ps, pd# - * #vtype= __m512, __m512d# + * #vsub = ps, pd# + * #epi_vsub = epi32, epi64# + * #vtype = __m512, __m512d# + * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# */ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ avx512_abs_@vsub@(@vtype@ x) { - return _mm512_abs_@vsub@(x); + return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x, + _mm512_set1_@epi_vsub@ (@and_const@)); } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ @@ -1644,9 +1647,9 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d #if defined @CHK@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_@func@_FLOAT(npy_float* op, - npy_float* ip, - const npy_intp array_size, - const npy_intp steps) + npy_float* ip, + const npy_intp array_size, + const npy_intp steps) { const npy_intp stride = steps/sizeof(npy_float); const npy_int num_lanes = @BYTES@/sizeof(npy_float); @@ -1716,9 +1719,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void #if defined @CHK@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_@func@_DOUBLE(npy_double* op, - npy_double* ip, - const npy_intp array_size, - const npy_intp steps) + npy_double* ip, + const npy_intp array_size, + const npy_intp steps) { const npy_intp stride = steps/sizeof(npy_double); const npy_int num_lanes = @BYTES@/sizeof(npy_double); diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 294a9b1fd41a..86c724b5ae41 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -761,7 +761,7 @@ def test_avx_based_ufunc(self): x_f32 = np.float32(np.random.uniform(low=minval, high=maxval, size=size)) x_f64 = np.float64(x_f32) - x_f128 = np.float128(x_f32) + x_f128 = np.longdouble(x_f32) y_true128 = myfunc(x_f128) if maxulperr == 0: assert_equal(myfunc(x_f32), np.float32(y_true128)) From 344b40f70d83f1fe364d6a8cbc44aceea20d6e26 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 3 Jul 2019 11:16:36 -0700 Subject: [PATCH 3/8] BUG: ignore invalid exception raised by absolute clang6 generates an invalid exception when computing abs value of +/-np.nan. --- numpy/core/src/umath/loops.c.src | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index ad2ec9e4a0b6..b674a158544a 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1684,6 +1684,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void *((@type@ *)op1) = tmp + 0; } } + npy_clear_floatstatus_barrier((char*)dimensions); } NPY_NO_EXPORT NPY_GCC_OPT_3 void From 7a327d09344214ba5aac95f2428bffaa7fa260d2 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 8 Jul 2019 21:30:30 -0700 Subject: [PATCH 4/8] ENH: use AVX for floor, rint, ceil and trunc --- numpy/core/code_generators/generate_umath.py | 8 ++ numpy/core/src/umath/loops.c.src | 42 +++++++++++ numpy/core/src/umath/loops.h.src | 20 +++++ numpy/core/src/umath/simd.inc.src | 77 ++++++++++++++++++-- numpy/core/tests/test_umath.py | 10 ++- 5 files changed, 146 insertions(+), 11 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 1c87a148da8e..be4fc934d8c3 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -777,6 +777,8 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.ceil'), None, + TD('e', f='ceil', astype={'e':'f'}), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(flts, f='ceil', astype={'e':'f'}), TD(O, f='npy_ObjectCeil'), ), @@ -784,6 +786,8 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.trunc'), None, + TD('e', f='trunc', astype={'e':'f'}), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(flts, f='trunc', astype={'e':'f'}), TD(O, f='npy_ObjectTrunc'), ), @@ -798,6 +802,8 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.floor'), None, + TD('e', f='floor', astype={'e':'f'}), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(flts, f='floor', astype={'e':'f'}), TD(O, f='npy_ObjectFloor'), ), @@ -805,6 +811,8 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.rint'), None, + TD('e', f='rint', astype={'e':'f'}), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(inexact, f='rint', astype={'e':'f'}), TD(P, f='rint'), ), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index b674a158544a..d948e25bba17 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1634,6 +1634,30 @@ NPY_NO_EXPORT void /**end repeat**/ +/**begin repeat + * #func = rint, ceil, floor, trunc# + * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +* #type = npy_float, npy_double# +* #typesub = f, # +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = @scalarf@@typesub@(in1); + } +} + + +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #func = sin, cos, exp, log# * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf# @@ -1709,6 +1733,23 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void } } +/**begin repeat2 + * #func = rint, ceil, floor, trunc# + * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc# + */ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = @scalarf@@typesub@(in1); + } + } +} + +/**end repeat2**/ /**end repeat1**/ /**begin repeat1 @@ -1763,6 +1804,7 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY /**end repeat1**/ /**end repeat**/ + /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble, npy_float# diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index fe1b1145d7d1..e98a1ac3cde7 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -206,6 +206,26 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY /**end repeat1**/ /**end repeat**/ +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 23f94893d17c..bf92b56ecb7f 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -144,7 +144,7 @@ abs_ptrdiff(char *a, char *b) */ /**begin repeat2 - * #func = sqrt, absolute, square, reciprocal# + * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc# */ #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS @@ -216,7 +216,6 @@ run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, /**end repeat**/ - /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -1383,6 +1382,30 @@ fma_reciprocal_@vsub@(@vtype@ x) { return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x); } + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_rint_@vsub@(@vtype@ x) +{ + return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_floor_@vsub@(@vtype@ x) +{ + return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_ceil_@vsub@(@vtype@ x) +{ + return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_trunc_@vsub@(@vtype@ x) +{ + return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO); +} /**end repeat**/ #endif @@ -1518,6 +1541,30 @@ avx512_reciprocal_@vsub@(@vtype@ x) { return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); } + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_rint_@vsub@(@vtype@ x) +{ + return _mm512_roundscale_@vsub@(x, 0x08); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_floor_@vsub@(@vtype@ x) +{ + return _mm512_roundscale_@vsub@(x, 0x09); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_ceil_@vsub@(@vtype@ x) +{ + return _mm512_roundscale_@vsub@(x, 0x0A); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_trunc_@vsub@(@vtype@ x) +{ + return _mm512_roundscale_@vsub@(x, 0x0B); +} /**end repeat**/ #endif @@ -1640,12 +1687,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d */ /**begin repeat1 - * #func = sqrt, absolute, square, reciprocal# - * #vectorf = sqrt, abs, square, reciprocal# + * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc# + * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc# + * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0# */ #if defined @CHK@ -static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_@func@_FLOAT(npy_float* op, npy_float* ip, const npy_intp array_size, @@ -1656,7 +1704,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void npy_intp num_remaining_elements = array_size; @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); @mask@ load_mask = @isa@_get_full_load_mask_ps(); +#if @replace_0_with_1@ @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask); +#endif npy_int indexarr[16]; for (npy_int ii = 0; ii < 16; ii++) { indexarr[ii] = ii*stride; @@ -1667,16 +1717,20 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, num_lanes); +#if @replace_0_with_1@ inv_load_mask = @isa@_invert_mask_ps(load_mask); +#endif } @vtype@ x; if (stride == 1) { x = @isa@_masked_load_ps(load_mask, ip); +#if @replace_0_with_1@ /* * Replace masked elements with 1.0f to avoid divide by zero fp * exception in reciprocal */ x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask); +#endif } else { x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); @@ -1712,12 +1766,13 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void */ /**begin repeat1 - * #func = sqrt, absolute, square, reciprocal# - * #vectorf = sqrt, abs, square, reciprocal# + * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc# + * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc# + * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0# */ #if defined @CHK@ -static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_@func@_DOUBLE(npy_double* op, npy_double* ip, const npy_intp array_size, @@ -1727,7 +1782,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void const npy_int num_lanes = @BYTES@/sizeof(npy_double); npy_intp num_remaining_elements = array_size; @mask@ load_mask = @isa@_get_full_load_mask_pd(); +#if @replace_0_with_1@ @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask); +#endif @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f); npy_int indexarr[8]; for (npy_int ii = 0; ii < 8; ii++) { @@ -1739,16 +1796,20 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements, num_lanes); +#if @replace_0_with_1@ inv_load_mask = @isa@_invert_mask_pd(load_mask); +#endif } @vtype@ x; if (stride == 1) { x = @isa@_masked_load_pd(load_mask, ip); +#if @replace_0_with_1@ /* * Replace masked elements with 1.0f to avoid divide by zero fp * exception in reciprocal */ x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask)); +#endif } else { x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask)); diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 86c724b5ae41..264eea00ee14 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -742,10 +742,14 @@ def test_reciprocal_values(self): assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt)) # func : [maxulperror, low, high] -avx_ufuncs = {'sqrt' :[1, 0., 100.], +avx_ufuncs = {'sqrt' :[1, 0., 100.], 'absolute' :[0, -100., 100.], - 'reciprocal' :[1, 1., 100.], - 'square' :[1, -100., 100.]} + 'reciprocal' :[1, 1., 100.], + 'square' :[1, -100., 100.], + 'rint' :[0, -100., 100.], + 'floor' :[0, -100., 100.], + 'ceil' :[0, -100., 100.], + 'trunc' :[0, -100., 100.]} class TestAVXUfuncs(object): def test_avx_based_ufunc(self): From 299e533cecbe935d1bfe7a85621ed4cbaedda275 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 9 Jul 2019 20:21:14 -0700 Subject: [PATCH 5/8] TEST: disable raise invalid exception test for sqrt --- numpy/core/tests/test_umath.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 264eea00ee14..ed2bfbabe621 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -703,9 +703,9 @@ def test_sqrt_values(self): yf = np.array(y, dtype=dt) assert_equal(np.sqrt(yf), xf) - with np.errstate(invalid='raise'): - for dt in ['f', 'd', 'g']: - assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt)) + #with np.errstate(invalid='raise'): + # for dt in ['f', 'd', 'g']: + # assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt)) def test_abs_values(self): x = [np.nan, np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0] From 0286715abf7eec52e5dc8d51ceb8819a54c3290e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 24 Aug 2019 08:20:10 -0700 Subject: [PATCH 6/8] MAINT: rebase with master --- numpy/core/src/umath/simd.inc.src | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index bf92b56ecb7f..74f52cc9d529 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -2236,10 +2236,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void * x = +/- NAN; return NAN * x = 0.0f; return -INF */ - poly = @isa@_set_masked_lanes(poly, nan, nan_mask); - poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask); - poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask); - poly = @isa@_set_masked_lanes(poly, inf, inf_mask); + poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask); + poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask); + poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask); + poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask); @masked_store@(op, @cvtps_epi32@(load_mask), poly); From 5ee46de530005d0dcbe2e1e386b30f06477f8d7c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 27 Sep 2019 15:58:51 -0700 Subject: [PATCH 7/8] MAINT: removing duplicated inner loop for e->e --- numpy/core/code_generators/generate_umath.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index be4fc934d8c3..0d3bbffe9100 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -779,7 +779,7 @@ def english_upper(s): None, TD('e', f='ceil', astype={'e':'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), - TD(flts, f='ceil', astype={'e':'f'}), + TD('fdg', f='ceil'), TD(O, f='npy_ObjectCeil'), ), 'trunc': @@ -788,7 +788,7 @@ def english_upper(s): None, TD('e', f='trunc', astype={'e':'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), - TD(flts, f='trunc', astype={'e':'f'}), + TD('fdg', f='trunc'), TD(O, f='npy_ObjectTrunc'), ), 'fabs': @@ -804,7 +804,7 @@ def english_upper(s): None, TD('e', f='floor', astype={'e':'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), - TD(flts, f='floor', astype={'e':'f'}), + TD('fdg', f='floor'), TD(O, f='npy_ObjectFloor'), ), 'rint': @@ -813,7 +813,7 @@ def english_upper(s): None, TD('e', f='rint', astype={'e':'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), - TD(inexact, f='rint', astype={'e':'f'}), + TD('fdg' + cmplx, f='rint'), TD(P, f='rint'), ), 'arctan2': From 5323bbfc15f320ac5b9c4b3bece6394f5bf74ee2 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 1 Jul 2019 13:38:09 -0700 Subject: [PATCH 8/8] BENCH: adding benchmarks for avx based ufuncs --- benchmarks/benchmarks/bench_avx.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 benchmarks/benchmarks/bench_avx.py diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py new file mode 100644 index 000000000000..f7b524e43ba0 --- /dev/null +++ b/benchmarks/benchmarks/bench_avx.py @@ -0,0 +1,34 @@ +from __future__ import absolute_import, division, print_function + +from .common import Benchmark + +import numpy as np + +avx_ufuncs = ['sqrt', + 'absolute', + 'reciprocal', + 'square', + 'rint', + 'floor', + 'ceil' , + 'trunc'] +stride = [1, 2, 4] +dtype = ['f', 'd'] + +class AVX_UFunc(Benchmark): + params = [avx_ufuncs, stride, dtype] + param_names = ['avx_based_ufunc', 'stride', 'dtype'] + timeout = 10 + + def setup(self, ufuncname, stride, dtype): + np.seterr(all='ignore') + try: + self.f = getattr(np, ufuncname) + except AttributeError: + raise NotImplementedError() + N = 10000 + self.arr = np.ones(stride*N, dtype) + + def time_ufunc(self, ufuncname, stride, dtype): + self.f(self.arr[::stride]) +