diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index a2649ed93d8d..f84d74efe113 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1626,12 +1626,11 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
             /*
              * We use the AVX function to compute exp/log for scalar elements as well.
              * This is needed to ensure the output of strided and non-strided
-             * cases match. But this worsens the performance of strided arrays.
-             * There is plan to fix this in a subsequent patch by using gather
-             * instructions for strided arrays in the AVX function.
+             * cases match. SIMD code handles strided input cases, but not
+             * strided output.
              */
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-            @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1);
+            @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
 #else
             const npy_float in1 = *(npy_float *)ip1;
             *(npy_float *)op1 = @scalarf@(in1);
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 1c6ac44260f5..6ddd6fcc61cb 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -51,6 +51,15 @@ abs_ptrdiff(char *a, char *b)
     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
      ((abs_ptrdiff(args[1], args[0]) == 0))))
 
+/*
+ * output should be contiguous, can handle strided input data
+ */
+#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
+    (steps[1] == (esize) && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+      ((abs_ptrdiff(args[1], args[0]) == 0))))
+
 #define IS_BLOCKABLE_REDUCE(esize, vsize) \
     (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
      npy_is_aligned(args[1], (esize)) && \
@@ -134,15 +143,15 @@ abs_ptrdiff(char *a, char *b)
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
-@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
 #endif
 
 static NPY_INLINE int
 run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 {
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-    if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
-        @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+        @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
         return 1;
     }
     else
@@ -1127,14 +1136,23 @@ avx2_get_full_load_mask(void)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                             1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-    float* addr = maskint + total_elem - num_elem;
+    float* addr = maskint + total_elem - num_lanes;
     return _mm256_loadu_ps(addr);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_masked_gather(__m256 src,
+                   npy_float* addr,
+                   __m256i vindex,
+                   __m256 mask)
+{
+    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
 avx2_masked_load(__m256 mask, npy_float* addr)
 {
@@ -1220,6 +1238,15 @@ avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
     return (0x0001 << num_elem) - 0x0001;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_masked_gather(__m512 src,
+                     npy_float* addr,
+                     __m512i vindex,
+                     __mmask16 kmask)
+{
+    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_masked_load(__mmask16 mask, npy_float* addr)
 {
@@ -1310,11 +1337,19 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_exp_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
 {
+    const npy_intp stride = steps/sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
     npy_float xmax = 88.72283935546875f;
     npy_float xmin = -87.3365478515625f;
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
 
     /* Load up frequently used constants */
     @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
@@ -1333,6 +1368,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
     @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
     @vtype@ poly, num_poly, denom_poly, quadrant;
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
     @vtype@i exponent;
 
     @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
@@ -1342,10 +1378,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
     while (num_remaining_elements > 0) {
 
-        if (num_remaining_elements < num_lanes)
+        if (num_remaining_elements < num_lanes) {
             load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
-        @vtype@ x = @isa@_masked_load(load_mask, ip);
+                                                    num_lanes);
+        }
+
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load(load_mask, ip);
+        }
+        else {
+            x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+        }
 
         xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
         xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1396,13 +1440,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
-        ip += num_lanes;
+        ip += num_lanes*stride;
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }
 
-    if (@mask_to_int@(overflow_mask))
+    if (@mask_to_int@(overflow_mask)) {
         npy_set_floatstatus_overflow();
+    }
 }
 
 /*
@@ -1420,9 +1465,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  */
 
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_log_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
 {
+    const npy_intp stride = steps/sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
 
     /* Load up frequently used constants */
     @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
@@ -1443,6 +1496,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
     @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
     @vtype@ poly, num_poly, denom_poly, exponent;
 
     @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
@@ -1453,10 +1507,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
     while (num_remaining_elements > 0) {
 
-        if (num_remaining_elements < num_lanes)
+        if (num_remaining_elements < num_lanes) {
             load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
-        @vtype@ x_in = @isa@_masked_load(load_mask, ip);
+                                                    num_lanes);
+        }
+
+        @vtype@ x_in;
+        if (stride == 1) {
+            x_in = @isa@_masked_load(load_mask, ip);
+        }
+        else {
+            x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+        }
 
         negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
         zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
@@ -1506,15 +1568,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
-        ip += num_lanes;
+        ip += num_lanes*stride;
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }
 
-    if (@mask_to_int@(invalid_mask))
+    if (@mask_to_int@(invalid_mask)) {
         npy_set_floatstatus_invalid();
-    if (@mask_to_int@(divide_by_zero_mask))
+    }
+    if (@mask_to_int@(divide_by_zero_mask)) {
         npy_set_floatstatus_divbyzero();
+    }
 }
 #endif
 /**end repeat**/
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2c963b5eb993..cd2034d9cee9 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
-    assert_allclose, assert_no_warnings, suppress_warnings,
+    assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
     _gen_alignment_data
     )
 
@@ -678,6 +678,31 @@ def test_log_values(self):
         assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
         assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
+class TestExpLogFloat32(object):
+    def test_exp_float32(self):
+        np.random.seed(42)
+        x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
+        x_f64 = np.float64(x_f32)
+        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+
+    def test_log_float32(self):
+        np.random.seed(42)
+        x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
+        x_f64 = np.float64(x_f32)
+        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+
+    def test_strided_exp_log_float32(self):
+        np.random.seed(42)
+        strides = np.random.randint(low=-100, high=100, size=100)
+        sizes = np.random.randint(low=1, high=2000, size=100)
+        for ii in sizes:
+            x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
+            exp_true = np.exp(x_f32)
+            log_true = np.log(x_f32)
+            for jj in strides:
+                assert_equal(np.exp(x_f32[::jj]), exp_true[::jj])
+                assert_equal(np.log(x_f32[::jj]), log_true[::jj])
+
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):
         x = [1, 2, 3, 4, 5]
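For reference, a minimal standalone sketch of the gather technique the patch relies on: it uses the same _mm256_mask_i32gather_ps intrinsic that the new avx2_masked_gather() helper wraps to load every stride-th float32 into one AVX2 register. The file name, array contents, and stride value below are illustrative assumptions, not part of the patch; compile with gcc -mavx2 on an AVX2-capable machine.

/* gather_demo.c -- hypothetical demo, not NumPy code */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* 32 input floats; the demo reads every 4th one (stride = 4) */
    float src[32];
    int idx[8];
    const int stride = 4;

    for (int i = 0; i < 32; i++) {
        src[i] = (float)i;
    }
    for (int i = 0; i < 8; i++) {
        idx[i] = i * stride;              /* same construction as indexarr[] in the patch */
    }

    __m256i vindex = _mm256_loadu_si256((const __m256i *)idx);
    __m256 full_mask = _mm256_castsi256_ps(_mm256_set1_epi32(-1));
    __m256 zeros = _mm256_setzero_ps();

    /* lanes whose mask sign bit is set are gathered; the rest keep `zeros` */
    __m256 x = _mm256_mask_i32gather_ps(zeros, src, vindex, full_mask, 4);

    float out[8];
    _mm256_storeu_ps(out, x);
    for (int i = 0; i < 8; i++) {
        printf("%g ", out[i]);            /* expected: 0 4 8 12 16 20 24 28 */
    }
    printf("\n");
    return 0;
}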