ENH: AVX support for exp/log for strided float32 arrays #13581

Merged: 2 commits, May 19, 2019
7 changes: 3 additions & 4 deletions numpy/core/src/umath/loops.c.src
@@ -1626,12 +1626,11 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
/*
* We use the AVX function to compute exp/log for scalar elements as well.
* This is needed to ensure the output of strided and non-strided
* cases match. But this worsens the performance of strided arrays.
* There is plan to fix this in a subsequent patch by using gather
* instructions for strided arrays in the AVX function.
* cases match. SIMD code handles strided input cases, but not
* strided output.
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1);
@ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
#else
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = @scalarf@(in1);
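
The comment above describes a user-visible guarantee: exp/log of a strided view must match the strided slice of the contiguous result exactly, which is why the scalar fallback is routed through the same AVX kernel. A minimal NumPy sketch of that property (values and stride chosen arbitrarily for illustration):

    import numpy as np
    from numpy.testing import assert_equal

    x = np.float32(np.random.uniform(low=0.01, high=88.0, size=1000))

    # The strided call and the strided view of the contiguous result
    # should agree exactly, not merely to within rounding error.
    assert_equal(np.exp(x[::7]), np.exp(x)[::7])
    assert_equal(np.log(x[::7]), np.log(x)[::7])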
100 changes: 82 additions & 18 deletions numpy/core/src/umath/simd.inc.src
@@ -51,6 +51,15 @@ abs_ptrdiff(char *a, char *b)
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))

/*
 * The output must be contiguous; the input may be strided.
 */
#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
(steps[1] == (esize) && \
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))
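
As a rough illustration of what this check admits (which loop actually runs is an internal dispatch detail, and the results are identical either way): a strided input written to a freshly allocated, contiguous output is eligible for the SIMD loop, while a strided output is not, because steps[1] must equal the element size; that case falls back to the element-by-element loop, which in turn still routes through the same AVX kernel.

    import numpy as np

    x = np.float32(np.random.uniform(low=0.01, high=10.0, size=300))
    out = np.empty(900, dtype=np.float32)

    np.exp(x[::3])           # strided input, contiguous output: SIMD-eligible
    np.exp(x, out=out[::3])  # strided output: element-by-element fallback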

#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
@@ -134,15 +143,15 @@ abs_ptrdiff(char *a, char *b)

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
}
else
@@ -1127,14 +1136,23 @@ avx2_get_full_load_mask(void)
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float* addr = maskint + total_elem - num_elem;
float* addr = maskint + total_elem - num_lanes;
return _mm256_loadu_ps(addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_gather(__m256 src,
npy_float* addr,
__m256i vindex,
__m256 mask)
{
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_load(__m256 mask, npy_float* addr)
{
@@ -1220,6 +1238,15 @@ avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
return (0x0001 << num_elem) - 0x0001;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_gather(__m512 src,
npy_float* addr,
__m512i vindex,
__mmask16 kmask)
{
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
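
In array terms the two gather helpers (the AVX2 variant above takes a float-vector mask, the AVX512 variant here a __mmask16) compute the same thing: active lanes read addr[vindex[i]], inactive lanes keep the corresponding element of src. A conceptual NumPy sketch, with invented names:

    import numpy as np

    def masked_gather_sketch(src, addr, vindex, mask):
        # Lanes where the mask is set read addr[vindex]; the rest keep src.
        out = src.copy()
        out[mask] = addr[vindex[mask]]
        return out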

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load(__mmask16 mask, npy_float* addr)
{
@@ -1310,11 +1337,19 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
@ISA@_exp_FLOAT(npy_float * op,
npy_float * ip,
const npy_intp array_size,
const npy_intp steps)
{
const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_float xmax = 88.72283935546875f;
npy_float xmin = -87.3365478515625f;
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}

/* Load up frequently used constants */
@vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
@@ -1333,6 +1368,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ poly, num_poly, denom_poly, quadrant;
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@vtype@i exponent;

@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
@@ -1342,10 +1378,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void

while (num_remaining_elements > 0) {

if (num_remaining_elements < num_lanes)
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
num_lanes);
@vtype@ x = @isa@_masked_load(load_mask, ip);
num_lanes);
}

@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load(load_mask, ip);
}
else {
x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
}

xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1396,13 +1440,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void

@masked_store@(op, @cvtps_epi32@(load_mask), poly);

ip += num_lanes;
ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}

if (@mask_to_int@(overflow_mask))
if (@mask_to_int@(overflow_mask)) {
npy_set_floatstatus_overflow();
}
}
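
Leaving the intrinsics aside, the strided exp loop above amounts to the following (a rough conceptual model with an invented name; the real kernel also tracks the NaN/inf/overflow masks and performs the Cody-Waite range reduction shown in the diff):

    import numpy as np

    def strided_exp_sketch(ip, stride, n, num_lanes=8):
        """Gather num_lanes strided inputs per iteration, evaluate,
        and store the results contiguously."""
        op = np.empty(n, dtype=np.float32)
        vindex = stride * np.arange(num_lanes)       # indexarr / vindex
        written = 0
        base = 0
        while written < n:
            lanes = min(num_lanes, n - written)      # partial load mask for the tail
            x = ip[base + vindex[:lanes]]            # masked gather
            op[written:written + lanes] = np.exp(x)
            base += num_lanes * stride               # ip += num_lanes*stride
            written += num_lanes                     # op += num_lanes
        return op

Given a contiguous float32 buffer buf and an element stride s, strided_exp_sketch(buf, s, len(buf[::s])) should reproduce np.exp(buf[::s]).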

/*
@@ -1420,9 +1465,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/

static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
@ISA@_log_FLOAT(npy_float * op,
npy_float * ip,
const npy_intp array_size,
const npy_intp steps)
{
const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}

/* Load up frequently used constants */
@vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
@@ -1443,6 +1496,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
@vtype@ poly, num_poly, denom_poly, exponent;

@mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
@@ -1453,10 +1507,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void

while (num_remaining_elements > 0) {

if (num_remaining_elements < num_lanes)
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
num_lanes);
@vtype@ x_in = @isa@_masked_load(load_mask, ip);
num_lanes);
}

@vtype@ x_in;
if (stride == 1) {
x_in = @isa@_masked_load(load_mask, ip);
}
else {
x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
}

negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
@@ -1506,15 +1568,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void

@masked_store@(op, @cvtps_epi32@(load_mask), poly);

ip += num_lanes;
ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}

if (@mask_to_int@(invalid_mask))
if (@mask_to_int@(invalid_mask)) {
npy_set_floatstatus_invalid();
if (@mask_to_int@(divide_by_zero_mask))
}
if (@mask_to_int@(divide_by_zero_mask)) {
npy_set_floatstatus_divbyzero();
}
}
#endif
/**end repeat**/
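
The floating-point status flags raised at the end of these loops (overflow for exp; invalid and divide-by-zero for log) surface in Python through np.errstate, for example:

    import numpy as np
    from numpy.testing import assert_raises

    with np.errstate(invalid='raise'):
        assert_raises(FloatingPointError, np.log, np.float32(-1.0))
    with np.errstate(divide='raise'):
        assert_raises(FloatingPointError, np.log, np.float32(0.0))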
27 changes: 26 additions & 1 deletion numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_allclose, assert_no_warnings, suppress_warnings,
assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
_gen_alignment_data
)

@@ -678,6 +678,31 @@ def test_log_values(self):
assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
assert_raises(FloatingPointError, np.log, np.float32(-1.0))

class TestExpLogFloat32(object):
def test_exp_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
x_f64 = np.float64(x_f32)
assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)

def test_log_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
x_f64 = np.float64(x_f32)
assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)

def test_strided_exp_log_float32(self):
np.random.seed(42)
strides = np.random.randint(low=-100, high=100, size=100)
sizes = np.random.randint(low=1, high=2000, size=100)
for ii in sizes:
x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))

[Review comment, Contributor]
Denormal floats are excluded here. How is the accuracy for these?

[Reply, Member (Author)]
These tests are meant as a sanity check and are not comprehensive; it would be slow to test a large sample of float32 values. But the max ULP errors of 2.6 and 3.9 hold even for denormals (I validated this separately by enumerating all float32 numbers).

[Reply, Contributor]
ok thanks

exp_true = np.exp(x_f32)
log_true = np.log(x_f32)
for jj in strides:
assert_equal(np.exp(x_f32[::jj]), exp_true[::jj])
assert_equal(np.log(x_f32[::jj]), log_true[::jj])
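
The review thread above asks about denormal inputs; the author notes that the 2.6 and 3.9 max-ULP bounds were validated separately by enumerating every float32. A hypothetical sketch of such an exhaustive sweep for exp (function name and chunk size invented; far too slow for the regular test suite):

    import numpy as np
    from numpy.testing import assert_array_max_ulp

    def exhaustive_exp_ulp_check(maxulp=2.6, chunk=1 << 22):
        """Walk every float32 bit pattern in [0, 88.72], denormals included,
        comparing float32 exp against a float64 reference."""
        hi = int(np.array(88.72, dtype=np.float32).view(np.uint32))
        for start in range(0, hi, chunk):
            bits = np.arange(start, min(start + chunk, hi), dtype=np.uint32)
            x = bits.view(np.float32)
            assert_array_max_ulp(np.exp(x),
                                 np.float32(np.exp(np.float64(x))),
                                 maxulp=maxulp)

An analogous sweep for log would use the 3.9 ULP bound over positive inputs only.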

class TestLogAddExp(_FilterInvalids):
def test_logaddexp_values(self):
x = [1, 2, 3, 4, 5]