Thanks to visit codestin.com
Credit goes to github.com

Skip to content

ENH: use AVX for float32 and float64 implementation of sqrt, square, absolute, reciprocal, rint, floor, ceil and trunc #13885

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions benchmarks/benchmarks/bench_avx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from __future__ import absolute_import, division, print_function

from .common import Benchmark

import numpy as np

# Names of the numpy ufuncs this module benchmarks; each one is looked up
# on the numpy namespace by name.
avx_ufuncs = [
    'sqrt',
    'absolute',
    'reciprocal',
    'square',
    'rint',
    'floor',
    'ceil',
    'trunc',
]

# Step sizes used to slice the input array (1 selects every element).
stride = [1, 2, 4]

# dtype character codes used when allocating the input array.
dtype = ['f', 'd']

class AVX_UFunc(Benchmark):
    """Time one unary ufunc call per (ufunc name, stride, dtype) combination."""

    params = [avx_ufuncs, stride, dtype]
    param_names = ['avx_based_ufunc', 'stride', 'dtype']
    timeout = 10

    def setup(self, ufuncname, stride, dtype):
        """Resolve the ufunc by name and allocate the input array.

        Raises NotImplementedError when numpy has no attribute called
        ``ufuncname``.
        """
        # Set every floating-point error category to 'ignore'.
        np.seterr(all='ignore')
        try:
            ufunc = getattr(np, ufuncname)
        except AttributeError:
            raise NotImplementedError()
        self.f = ufunc
        # Allocate stride * 10000 ones so that slicing with [::stride]
        # always yields 10000 elements.
        elements_per_call = 10000
        self.arr = np.ones(stride * elements_per_call, dtype)

    def time_ufunc(self, ufuncname, stride, dtype):
        """Apply the ufunc to every ``stride``-th element of the array."""
        strided_input = self.arr[::stride]
        self.f(strided_input)

24 changes: 16 additions & 8 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,14 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
TD(ints+inexact, simd=[('avx2', ints)]),
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
TD(ints+inexact, simd=[('avx2', ints)]),
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
Expand Down Expand Up @@ -395,7 +395,7 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
TD(bints+flts+timedeltaonly),
TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD(cmplx, out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
Expand Down Expand Up @@ -762,7 +762,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
TD(inexactvec),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
Expand All @@ -777,14 +777,18 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.ceil'),
None,
TD(flts, f='ceil', astype={'e':'f'}),
TD('e', f='ceil', astype={'e':'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='ceil'),
TD(O, f='npy_ObjectCeil'),
),
'trunc':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.trunc'),
None,
TD(flts, f='trunc', astype={'e':'f'}),
TD('e', f='trunc', astype={'e':'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='trunc'),
TD(O, f='npy_ObjectTrunc'),
),
'fabs':
Expand All @@ -798,14 +802,18 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.floor'),
None,
TD(flts, f='floor', astype={'e':'f'}),
TD('e', f='floor', astype={'e':'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='floor'),
TD(O, f='npy_ObjectFloor'),
),
'rint':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.rint'),
None,
TD(inexact, f='rint', astype={'e':'f'}),
TD('e', f='rint', astype={'e':'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg' + cmplx, f='rint'),
TD(P, f='rint'),
),
'arctan2':
Expand Down
99 changes: 97 additions & 2 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,30 @@ NPY_NO_EXPORT void

/**end repeat**/

/**begin repeat
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/

/**begin repeat1
* #TYPE = FLOAT, DOUBLE#
* #type = npy_float, npy_double#
* #typesub = f, #
*/

/*
 * Scalar element-wise loop, generated once for every func/TYPE pair of the
 * enclosing repeat blocks (rint, ceil, floor, trunc for FLOAT and DOUBLE).
 * Each input element is passed to the scalar routine named by the scalarf
 * and typesub template values and the result is stored in the output.
 */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
/*
 * ip1 and op1 are not declared in this function, so they have to be
 * provided by the UNARY_LOOP macro (defined outside this excerpt).
 */
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
*(@type@ *)op1 = @scalarf@@typesub@(in1);
}
}


/**end repeat1**/
/**end repeat**/

/**begin repeat
* #func = sin, cos, exp, log#
* #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
Expand All @@ -1656,6 +1680,78 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
*/

/**begin repeat1
* #TYPE = FLOAT, DOUBLE#
* #type = npy_float, npy_double#
* #typesub = f, #
*/

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
*(@type@ *)op1 = npy_sqrt@typesub@(in1);
}
}
Comment on lines +1692 to +1697
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the compiler able to generate the avx code automatically if you use

if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
    UNARY_LOOP { ... }
}
else {
    // as above
    UNARY_LOOP { ... }
}

We use this trick in all sorts of places today to encourage it to generate optimized code.

Copy link
Member Author

@r-devulap r-devulap Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried several options with GCC-9.2 and found the following:

  • Any compiler-generated vectorized loop for floating point seems to require extra compiler options like -ffast-math (see https://gcc.gnu.org/projects/tree-ssa/vectorization.html#using). Here is the code for an example of the sqrt loop with and without this option. There are several problems with this path: (1) -ffast-math obviously should not be used as a global compile option and (2) the code generated with this option ends up using a combination of vrsqrt14ps and vmulps instructions to compute the square root, which is neither accurate nor fast (vrsqrt14ps is only accurate up to the 6th decimal place and I have no idea why even the latest GCC won't use a simple vsqrtps instruction instead!)

  • The other problem is, no matter what option I try, I could not get GCC to vectorize the strided array case (see an example here). Even if somehow we were able to properly vectorize the case where stride = 1, as far as I know, we cannot auto-vectorize for general strided arrays.

Copy link
Member Author

@r-devulap r-devulap Oct 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I finally learnt why gcc won't use vsqrtps! The vrsqrt14ps instruction takes 1-3 cycles, whereas vsqrtps takes > 14 cycles. So it's basically faster to compute invsqrt, multiply it with the input and then correct it with one step of Newton-Raphson than to compute an accurate sqrt directly. -ffast-math obviously chooses speed over accuracy. This logic works for single precision and not for double precision, where it uses the vsqrtpd instruction (see code here) :)

}

/*
 * absolute loop for one TYPE/isa combination of the enclosing repeat blocks.
 * The run_unary_<isa>_absolute_<TYPE> helper is tried first; the scalar loop
 * below only runs when that helper returns zero.
 * NOTE(review): presumably a nonzero return means the helper already
 * processed the whole array with SIMD code - confirm against its definition.
 */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
/* ip1/op1 are supplied by the UNARY_LOOP macro (defined outside this excerpt) */
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
/* every value that is not strictly positive is negated, so +0.0 turns into -0.0 here */
const @type@ tmp = in1 > 0 ? in1 : -in1;
/* add 0 to clear -0.0 */
*((@type@ *)op1) = tmp + 0;
}
}
/*
 * Executed on both paths, i.e. after the helper as well as after the
 * scalar loop.
 * NOTE(review): judging by its name this resets the floating point status
 * flags - confirm against its definition.
 */
npy_clear_floatstatus_barrier((char*)dimensions);
}

/*
 * square loop for one TYPE/isa combination of the enclosing repeat blocks.
 * The run_unary_<isa>_square_<TYPE> helper is tried first; the scalar loop
 * below only runs when that helper returns zero.
 * NOTE(review): presumably a nonzero return means the helper already
 * processed the whole array with SIMD code - confirm against its definition.
 */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
/* ip1/op1 are supplied by the UNARY_LOOP macro (defined outside this excerpt) */
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
/* the square is computed as a plain product of the element with itself */
*(@type@ *)op1 = in1*in1;
}
}
}

/*
 * reciprocal loop for one TYPE/isa combination of the enclosing repeat
 * blocks.  The run_unary_<isa>_reciprocal_<TYPE> helper is tried first; the
 * scalar loop below only runs when that helper returns zero.
 * NOTE(review): presumably a nonzero return means the helper already
 * processed the whole array with SIMD code - confirm against its definition.
 */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
/* ip1/op1 are supplied by the UNARY_LOOP macro (defined outside this excerpt) */
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
/*
 * The numerator is a float literal in every expansion; for the DOUBLE
 * expansion it is converted to double before the division, and since
 * 1.0 is exactly representable the quotient is unaffected.
 */
*(@type@ *)op1 = 1.0f/in1;
}
}
}

/**begin repeat2
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/

/*
 * Rounding loop (rint, ceil, floor or trunc, selected by the enclosing
 * repeat2 block) for one TYPE/isa combination.  The matching run_unary
 * helper is tried first; the scalar loop below only runs when that helper
 * returns zero.
 * NOTE(review): presumably a nonzero return means the helper already
 * processed the whole array with SIMD code - confirm against its definition.
 */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
/* ip1/op1 are supplied by the UNARY_LOOP macro (defined outside this excerpt) */
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
/* scalar routine chosen by the scalarf and typesub template values */
*(@type@ *)op1 = @scalarf@@typesub@(in1);
}
}
}

/**end repeat2**/
/**end repeat1**/

/**begin repeat1
* #func = exp, log#
* #scalarf = npy_expf, npy_logf#
Expand Down Expand Up @@ -1706,10 +1802,9 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
}

/**end repeat1**/


/**end repeat**/


/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble, npy_float#
Expand Down
33 changes: 33 additions & 0 deletions numpy/core/src/umath/loops.h.src
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,19 @@ NPY_NO_EXPORT void
*/
NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));

/**begin repeat1
* #isa = avx512f, fma#
*/

/**begin repeat2
* #func = sqrt, absolute, square, reciprocal#
*/
/*
 * Prototype of the ISA-suffixed loop for each func/isa pair listed above.
 * The TYPE placeholder is supplied by the enclosing repeat block, whose
 * header lies outside this excerpt.
 */
NPY_NO_EXPORT void
@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));

/**end repeat2**/
/**end repeat1**/
/**end repeat**/

/**begin repeat
Expand All @@ -193,6 +206,26 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
/**end repeat1**/
/**end repeat**/

/**begin repeat
* #func = rint, ceil, floor, trunc#
*/

/**begin repeat1
* #TYPE = FLOAT, DOUBLE#
*/

/* Prototype of the loop without an ISA suffix, one per func/TYPE pair. */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));

/**begin repeat2
* #isa = avx512f, fma#
*/
/* Prototype of the ISA-suffixed variant, one per func/TYPE/isa triple. */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
/**end repeat2**/
/**end repeat1**/
/**end repeat**/

/**begin repeat
* Float types
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
Expand Down
Loading