numpy · mattip · Oct 15, 2019 · May 24, 2019 · Jul 2, 2019 · Jul 3, 2019
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
@@ -0,0 +1,34 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+
+# Unary ufuncs that the AVX_UFunc benchmark below is parametrized over.
+avx_ufuncs = [
+    'sqrt', 'absolute', 'reciprocal', 'square',
+    'rint', 'floor', 'ceil', 'trunc',
+]
+
+# Slice steps: each run applies the ufunc to ``arr[::stride]``.
+stride = [1, 2, 4]
+# dtype arguments handed to ``np.ones`` when the input array is built.
+dtype = ['f', 'd']
+
+class AVX_UFunc(Benchmark):
+    """Time one unary ufunc per (ufunc name, stride, dtype) combination."""
+    params = [avx_ufuncs, stride, dtype]
+    param_names = ['avx_based_ufunc', 'stride', 'dtype']
+    timeout = 10
+
+    def setup(self, ufuncname, stride, dtype):
+        np.seterr(all='ignore')
+        if not hasattr(np, ufuncname):
+            raise NotImplementedError()  # this numpy lacks the ufunc
+        self.f = getattr(np, ufuncname)
+        # stride * 10000 ones, so arr[::stride] always holds 10000 items
+        self.arr = np.ones(stride * 10000, dtype)
+
+    def time_ufunc(self, ufuncname, stride, dtype):
+        self.f(self.arr[::stride])
+
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -358,14 +358,14 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -395,7 +395,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly),
+          TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD(cmplx, out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
@@ -762,7 +762,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.sqrt'),
           None,
           TD('e', f='sqrt', astype={'e':'f'}),
-          TD(inexactvec),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
           ),
@@ -777,14 +777,18 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.ceil'),
           None,
-          TD(flts, f='ceil', astype={'e':'f'}),
+          TD('e', f='ceil', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='ceil'),
           TD(O, f='npy_ObjectCeil'),
           ),
 'trunc':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.trunc'),
           None,
-          TD(flts, f='trunc', astype={'e':'f'}),
+          TD('e', f='trunc', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
           ),
 'fabs':
@@ -798,14 +802,18 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.floor'),
           None,
-          TD(flts, f='floor', astype={'e':'f'}),
+          TD('e', f='floor', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
           ),
 'rint':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.rint'),
           None,
-          TD(inexact, f='rint', astype={'e':'f'}),
+          TD('e', f='rint', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
           ),
 'arctan2':

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -1634,6 +1634,30 @@ NPY_NO_EXPORT void
 
 /**end repeat**/
 
+/**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*  #type = npy_float, npy_double#
+*  #typesub = f, #
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    /* Element-wise @func@: apply @scalarf@@typesub@ to every input item. */
+    UNARY_LOOP {
+        *(@type@ *)op1 = @scalarf@@typesub@(*(const @type@ *)ip1);
+    }
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
 /**begin repeat
  *  #func = sin, cos, exp, log#
  *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
@@ -1656,6 +1680,78 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
  */
 
+/**begin repeat1
+ *  #TYPE = FLOAT, DOUBLE#
+ *  #type = npy_float, npy_double#
+ *  #typesub = f, #
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    /* Scalar npy_sqrt@typesub@ loop, run only if the @isa@ helper returns 0. */
+    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            *(@type@ *)op1 = npy_sqrt@typesub@(*(const @type@ *)ip1);
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ val = *(@type@ *)ip1;
+            /* +0.0 fails "val > 0" and is negated to -0.0; "+ 0" clears that */
+            *((@type@ *)op1) = (val > 0 ? val : -val) + 0;
+        }
+    }
+    /* executed on both paths, i.e. whether or not the scalar loop above ran */
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {  /* scalar loop, taken when the @isa@ helper returns 0 */
+            const @type@ val = *(@type@ *)ip1;
+            *(@type@ *)op1 = val * val;
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    /* Scalar 1/x loop, run only if the @isa@ helper returns 0. */
+    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            *(@type@ *)op1 = 1.0f / (*(const @type@ *)ip1);
+        }
+    }
+}
+
+/**begin repeat2
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    /* Scalar @scalarf@@typesub@ loop, run only if the @isa@ helper returns 0. */
+    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            *(@type@ *)op1 = @scalarf@@typesub@(*(const @type@ *)ip1);
+        }
+    }
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
 /**begin repeat1
  *  #func = exp, log#
  *  #scalarf = npy_expf, npy_logf#
@@ -1706,10 +1802,9 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 
 /**end repeat1**/
-
-
 /**end repeat**/
 
+
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble, npy_float#

diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,19 @@ NPY_NO_EXPORT void
  */
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, fma#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat2**/
+/**end repeat1**/
 /**end repeat**/
 
 /**begin repeat
@@ -193,6 +206,26 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 /**end repeat1**/
 /**end repeat**/
 
+/**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+/**begin repeat2
+ * #isa = avx512f, fma#
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
 /**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#