numpy · seiko2plus · Jun 18, 2020 · Jun 18, 2020 · Jun 18, 2020 · Jun 18, 2020
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -48,8 +48,11 @@ class TypeDescription:
     simd: list
         Available SIMD ufunc loops, dispatched at runtime in specified order
         Currently only supported for simples types (see make_arrays)
+    dispatch: list
+        Available SIMD ufunc loops, dispatched at runtime in specified order
+        Currently only supported for simples types (see make_arrays)
     """
-    def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
+    def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
         self.type = type
         self.func_data = f
         if astype is None:
@@ -62,6 +65,15 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
             out = out.replace('P', type)
         self.out = out
         self.simd = simd
+        """
+        TODO:
+            - remove 'SIMD' after integrate the rest of simd loops
+            - support astype, this can be done by cover each func in NPYV intrinsics,
+              also, SIMD kernels of PyUFunc_*_As_* must handle remained scalars as vectors,
+              since func_data only takes one ptr for each type,
+              a lot of work I know but it will leave a good memory.
+        """
+        self.dispatch = dispatch
 
     def finish_signature(self, nin, nout):
         if self.in_ is None:
@@ -86,7 +98,7 @@ def build_func_data(types, f):
     func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
     return func_data
 
-def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -115,7 +127,14 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
             simdt = [k for k, v in simd if t in v]
         else:
             simdt = []
-        tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
+        # [(dispatch file name without extension '.dispatch.c*', list of types)]
+        if dispatch:
+            dispt = [k for k, v in dispatch if t in v]
+        else:
+            dispt = []
+        tds.append(TypeDescription(
+            t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
+        ))
     return tds
 
 class Ufunc:
@@ -269,7 +288,8 @@ def english_upper(s):
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec)],
+                             dispatch=[('loops_fast', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -280,7 +300,8 @@ def english_upper(s):
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx512f', cmplxvec)],
+                             dispatch=[('loops_fast', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -291,7 +312,8 @@ def english_upper(s):
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec)],
+                             dispatch=[('loops_fast', ints)]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -326,7 +348,8 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
+          TD(ints+flts+cmplx, simd=[('avx512f', cmplxvec)],
+                              dispatch=[('loops_fast', ints)]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -341,14 +364,16 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+          TD(ints+inexact, simd=[('fma', 'fd'), ('avx512f', 'FDfd')],
+                           dispatch=[('loops_fast', ints)]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+          TD(ints+inexact, simd=[('fma', 'fd'), ('avx512f','fd')],
+                           dispatch=[('loops_fast', ints)]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -392,7 +417,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
+          TD(ints+flts+timedeltaonly, dispatch=[('loops_fast', ints)]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -414,71 +439,71 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'greater_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'less':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'less_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'not_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_fast', ints)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'logical_and':
     Ufunc(2, 1, True_,
           docstrings.get('numpy.core.umath.logical_and'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalAnd'),
           TD(O, f='npy_ObjectLogicalAnd', out='?'),
           ),
 'logical_not':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.logical_not'),
           None,
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalNot'),
           TD(O, f='npy_ObjectLogicalNot', out='?'),
           ),
 'logical_or':
     Ufunc(2, 1, False_,
           docstrings.get('numpy.core.umath.logical_or'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_fast', ints)]),
           TD(O, f='npy_ObjectLogicalOr'),
           TD(O, f='npy_ObjectLogicalOr', out='?'),
           ),
@@ -540,42 +565,42 @@ def english_upper(s):
     Ufunc(2, 1, AllOnes,
           docstrings.get('numpy.core.umath.bitwise_and'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_And'),
           ),
 'bitwise_or':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_or'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Or'),
           ),
 'bitwise_xor':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_xor'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Xor'),
           ),
 'invert':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.invert'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD(bints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Invert'),
           ),
 'left_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.left_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Lshift'),
           ),
 'right_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.right_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_fast', ints)]),
           TD(O, f='PyNumber_Rshift'),
           ),
 'heaviside':
@@ -1024,6 +1049,16 @@ def make_arrays(funcdict):
                             ISA=vt.upper(), isa=vt,
                             fname=name, type=tname, idx=k
                         ))
+                if t.dispatch is not None:
+                    for dname in t.dispatch:
+                        code2list.append(textwrap.dedent("""\
+                        #ifndef NPY_DISABLE_OPTIMIZATION
+                        #include "{dname}.dispatch.h"
+                        #endif
+                        NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name})
+                        """).format(
+                            dname=dname, name=name, tname=tname, k=k
+                        ))
             else:
                 funclist.append('NULL')
                 try:

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -906,6 +906,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'scalarmath.c.src'),
             join('src', 'umath', 'ufunc_type_resolution.c'),
             join('src', 'umath', 'override.c'),
+            join('src', 'umath', 'loops_fast.dispatch.c.src'),
             ]
 
     umath_deps = [