From 7bd6de3d0947c5078c9524739dc08bb5eac49c1e Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Thu, 10 Dec 2020 23:53:57 +0000 Subject: [PATCH 1/4] MAINT, SIMD: reduce the number of preprocessor CPU runtime dispatcher calls --- numpy/core/code_generators/generate_umath.py | 38 ++++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index cb1147b93fba..fd0051ee9dc4 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -48,8 +48,9 @@ class TypeDescription: simd: list Available SIMD ufunc loops, dispatched at runtime in specified order Currently only supported for simples types (see make_arrays) - dispatch: list - Available SIMD ufunc loops, dispatched at runtime in specified order + dispatch: str or None, optional + Dispatch-able source name without its extension '.dispatch.c' that contains the definition of ufunc, + dispatched at runtime depending on the specified targets of the dispatch-able source. Currently only supported for simples types (see make_arrays) """ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None): @@ -119,11 +120,12 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None) simdt = [k for k, v in simd if t in v] else: simdt = [] + # [(dispatch file name without extension '.dispatch.c*', list of types)] if dispatch: - dispt = [k for k, v in dispatch if t in v] + dispt = ([k for k, v in dispatch if t in v]+[None])[0] else: - dispt = [] + dispt = None tds.append(TypeDescription( t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt )) @@ -1000,6 +1002,7 @@ def make_arrays(funcdict): # later code1list = [] code2list = [] + dispdict = {} names = sorted(funcdict.keys()) for name in names: uf = funcdict[name] @@ -1010,6 +1013,7 @@ def make_arrays(funcdict): sub = 0 for t in uf.type_descriptions: + if t.func_data is FullTypeDescr: tname = english_upper(chartoname[t.type]) datalist.append('(void *)NULL') @@ -1023,7 +1027,8 @@ def make_arrays(funcdict): elif t.func_data is None: datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append('%s_%s' % (tname, name)) + cfunc_name = f"{tname}_{name}" + funclist.append(cfunc_name) if t.simd is not None: for vt in t.simd: code2list.append(textwrap.dedent("""\ @@ -1036,16 +1041,8 @@ def make_arrays(funcdict): ISA=vt.upper(), isa=vt, fname=name, type=tname, idx=k )) - if t.dispatch is not None: - for dname in t.dispatch: - code2list.append(textwrap.dedent("""\ - #ifndef NPY_DISABLE_OPTIMIZATION - #include "{dname}.dispatch.h" - #endif - NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name}); - """).format( - dname=dname, name=name, tname=tname, k=k - )) + if t.dispatch: + dispdict.setdefault(t.dispatch, []).append((tname, k, cfunc_name)) else: funclist.append('NULL') try: @@ -1091,6 +1088,17 @@ def make_arrays(funcdict): % (name, datanames)) code1list.append("static char %s_signatures[] = {%s};" % (name, signames)) + + for dname, funcs in dispdict.items(): + code2list.append(textwrap.dedent(f""" + #ifndef NPY_DISABLE_OPTIMIZATION + #include "{dname}.dispatch.h" + #endif + """)) + for (ufunc_name, func_idx, cfunc_name) in funcs: + code2list.append(textwrap.dedent(f"""\ + NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name}); + """)) return "\n".join(code1list), "\n".join(code2list) def make_ufuncs(funcdict): From 
d084917e8f884f43ee172117fe516244ebe728b5 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 11 Dec 2020 19:38:05 +0000 Subject: [PATCH 2/4] ENH: add new option 'cfunc_alias' to umath generator --- numpy/core/code_generators/generate_umath.py | 39 ++++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index fd0051ee9dc4..14b5488762a4 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -45,15 +45,19 @@ class TypeDescription: astype : dict or None, optional If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y instead of PyUFunc_x_x/PyUFunc_xx_x. + cfunc_alias : str or none, optional + replaces the suffix of C function name instead of using ufunc_name, + e.g. "FLOAT_{cfunc_alias}" instead of "FLOAT_{ufunc_name}" (see make_arrays) + NOTE: it doesn't support 'astype' simd: list Available SIMD ufunc loops, dispatched at runtime in specified order Currently only supported for simples types (see make_arrays) dispatch: str or None, optional Dispatch-able source name without its extension '.dispatch.c' that contains the definition of ufunc, dispatched at runtime depending on the specified targets of the dispatch-able source. - Currently only supported for simples types (see make_arrays) + NOTE: it doesn't support 'astype' """ - def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None): + def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None, simd=None, dispatch=None): self.type = type self.func_data = f if astype is None: @@ -65,6 +69,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dis if out is not None: out = out.replace('P', type) self.out = out + self.cfunc_alias = cfunc_alias self.simd = simd self.dispatch = dispatch @@ -91,7 +96,7 @@ def build_func_data(types, f): func_data = [_fdata_map.get(t, '%s') % (f,) for t in types] return func_data -def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None): +def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=None, dispatch=None): if f is not None: if isinstance(f, str): func_data = build_func_data(types, f) @@ -127,7 +132,7 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None) else: dispt = None tds.append(TypeDescription( - t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt + t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias, simd=simdt, dispatch=dispt )) return tds @@ -1013,38 +1018,33 @@ def make_arrays(funcdict): sub = 0 for t in uf.type_descriptions: - + cfunc_alias = t.cfunc_alias if t.cfunc_alias else name + cfunc_fname = None if t.func_data is FullTypeDescr: tname = english_upper(chartoname[t.type]) datalist.append('(void *)NULL') - funclist.append( - '%s_%s_%s_%s' % (tname, t.in_, t.out, name)) + cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}" elif isinstance(t.func_data, FuncNameSuffix): datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append( - '%s_%s_%s' % (tname, name, t.func_data.suffix)) + cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}" elif t.func_data is None: datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - cfunc_name = f"{tname}_{name}" - funclist.append(cfunc_name) + cfunc_fname = f"{tname}_{cfunc_alias}" if t.simd is not None: for vt in t.simd: 
code2list.append(textwrap.dedent("""\ #ifdef HAVE_ATTRIBUTE_TARGET_{ISA} if (NPY_CPU_HAVE({ISA})) {{ - {fname}_functions[{idx}] = {type}_{fname}_{isa}; + {fname}_functions[{idx}] = {cname}_{isa}; }} #endif """).format( ISA=vt.upper(), isa=vt, - fname=name, type=tname, idx=k + fname=name, cname=cfunc_fname, idx=k )) - if t.dispatch: - dispdict.setdefault(t.dispatch, []).append((tname, k, cfunc_name)) else: - funclist.append('NULL') try: thedict = arity_lookup[uf.nin, uf.nout] except KeyError as e: @@ -1074,6 +1074,13 @@ def make_arrays(funcdict): #datalist.append('(void *)%s' % t.func_data) sub += 1 + if cfunc_fname: + funclist.append(cfunc_fname) + if t.dispatch: + dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname)) + else: + funclist.append('NULL') + for x in t.in_ + t.out: siglist.append('NPY_%s' % (english_upper(chartoname[x]),)) From 0985a73ffa4090b862829b92bf9df09bb2783efc Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 11 Dec 2020 19:45:12 +0000 Subject: [PATCH 3/4] ENH, SIMD: Ditching the old CPU dispatcher (Arithmetic) This is the first patch in a series of pull requests that aims to facilitate the migration to our new SIMD interface (NPYV). The process focuses on getting rid of the main umath SIMD source `simd.inc`, which contains almost all SIMD kernels, by splitting it into several dispatch-able sources without changing the base code; keeping each pull request small eases review and speeds up reaching the end goal. This patch moves the arithmetic operations of real and complex single/double precision to the new CPU dispatcher. NOTE: the AVX2 and AVX512F SIMD code for single/double precision was not dispatched at runtime before this change. --- .gitignore | 2 + numpy/core/code_generators/generate_umath.py | 31 +- numpy/core/setup.py | 2 + numpy/core/src/umath/fast_loop_macros.h | 112 +++ numpy/core/src/umath/loops.c.src | 258 +----- numpy/core/src/umath/loops.h.src | 53 +- .../src/umath/loops_arithm_fp.dispatch.c.src | 777 ++++++++++++++++++ numpy/core/src/umath/loops_utils.h | 42 - numpy/core/src/umath/loops_utils.h.src | 213 +++++ numpy/core/src/umath/simd.inc.src | 669 +-------------- 10 files changed, 1192 insertions(+), 967 deletions(-) create mode 100644 numpy/core/src/umath/loops_arithm_fp.dispatch.c.src delete mode 100644 numpy/core/src/umath/loops_utils.h create mode 100644 numpy/core/src/umath/loops_utils.h.src diff --git a/.gitignore b/.gitignore index f85c577647c4..18317b315027 100644 --- a/.gitignore +++ b/.gitignore @@ -172,6 +172,7 @@ numpy/core/src/umath/simd.inc numpy/core/src/umath/struct_ufunc_test.c numpy/core/src/umath/test_rational.c numpy/core/src/umath/umath_tests.c +numpy/core/src/umath/loops_utils.h numpy/distutils/__config__.py numpy/linalg/umath_linalg.c doc/source/**/generated/ @@ -218,3 +219,4 @@ numpy/core/src/_simd/_simd_data.inc numpy/core/src/_simd/_simd_inc.h # umath module numpy/core/src/umath/loops_unary_fp.dispatch.c +numpy/core/src/umath/loops_arithm_fp.dispatch.c diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 14b5488762a4..cdd2a84941fa 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -46,18 +46,19 @@ class TypeDescription: If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y instead of PyUFunc_x_x/PyUFunc_xx_x. cfunc_alias : str or none, optional - replaces the suffix of C function name instead of using ufunc_name, - e.g.
"FLOAT_{cfunc_alias}" instead of "FLOAT_{ufunc_name}" (see make_arrays) + appended to inner loop C function name, e.g. FLOAT_{cfunc_alias} (see make_arrays) NOTE: it doesn't support 'astype' simd: list Available SIMD ufunc loops, dispatched at runtime in specified order Currently only supported for simples types (see make_arrays) dispatch: str or None, optional - Dispatch-able source name without its extension '.dispatch.c' that contains the definition of ufunc, - dispatched at runtime depending on the specified targets of the dispatch-able source. + Dispatch-able source name without its extension '.dispatch.c' that + contains the definition of ufunc, dispatched at runtime depending on the + specified targets of the dispatch-able source. NOTE: it doesn't support 'astype' """ - def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None, simd=None, dispatch=None): + def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None, + simd=None, dispatch=None): self.type = type self.func_data = f if astype is None: @@ -96,7 +97,8 @@ def build_func_data(types, f): func_data = [_fdata_map.get(t, '%s') % (f,) for t in types] return func_data -def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=None, dispatch=None): +def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, + simd=None, dispatch=None): if f is not None: if isinstance(f, str): func_data = build_func_data(types, f) @@ -132,7 +134,8 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=No else: dispt = None tds.append(TypeDescription( - t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias, simd=simdt, dispatch=dispt + t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias, + simd=simdt, dispatch=dispt )) return tds @@ -287,7 +290,7 @@ def english_upper(s): Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.add'), 'PyUFunc_AdditionTypeResolver', - TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'mM', 'M'), @@ -298,7 +301,7 @@ def english_upper(s): Ufunc(2, 1, None, # Zero is only a unit to the right, not the left docstrings.get('numpy.core.umath.subtract'), 'PyUFunc_SubtractionTypeResolver', - TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'MM', 'm'), @@ -309,7 +312,7 @@ def english_upper(s): Ufunc(2, 1, One, docstrings.get('numpy.core.umath.multiply'), 'PyUFunc_MultiplicationTypeResolver', - TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'qm', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), @@ -333,10 +336,10 @@ def english_upper(s): Ufunc(2, 1, None, # One is only a unit to the right, not the left docstrings.get('numpy.core.umath.true_divide'), 'PyUFunc_TrueDivisionTypeResolver', - TD(flts+cmplx), - [TypeDescription('m', FullTypeDescr, 'mq', 'm'), - TypeDescription('m', FullTypeDescr, 'md', 'm'), - TypeDescription('m', FullTypeDescr, 'mm', 'd'), + 
TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]), + [TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'), + TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'), + TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'), ], TD(O, f='PyNumber_TrueDivide'), ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2ec5e1a64b37..2e020a595c14 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -916,8 +916,10 @@ def generate_umath_c(ext, build_dir): join('src', 'umath', 'funcs.inc.src'), join('src', 'umath', 'simd.inc.src'), join('src', 'umath', 'loops.h.src'), + join('src', 'umath', 'loops_utils.h.src'), join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h.src'), diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index 5c22c6f1c2f8..dbcff8793506 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b) TYPE io1 = *(TYPE *)iop1; \ BINARY_REDUCE_LOOP_INNER +#define IS_BINARY_STRIDE_ONE(esize, vsize) \ + ((steps[0] == esize) && \ + (steps[1] == esize) && \ + (steps[2] == esize) && \ + (abs_ptrdiff(args[2], args[0]) >= vsize) && \ + (abs_ptrdiff(args[2], args[1]) >= vsize)) + +/* + * stride is equal to element size and input and destination are equal or + * don't overlap within one register. The check of the steps against + * esize also guarantees that steps are >= 0. + */ +#define IS_BLOCKABLE_UNARY(esize, vsize) \ + (steps[0] == (esize) && steps[0] == steps[1] && \ + (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ + ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ + ((abs_ptrdiff(args[1], args[0]) == 0)))) + +/* + * Avoid using SIMD for very large step sizes for several reasons: + * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions, + * in which case we need two i64gather instructions and an additional vinsertf32x8 + * instruction to load a single zmm register (since one i64gather instruction + * loads into a ymm register). This is not ideal for performance. + * 2) Gather and scatter instructions can be slow when the loads/stores + * cross page boundaries. + * + * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index + * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE + * ensures this. The condition also requires that the input and output arrays + * should have no overlap in memory.
+ */ +#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ + ((labs(steps[0]) < MAX_STEP_SIZE) && \ + (labs(steps[1]) < MAX_STEP_SIZE) && \ + (labs(steps[2]) < MAX_STEP_SIZE) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) + +#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \ + ((labs(steps[0]) < MAX_STEP_SIZE) && \ + (labs(steps[1]) < MAX_STEP_SIZE) && \ + (labs(steps[2]) < MAX_STEP_SIZE) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0]))) + +/* + * 1) Output should be contiguous, can handle strided input data + * 2) Input step should be smaller than MAX_STEP_SIZE for performance + * 3) Input and output arrays should have no overlap in memory + */ +#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \ + ((steps[0] & (esizein-1)) == 0 && \ + steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) + +#define IS_BLOCKABLE_REDUCE(esize, vsize) \ + (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ + npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize))) + +#define IS_BLOCKABLE_BINARY(esize, vsize) \ + (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize)) && \ + (abs_ptrdiff(args[2], args[0]) >= (vsize) || \ + abs_ptrdiff(args[2], args[0]) == 0) && \ + (abs_ptrdiff(args[2], args[1]) >= (vsize) || \ + abs_ptrdiff(args[2], args[1]) >= 0)) + +#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ + (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ + ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \ + (abs_ptrdiff(args[2], args[1]) == 0)) && \ + abs_ptrdiff(args[2], args[0]) >= (esize)) + +#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ + (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ + npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ + ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \ + (abs_ptrdiff(args[2], args[0]) == 0)) && \ + abs_ptrdiff(args[2], args[1]) >= (esize)) + +#undef abs_ptrdiff + +#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ + (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ + npy_is_aligned(args[1], (esize)) && \ + npy_is_aligned(args[0], (esize))) + +#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ + (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ + npy_is_aligned(args[1], (esize))) + +#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ + (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ + npy_is_aligned(args[0], (esize))) + +/* align var to alignment */ +#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ + npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ + alignment, n);\ + for(i = 0; i < peel; i++) + +#define LOOP_BLOCKED(type, vsize)\ + for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ + i += (vsize / sizeof(type))) + +#define LOOP_BLOCKED_END\ + for (; i < n; i++) + #endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 6403efaeee17..839d2b3ae2bb 
100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1795,86 +1795,6 @@ DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step } } -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble, npy_float# - * #dtype = npy_float, npy_double, npy_longdouble, npy_half# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF# - * #c = f, , l, # - * #C = F, , L, # - * #trf = , , , npy_half_to_float# - */ - -/* - * Pairwise summation, rounding error O(lg n) instead of O(n). - * The recursion depth is O(lg n) as well. - * when updating also update similar complex floats summation - */ -static @type@ -pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) -{ - if (n < 8) { - npy_intp i; - @type@ res = 0.; - - for (i = 0; i < n; i++) { - res += @trf@(*((@dtype@*)(a + i * stride))); - } - return res; - } - else if (n <= PW_BLOCKSIZE) { - npy_intp i; - @type@ r[8], res; - - /* - * sum a block with 8 accumulators - * 8 times unroll reduces blocksize to 16 and allows vectorization with - * avx without changing summation ordering - */ - r[0] = @trf@(*((@dtype@ *)(a + 0 * stride))); - r[1] = @trf@(*((@dtype@ *)(a + 1 * stride))); - r[2] = @trf@(*((@dtype@ *)(a + 2 * stride))); - r[3] = @trf@(*((@dtype@ *)(a + 3 * stride))); - r[4] = @trf@(*((@dtype@ *)(a + 4 * stride))); - r[5] = @trf@(*((@dtype@ *)(a + 5 * stride))); - r[6] = @trf@(*((@dtype@ *)(a + 6 * stride))); - r[7] = @trf@(*((@dtype@ *)(a + 7 * stride))); - - for (i = 8; i < n - (n % 8); i += 8) { - /* small blocksizes seems to mess with hardware prefetch */ - NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3); - r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride))); - r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride))); - r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride))); - r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride))); - r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride))); - r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride))); - r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride))); - r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride))); - } - - /* accumulate now to avoid stack spills for single peel loop */ - res = ((r[0] + r[1]) + (r[2] + r[3])) + - ((r[4] + r[5]) + (r[6] + r[7])); - - /* do non multiple of 8 rest */ - for (; i < n; i++) { - res += @trf@(*((@dtype@ *)(a + i * stride))); - } - return res; - } - else { - /* divide by two but avoid non-multiples of unroll factor */ - npy_intp n2 = n / 2; - - n2 -= n2 % 8; - return pairwise_sum_@TYPE@(a, n2, stride) + - pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride); - } -} - -/**end repeat**/ - /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -1882,39 +1802,6 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) * #c = f, , l# * #C = F, , L# */ - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } - else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - 
} -} -/**end repeat1**/ - /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, * logical_and, logical_or# @@ -2244,8 +2131,6 @@ NPY_NO_EXPORT void } } -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ /* @@ -2254,6 +2139,38 @@ NPY_NO_EXPORT void ***************************************************************************** */ +/**begin repeat + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + */ +NPY_NO_EXPORT void +LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { +#if @PW@ + npy_longdouble * iop1 = (npy_longdouble *)args[0]; + npy_intp n = dimensions[0]; + + *iop1 @OP@= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]); +#else + BINARY_REDUCE_LOOP(npy_longdouble) { + io1 @OP@= *(npy_longdouble *)ip2; + } + *((npy_longdouble *)iop1) = io1; +#endif + } + else { + BINARY_LOOP { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + const npy_longdouble in2 = *(npy_longdouble *)ip2; + *((npy_longdouble *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat**/ + NPY_NO_EXPORT void LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -2306,7 +2223,7 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void #if @PW@ npy_intp n = dimensions[0]; - io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]); + io1 @OP@= HALF_pairwise_sum(args[1], n, steps[1]); #else BINARY_REDUCE_LOOP_INNER { io1 @OP@= npy_half_to_float(*(npy_half *)ip2); @@ -2614,9 +2531,6 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, } } -#define HALF_true_divide HALF_divide - - /* ***************************************************************************** ** COMPLEX LOOPS ** @@ -2643,80 +2557,8 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, * #SIMD = 1, 1, 0# */ -/* similar to pairwise sum of real floats */ -static void -pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, - npy_intp stride) -{ - assert(n % 2 == 0); - if (n < 8) { - npy_intp i; - - *rr = 0.; - *ri = 0.; - for (i = 0; i < n; i += 2) { - *rr += *((@ftype@ *)(a + i * stride + 0)); - *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); - } - return; - } - else if (n <= PW_BLOCKSIZE) { - npy_intp i; - @ftype@ r[8]; - - /* - * sum a block with 8 accumulators - * 8 times unroll reduces blocksize to 16 and allows vectorization with - * avx without changing summation ordering - */ - r[0] = *((@ftype@ *)(a + 0 * stride)); - r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@))); - r[2] = *((@ftype@ *)(a + 2 * stride)); - r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@))); - r[4] = *((@ftype@ *)(a + 4 * stride)); - r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@))); - r[6] = *((@ftype@ *)(a + 6 * stride)); - r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@))); - - for (i = 8; i < n - (n % 8); i += 8) { - /* small blocksizes seems to mess with hardware prefetch */ - NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3); - r[0] += *((@ftype@ *)(a + (i + 0) * stride)); - r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); - r[2] += *((@ftype@ *)(a + (i + 2) * stride)); - r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); - r[4] += *((@ftype@ *)(a + (i + 4) * stride)); - r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); - r[6] += *((@ftype@ *)(a + (i + 6) * stride)); 
- r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@))); - } - - /* accumulate now to avoid stack spills for single peel loop */ - *rr = ((r[0] + r[2]) + (r[4] + r[6])); - *ri = ((r[1] + r[3]) + (r[5] + r[7])); - - /* do non multiple of 8 rest */ - for (; i < n; i+=2) { - *rr += *((@ftype@ *)(a + i * stride + 0)); - *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); - } - return; - } - else { - /* divide by two but avoid non-multiples of unroll factor */ - @ftype@ rr1, ri1, rr2, ri2; - npy_intp n2 = n / 2; - - n2 -= n2 % 8; - pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride); - pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride); - *rr = rr1 + rr2; - *ri = ri1 + ri2; - return; - } -} - - +#if !@SIMD@ +// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src' /**begin repeat1 * arithmetic * #kind = add, subtract# @@ -2733,7 +2575,7 @@ NPY_NO_EXPORT void @ftype@ * oi = ((@ftype@ *)args[0]) + 1; @ftype@ rr, ri; - pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2); + @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); *or @OP@= rr; *oi @OP@= ri; return; @@ -2763,6 +2605,7 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; } } +#endif // !SIMD NPY_NO_EXPORT void @TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -2796,31 +2639,6 @@ NPY_NO_EXPORT void } } -#if @SIMD@ -NPY_NO_EXPORT void -@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (IS_BINARY_REDUCE) { - @TYPE@_add(args, dimensions, steps, func); - } - else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) { - @TYPE@_add(args, dimensions, steps, func); - } -} - -/**begin repeat1 - * arithmetic - * #kind = subtract, multiply# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - @TYPE@_@kind@(args, dimensions, steps, func); - } -} -/**end repeat1**/ -#endif NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -3066,8 +2884,6 @@ NPY_NO_EXPORT void } /**end repeat1**/ -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ #undef CGE diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index a0b68d168b19..c15ff8e3b75d 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -185,6 +185,22 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #TYPE = FLOAT, DOUBLE# */ @@ -356,9 +372,6 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -#define @TYPE@_true_divide @TYPE@_divide - /**end repeat**/ @@ -367,6 +380,19 @@ NPY_NO_EXPORT void ** COMPLEX LOOPS ** ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" 
+#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ #define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); #define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); @@ -380,26 +406,14 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #IFSIMD = 1, 1, 0# */ /**begin repeat1 - * #isa = , _avx512f# - */ - -/**begin repeat2 * arithmetic - * #kind = add, subtract# - * #OP = +, -# + * #kind = add, subtract, multiply# */ - NPY_NO_EXPORT void -C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat2**/ - -NPY_NO_EXPORT void -C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat1**/ NPY_NO_EXPORT void @@ -480,7 +494,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v NPY_NO_EXPORT void C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -#define C@TYPE@_true_divide C@TYPE@_divide /**end repeat**/ @@ -593,10 +606,6 @@ NPY_NO_EXPORT void TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /* Special case equivalents to above functions */ - -#define TIMEDELTA_mq_m_true_divide TIMEDELTA_mq_m_divide -#define TIMEDELTA_md_m_true_divide TIMEDELTA_md_m_divide -#define TIMEDELTA_mm_d_true_divide TIMEDELTA_mm_d_divide #define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src new file mode 100644 index 000000000000..d8c8fdc9e41e --- /dev/null +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -0,0 +1,777 @@ +/*@targets + ** $maxopt baseline + ** sse2 avx2 avx512f + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +// TODO: replace raw SIMD with NPYV +//############################################################################### +//## Real Single/Double precision +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#ifdef NPY_HAVE_SSE2 +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #scalarf = npy_sqrtf, npy_sqrt# + * #c = f, # + * #vtype = __m128, __m128d# + * #vtype256 = __m256, __m256d# + * #vtype512 = __m512, __m512d# + * #vpre = _mm, _mm# + * #vpre256 = _mm256, _mm256# + * #vpre512 = _mm512, _mm512# + * #vsuf = ps, pd# + * #vsufs = ss, sd# + * #nan = NPY_NANF, NPY_NAN# + * #double = 0, 1# + * #cast = _mm_castps_si128, _mm_castpd_si128# + */ +/**begin repeat1 +* 
Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ +static void +sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + } +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && + npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); + 
@vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + } +#else + const npy_intp vector_size_bytes = 16; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (npy_is_aligned(&ip1[i], vector_size_bytes) && + npy_is_aligned(&ip2[i], vector_size_bytes)) { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } + else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + +static void +sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + + +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } +#else + const npy_intp vector_size_bytes = 16; + const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, 
@type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + if (npy_is_aligned(&ip2[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + +static void +sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); + @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); + @vpre512@_store_@vsuf@(&op[i], c); + } + } + +#elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); + @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); + @vpre256@_store_@vsuf@(&op[i], c); + } + } +#else + const npy_intp vector_size_bytes = 16; + const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + if (npy_is_aligned(&ip1[i], vector_size_bytes)) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); + @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); + @vpre@_store_@vsuf@(&op[i], c); + } + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ +/**end repeat**/ + +#else // NPY_HAVE_SSE2 + +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #CHK = , _F64# + */ +#if NPY_SIMD@CHK@ +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ + +static void +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[i]; + } + /* lots of specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); + npyv_store_@sfx@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); + npyv_store_@sfx@(&op[i], c); + } + } + 
LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + +static void +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[0] @OP@ ip2[i]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[0]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} +/**end repeat1**/ +#endif /* NPY_SIMD@CHK@ */ +/**end repeat**/ +#endif // NPY_HAVE_SSE2 + +/**begin repeat + * Float types + * #type = npy_float, npy_double, npy_longdouble# + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #vector = 1, 1, 0# + * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + */ +static NPY_INLINE int +run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if @vector@ && defined NPY_HAVE_SSE2 + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; +#if defined NPY_HAVE_AVX512F + const npy_uintp vector_size_bytes = 64; +#elif defined NPY_HAVE_AVX2 + const npy_uintp vector_size_bytes = 32; +#else + const npy_uintp vector_size_bytes = 32; +#endif + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { + sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { + sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { + sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#elif @VECTOR@ + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #c = f, # + * #C = F, # + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, 
npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { +#if @PW@ + @type@ * iop1 = (@type@ *)args[0]; + npy_intp n = dimensions[0]; + + *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]); +#else + BINARY_REDUCE_LOOP(@type@) { + io1 @OP@= *(@type@ *)ip2; + } + *((@type@ *)iop1) = io1; +#endif + } + else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat1**/ +/**end repeat**/ + +//############################################################################### +//## Complex Single/Double precision +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F) + /** + * For somehow MSVC commit aggressive optimization lead + * to raises 'RuntimeWarning: invalid value encountered in multiply' + * + * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to + * investigate about it while moving to NPYV. + */ + #define AVX512F_NOMSVC +#endif + +#ifdef AVX512F_NOMSVC +static NPY_INLINE __mmask16 +avx512_get_full_load_mask_ps(void) +{ + return 0xFFFF; +} + +static NPY_INLINE __mmask8 +avx512_get_full_load_mask_pd(void) +{ + return 0xFF; +} +static NPY_INLINE __m512 +avx512_masked_load_ps(__mmask16 mask, npy_float* addr) +{ + return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); +} + +static NPY_INLINE __m512d +avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +{ + return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +{ + return (0x0001 << num_elem) - 0x0001; +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) +{ + return (0x01 << num_elem) - 0x01; +} +/**begin repeat + * #vsub = ps, pd# + * #type= npy_float, npy_double# + * #epi_vsub = epi32, epi64# + * #vtype = __m512, __m512d# + * #mask = __mmask16, __mmask8# + * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# + * #neg_mask = 0x80000000, 0x8000000000000000# + * #perm_ = 0xb1, 0x55# + * #cmpx_img_mask = 0xAAAA, 0xAA# + * #cmpx_re_mask = 0x5555, 0x55# + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# + */ +static @vtype@ +avx512_hadd_@vsub@(const @vtype@ x) +{ + return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} + +static @vtype@ +avx512_hsub_@vsub@(const @vtype@ x) +{ + return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} +static NPY_INLINE @vtype@ +avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) +{ + // x1 = r1, i1 + // x2 = r2, i2 + @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 + @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 + @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 + @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 + @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 + return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); +} +/**end repeat**/ +#endif + +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type = npy_float, npy_double# + * #num_lanes = 16, 8# + * 
#vsuffix = ps, pd# + * #epi_vsub = epi32, epi64# + * #mask = __mmask16, __mmask8# + * #vtype = __m512, __m512d# + * #scale = 4, 8# + * #vindextype = __m512i, __m256i# + * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# + * #storemask = 0xFF, 0xF# + * #IS_FLOAT = 1, 0# + */ +/**begin repeat1 + * #func = add, subtract, multiply# + * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# + */ +#if defined AVX512F_NOMSVC +static NPY_INLINE void +AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ + const npy_intp array_size = dimensions[0]; + npy_intp num_remaining_elements = 2*array_size; + @type@* ip1 = (@type@*) args[0]; + @type@* ip2 = (@type@*) args[1]; + @type@* op = (@type@*) args[2]; + + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype@ x1, x2; + x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); + x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); + + @vtype@ out = @vectorf@_@vsuffix@(x1, x2); + + _mm512_mask_storeu_@vsuffix@(op, load_mask, out); + + ip1 += @num_lanes@; + ip2 += @num_lanes@; + op += @num_lanes@; + num_remaining_elements -= @num_lanes@; + } +} +#endif // AVX512F_NOMSVC +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type= npy_float, npy_double# + * #esize = 8, 16# + */ +/**begin repeat1 + * #func = add, subtract, multiply# + */ +static NPY_INLINE int +run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ +#if defined AVX512F_NOMSVC + if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { + AVX512F_@func@_@TYPE@(args, dimensions, steps); + return 1; + } + else + return 0; +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + * #ftype = npy_float, npy_double# + * #c = f, # + * #C = F, # + */ +/**begin repeat1 + * arithmetic + * #kind = add, subtract# + * #OP = +, -# + * #PW = 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + // Parenthesis around @PW@ tells clang dead code is intentional + if (IS_BINARY_REDUCE && (@PW@)) { + npy_intp n = dimensions[0]; + @ftype@ * or = ((@ftype@ *)args[0]); + @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + @ftype@ rr, ri; + + @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); + *or @OP@= rr; + *oi @OP@= ri; + return; + } + if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + ((@ftype@ *)op1)[0] = in1r @OP@ in2r; + ((@ftype@ *)op1)[1] = in1i @OP@ in2i; + } + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ 
*)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; + ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + } + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h deleted file mode 100644 index f5540bdae2fd..000000000000 --- a/numpy/core/src/umath/loops_utils.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _NPY_UMATH_LOOPS_UTILS_H_ -#define _NPY_UMATH_LOOPS_UTILS_H_ - -#include "numpy/npy_common.h" // NPY_FINLINE -/* - * nomemoverlap - returns false if two strided arrays have an overlapping - * region in memory. ip_size/op_size = size of the arrays which can be negative - * indicating negative steps. - */ -NPY_FINLINE npy_bool -nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size) -{ - char *ip_start, *ip_end, *op_start, *op_end; - if (ip_size < 0) { - ip_start = ip + ip_size; - ip_end = ip; - } - else { - ip_start = ip; - ip_end = ip + ip_size; - } - if (op_size < 0) { - op_start = op + op_size; - op_end = op; - } - else { - op_start = op; - op_end = op + op_size; - } - return (ip_start == op_start && op_end == ip_end) || - (ip_start > op_end) || (op_start > ip_end); -} - -// returns true if two strided arrays have an overlapping region in memory -// same as `nomemoverlap()` but requires array length and step sizes -NPY_FINLINE npy_bool -is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len) -{ - return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len)); -} - -#endif // _NPY_UMATH_LOOPS_UTILS_H_ diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src new file mode 100644 index 000000000000..dfa790ed9f34 --- /dev/null +++ b/numpy/core/src/umath/loops_utils.h.src @@ -0,0 +1,213 @@ +#ifndef _NPY_UMATH_LOOPS_UTILS_H_ +#define _NPY_UMATH_LOOPS_UTILS_H_ + +#include "numpy/npy_common.h" // NPY_FINLINE +#include "numpy/halffloat.h" // npy_half_to_float +/* + * nomemoverlap - returns false if two strided arrays have an overlapping + * region in memory. ip_size/op_size = size of the arrays which can be negative + * indicating negative steps. 
+ */ +NPY_FINLINE npy_bool +nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size) +{ + char *ip_start, *ip_end, *op_start, *op_end; + if (ip_size < 0) { + ip_start = ip + ip_size; + ip_end = ip; + } + else { + ip_start = ip; + ip_end = ip + ip_size; + } + if (op_size < 0) { + op_start = op + op_size; + op_end = op; + } + else { + op_start = op; + op_end = op + op_size; + } + return (ip_start == op_start && op_end == ip_end) || + (ip_start > op_end) || (op_start > ip_end); +} + +// returns true if two strided arrays have an overlapping region in memory +// same as `nomemoverlap()` but requires array length and step sizes +NPY_FINLINE npy_bool +is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len) +{ + return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len)); +} + +/* + * cutoff blocksize for pairwise summation + * decreasing it decreases errors slightly as more pairs are summed but + * also lowers performance, as the inner loop is unrolled eight times it is + * effectively 16 + */ +#define PW_BLOCKSIZE 128 + +/**begin repeat + * Float types + * #type = npy_float, npy_double, npy_longdouble, npy_float# + * #dtype = npy_float, npy_double, npy_longdouble, npy_half# + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF# + * #c = f, , l, # + * #C = F, , L, # + * #trf = , , , npy_half_to_float# + */ + +/* + * Pairwise summation, rounding error O(lg n) instead of O(n). + * The recursion depth is O(lg n) as well. + * when updating also update similar complex floats summation + */ +static NPY_INLINE @type@ +@TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride) +{ + if (n < 8) { + npy_intp i; + @type@ res = 0.; + + for (i = 0; i < n; i++) { + res += @trf@(*((@dtype@*)(a + i * stride))); + } + return res; + } + else if (n <= PW_BLOCKSIZE) { + npy_intp i; + @type@ r[8], res; + + /* + * sum a block with 8 accumulators + * 8 times unroll reduces blocksize to 16 and allows vectorization with + * avx without changing summation ordering + */ + r[0] = @trf@(*((@dtype@ *)(a + 0 * stride))); + r[1] = @trf@(*((@dtype@ *)(a + 1 * stride))); + r[2] = @trf@(*((@dtype@ *)(a + 2 * stride))); + r[3] = @trf@(*((@dtype@ *)(a + 3 * stride))); + r[4] = @trf@(*((@dtype@ *)(a + 4 * stride))); + r[5] = @trf@(*((@dtype@ *)(a + 5 * stride))); + r[6] = @trf@(*((@dtype@ *)(a + 6 * stride))); + r[7] = @trf@(*((@dtype@ *)(a + 7 * stride))); + + for (i = 8; i < n - (n % 8); i += 8) { + /* small blocksizes seems to mess with hardware prefetch */ + NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3); + r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride))); + r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride))); + r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride))); + r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride))); + r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride))); + r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride))); + r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride))); + r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride))); + } + + /* accumulate now to avoid stack spills for single peel loop */ + res = ((r[0] + r[1]) + (r[2] + r[3])) + + ((r[4] + r[5]) + (r[6] + r[7])); + + /* do non multiple of 8 rest */ + for (; i < n; i++) { + res += @trf@(*((@dtype@ *)(a + i * stride))); + } + return res; + } + else { + /* divide by two but avoid non-multiples of unroll factor */ + npy_intp n2 = n / 2; + + n2 -= n2 % 8; + return @TYPE@_pairwise_sum(a, n2, stride) + + @TYPE@_pairwise_sum(a + n2 * stride, n - n2, stride); + } +} + +/**end repeat**/ + 
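/*
 * A minimal, non-template sketch of the same pairwise scheme for a contiguous
 * double buffer (hypothetical helper, not used by the templated loops above;
 * it folds the 8-accumulator unrolled block into the plain base-case loop).
 * Halving the range keeps the two partial sums at similar magnitude, which is
 * why the rounding error grows as O(lg n) instead of O(n).
 */
static NPY_INLINE double
pairwise_sum_sketch(const double *a, npy_intp n)
{
    if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        double res = 0.;

        for (i = 0; i < n; i++) {
            res += a[i];
        }
        return res;
    }
    else {
        /* divide by two but avoid non-multiples of the unroll factor */
        npy_intp n2 = n / 2;

        n2 -= n2 % 8;
        return pairwise_sum_sketch(a, n2) +
               pairwise_sum_sketch(a + n2, n - n2);
    }
}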
+/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE# + * #ftype = npy_float, npy_double, npy_longdouble# + * #c = f, , l# + * #C = F, , L# + * #SIMD = 1, 1, 0# + */ +/* similar to pairwise sum of real floats */ +static NPY_INLINE void +@TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, + npy_intp stride) +{ + assert(n % 2 == 0); + if (n < 8) { + npy_intp i; + + *rr = 0.; + *ri = 0.; + for (i = 0; i < n; i += 2) { + *rr += *((@ftype@ *)(a + i * stride + 0)); + *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); + } + return; + } + else if (n <= PW_BLOCKSIZE) { + npy_intp i; + @ftype@ r[8]; + + /* + * sum a block with 8 accumulators + * 8 times unroll reduces blocksize to 16 and allows vectorization with + * avx without changing summation ordering + */ + r[0] = *((@ftype@ *)(a + 0 * stride)); + r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@))); + r[2] = *((@ftype@ *)(a + 2 * stride)); + r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@))); + r[4] = *((@ftype@ *)(a + 4 * stride)); + r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@))); + r[6] = *((@ftype@ *)(a + 6 * stride)); + r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@))); + + for (i = 8; i < n - (n % 8); i += 8) { + /* small blocksizes seems to mess with hardware prefetch */ + NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3); + r[0] += *((@ftype@ *)(a + (i + 0) * stride)); + r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); + r[2] += *((@ftype@ *)(a + (i + 2) * stride)); + r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); + r[4] += *((@ftype@ *)(a + (i + 4) * stride)); + r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); + r[6] += *((@ftype@ *)(a + (i + 6) * stride)); + r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@))); + } + + /* accumulate now to avoid stack spills for single peel loop */ + *rr = ((r[0] + r[2]) + (r[4] + r[6])); + *ri = ((r[1] + r[3]) + (r[5] + r[7])); + + /* do non multiple of 8 rest */ + for (; i < n; i+=2) { + *rr += *((@ftype@ *)(a + i * stride + 0)); + *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@))); + } + return; + } + else { + /* divide by two but avoid non-multiples of unroll factor */ + @ftype@ rr1, ri1, rr2, ri2; + npy_intp n2 = n / 2; + + n2 -= n2 % 8; + @TYPE@_pairwise_sum(&rr1, &ri1, a, n2, stride); + @TYPE@_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride); + *rr = rr1 + rr2; + *ri = ri1 + ri2; + return; + } +} +/**end repeat**/ + +#endif // _NPY_UMATH_LOOPS_UTILS_H_ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index a118fb0d091e..53bb4e0597f9 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,7 +28,6 @@ #undef __AVX512F__ #endif #endif -#include "simd/simd.h" #include "loops_utils.h" // nomemoverlap #include #include @@ -52,119 +51,6 @@ */ #define MAX_STEP_SIZE 2097152 -#define IS_BINARY_STRIDE_ONE(esize, vsize) \ - ((steps[0] == esize) && \ - (steps[1] == esize) && \ - (steps[2] == esize) && \ - (abs_ptrdiff(args[2], args[0]) >= vsize) && \ - (abs_ptrdiff(args[2], args[1]) >= vsize)) - -/* - * stride is equal to element size and input and destination are equal or - * don't overlap within one register. The check of the steps against - * esize also quarantees that steps are >= 0. 
- */ -#define IS_BLOCKABLE_UNARY(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && \ - (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ - ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ - ((abs_ptrdiff(args[1], args[0]) == 0)))) - -/* - * Avoid using SIMD for very large step sizes for several reasons: - * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions, - * in which case we need two i64gather instructions and an additional vinsertf32x8 - * instruction to load a single zmm register (since one i64gather instruction - * loads into a ymm register). This is not ideal for performance. - * 2) Gather and scatter instructions can be slow when the loads/stores - * cross page boundaries. - * - * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index - * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE - * ensures this. The condition also requires that the input and output arrays - * should have no overlap in memory. - */ -#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ - ((labs(steps[0]) < MAX_STEP_SIZE) && \ - (labs(steps[1]) < MAX_STEP_SIZE) && \ - (labs(steps[2]) < MAX_STEP_SIZE) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ - (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) - -#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \ - ((labs(steps[0]) < MAX_STEP_SIZE) && \ - (labs(steps[1]) < MAX_STEP_SIZE) && \ - (labs(steps[2]) < MAX_STEP_SIZE) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ - (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0]))) - -/* - * 1) Output should be contiguous, can handle strided input data - * 2) Input step should be smaller than MAX_STEP_SIZE for performance - * 3) Input and output arrays should have no overlap in memory - */ -#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \ - ((steps[0] & (esizein-1)) == 0 && \ - steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \ - (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) - -#define IS_BLOCKABLE_REDUCE(esize, vsize) \ - (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY(esize, vsize) \ - (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize)) && \ - (abs_ptrdiff(args[2], args[0]) >= (vsize) || \ - abs_ptrdiff(args[2], args[0]) == 0) && \ - (abs_ptrdiff(args[2], args[1]) >= (vsize) || \ - abs_ptrdiff(args[2], args[1]) >= 0)) - -#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \ - (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \ - ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \ - (abs_ptrdiff(args[2], args[1]) == 0)) && \ - abs_ptrdiff(args[2], args[0]) >= (esize)) - -#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \ - (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \ - npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \ - ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \ - (abs_ptrdiff(args[2], args[0]) == 0)) && \ - abs_ptrdiff(args[2], args[1]) >= (esize)) - -#undef abs_ptrdiff - -#define 
IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ - (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ - npy_is_aligned(args[0], (esize))) - -/* align var to alignment */ -#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ - npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ - alignment, n);\ - for(i = 0; i < peel; i++) - -#define LOOP_BLOCKED(type, vsize)\ - for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\ - i += (vsize / sizeof(type))) - -#define LOOP_BLOCKED_END\ - for (; i < n; i++) - - /* * Dispatcher functions * decide whether the operation can be vectorized and run it @@ -183,31 +69,6 @@ * #esize = 8, 16# */ -/**begin repeat1 - * #func = add, subtract, multiply# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps); -#endif - -static NPY_INLINE int -run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #func = square, absolute, conjugate# * #outsize = 1, 2, 1# @@ -507,92 +368,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp /**end repeat1**/ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#elif @VECTOR@ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#endif - -static NPY_INLINE int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; -#if defined __AVX512F__ - const npy_uintp vector_size_bytes = 64; -#elif defined __AVX2__ - const npy_uintp vector_size_bytes = 32; -#else - const npy_uintp vector_size_bytes = 32; -#endif - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else 
if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#elif @VECTOR@ - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, * logical_and, logical_or# @@ -777,7 +552,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ return r; } - /**end repeat**/ /**begin repeat @@ -797,327 +571,6 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) * #double = 0, 1# * #cast = _mm_castps_si128, _mm_castpd_si128# */ - - -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - } -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], 
vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - } -#else - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && - npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - } - else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - - -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) 
- op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - - -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - - -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __AVX512F__ - const npy_intp vector_size_bytes = 64; - const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - -#elif defined __AVX2__ - const npy_intp vector_size_bytes = 32; - const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c 
= @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } -} - -/**end repeat1**/ - /* * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1 * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the @@ -3347,45 +2800,6 @@ AVX512F_log_DOUBLE(npy_double * op, * #IS_FLOAT = 1, 0# */ -/**begin repeat1 - * #func = add, subtract, multiply# - * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = 2*array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1, x2; - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); - - @vtype@ out = @vectorf@_@vsuffix@(x1, x2); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - - ip1 += @num_lanes@; - ip2 += @num_lanes@; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ - /**begin repeat1 * #func = square, conjugate# * #vectorf = avx512_csquare, avx512_conjugate# @@ -3648,86 +3062,5 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ #undef VECTOR_SIZE_BYTES -#else /* NPY_HAVE_SSE2_INTRINSICS */ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #CHK = , _F64# - */ - -#if NPY_SIMD@CHK@ - -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[i]; - } - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); - npyv_store_@sfx@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); - npyv_store_@sfx@(&op[i], c); - } - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[0] @OP@ ip2[i]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[0]; - } - LOOP_BLOCKED(@type@, 
NPY_SIMD_WIDTH) {
-        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
-        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
-        npyv_store_@sfx@(&op[i], v3);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = ip1[i] @OP@ ip2[0];
-    }
-}
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif
+#endif /* NPY_HAVE_SSE2_INTRINSICS */
 #endif

From 81bb563172c84eb1eba92b05fa99d902d28af13b Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Sat, 19 Dec 2020 13:07:40 -0700
Subject: [PATCH 4/4] MAINT: Small style fixes.

---
 numpy/core/code_generators/generate_umath.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index cdd2a84941fa..4e9a2cfec3ef 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -46,7 +46,7 @@ class TypeDescription:
         If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
         instead of PyUFunc_x_x/PyUFunc_xx_x.
     cfunc_alias : str or none, optional
-        appended to inner loop C function name, e.g. FLOAT_{cfunc_alias} (see make_arrays)
+        Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays.
         NOTE: it doesn't support 'astype'
     simd: list
         Available SIMD ufunc loops, dispatched at runtime in specified order
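For reference, a hedged sketch of how the 'dispatch' option documented above is meant to be wired up from the ufunc table in generate_umath.py; the ufunc name, type characters, and dispatch-able source name below are illustrative assumptions rather than part of these patches:

    # Hypothetical defdict entry (names are illustrative): the float/double
    # loops ('fd') come from a dispatch-able source "loops_arithm_fp", i.e.
    # loops_arithm_fp.dispatch.c.src given without its extension, and are
    # selected at runtime as described in the docstring above.  A cfunc_alias
    # would instead rename the FLOAT_/DOUBLE_ inner-loop suffix so another
    # ufunc could reuse the same kernels.
    'add':
        Ufunc(2, 1, Zero,
              docstrings.get('numpy.core.umath.add'),
              'PyUFunc_AdditionTypeResolver',
              TD('fd', dispatch=[('loops_arithm_fp', 'fd')]),
              ),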