From 9d08632f7f1e641ffde2dc21d245ca9d6bf8d3e2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 16 May 2023 10:46:25 -0600 Subject: [PATCH 1/7] BENCH: add benchmark for where slow path --- benchmarks/benchmarks/bench_function_base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py index cc37bef3994b..1655868b3b55 100644 --- a/benchmarks/benchmarks/bench_function_base.py +++ b/benchmarks/benchmarks/bench_function_base.py @@ -308,7 +308,9 @@ def time_sort_worst(self): class Where(Benchmark): def setup(self): self.d = np.arange(20000) + self.d_o = self.d.astype(object) self.e = self.d.copy() + self.e_o = self.d_o.copy() self.cond = (self.d > 5000) size = 1024 * 1024 // 8 rnd_array = np.random.rand(size) @@ -332,6 +334,11 @@ def time_1(self): def time_2(self): np.where(self.cond, self.d, self.e) + def time_2_object(self): + # object and byteswapped arrays have a + # special slow path in the where internals + np.where(self.cond, self.d_o, self.e_o) + def time_2_broadcast(self): np.where(self.cond, self.d, 0) From f8b1a3e4a794352b7f608782b9ad818490f0cc11 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 16 May 2023 10:50:39 -0600 Subject: [PATCH 2/7] MAINT: do not use copyswap in where internals --- numpy/core/src/multiarray/multiarraymodule.c | 74 ++++++++++++++++---- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index d7d19493b754..bbd9b5089a81 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -67,6 +67,8 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "mem_overlap.h" #include "typeinfo.h" #include "convert.h" /* for PyArray_AssignZero */ +#include "lowlevel_strided_loops.h" +#include "dtype_transfer.h" #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ @@ -3381,6 +3383,9 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) return NULL; } + NPY_cast_info x_cast_info = {.func = NULL}; + NPY_cast_info y_cast_info = {.func = NULL}; + ax = (PyArrayObject*)PyArray_FROM_O(x); ay = (PyArrayObject*)PyArray_FROM_O(y); if (ax == NULL || ay == NULL) { @@ -3423,6 +3428,43 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) /* Get the result from the iterator object array */ ret = (PyObject*)NpyIter_GetOperandArray(iter)[0]; + PyArray_Descr **dts = NpyIter_GetDescrArray(iter); + PyArray_Descr *dtx = dts[2]; + PyArray_Descr *dty = dts[3]; + npy_intp itemsize = dts[0]->elsize; + + npy_intp *strides = NpyIter_GetInnerStrideArray(iter); + npy_intp cstride = strides[1]; + npy_intp xstride = strides[2]; + npy_intp ystride = strides[3]; + + int axswap = PyDataType_ISBYTESWAPPED(dtx); + int ayswap = PyDataType_ISBYTESWAPPED(dty); + int native = (axswap == ayswap) && (axswap == 0) && !needs_api; + + NPY_ARRAYMETHOD_FLAGS x_transfer_flags; + NPY_ARRAYMETHOD_FLAGS y_transfer_flags; + + int x_is_aligned = IsAligned(ax); + int y_is_aligned = IsAligned(ay); + + npy_intp xstrides[2] = {xstride, itemsize}; + npy_intp ystrides[2] = {xstride, itemsize}; + + npy_intp one = 1; + + if (PyArray_GetDTypeTransferFunction( + x_is_aligned, xstrides[0], xstrides[1], dtx, common_dt, 0, + &x_cast_info, &x_transfer_flags) != NPY_SUCCEED) { + goto fail; + } + + if (PyArray_GetDTypeTransferFunction( + y_is_aligned, ystrides[0], ystrides[1], dty, common_dt, 0, + &y_cast_info, &y_transfer_flags) != NPY_SUCCEED) { + goto fail; + } + NPY_BEGIN_THREADS_NDITER(iter); if (NpyIter_GetIterSize(iter) != 0) { @@ -3431,18 +3473,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) char **dataptrarray = NpyIter_GetDataPtrArray(iter); do { - PyArray_Descr * dtx = NpyIter_GetDescrArray(iter)[2]; - PyArray_Descr * dty = NpyIter_GetDescrArray(iter)[3]; - int axswap = PyDataType_ISBYTESWAPPED(dtx); - int ayswap = PyDataType_ISBYTESWAPPED(dty); - PyArray_CopySwapFunc *copyswapx = dtx->f->copyswap; - PyArray_CopySwapFunc *copyswapy = dty->f->copyswap; - int native = (axswap == ayswap) && (axswap == 0) && !needs_api; npy_intp n = (*innersizeptr); - npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize; - npy_intp cstride = NpyIter_GetInnerStrideArray(iter)[1]; - npy_intp xstride = NpyIter_GetInnerStrideArray(iter)[2]; - npy_intp ystride = NpyIter_GetInnerStrideArray(iter)[3]; char * dst = dataptrarray[0]; char * csrc = dataptrarray[1]; char * xsrc = dataptrarray[2]; @@ -3465,14 +3496,25 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) INNER_WHERE_LOOP(1); } else { - /* copyswap is faster than memcpy even if we are native */ npy_intp i; for (i = 0; i < n; i++) { if (*csrc) { - copyswapx(dst, xsrc, axswap, ret); + char *args[2] = {xsrc, dst}; + + if (x_cast_info.func( + &x_cast_info.context, args, &one, strides, + x_cast_info.auxdata) < 0) { + goto fail; + } } else { - copyswapy(dst, ysrc, ayswap, ret); + char *args[2] = {ysrc, dst}; + + if (y_cast_info.func( + &y_cast_info.context, args, &one, strides, + y_cast_info.auxdata) < 0) { + goto fail; + } } dst += itemsize; xsrc += xstride; @@ -3489,6 +3531,8 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_DECREF(arr); Py_DECREF(ax); Py_DECREF(ay); + NPY_cast_info_xfree(&x_cast_info); + NPY_cast_info_xfree(&y_cast_info); if (NpyIter_Deallocate(iter) != NPY_SUCCEED) { Py_DECREF(ret); @@ -3502,6 +3546,8 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_DECREF(arr); Py_XDECREF(ax); Py_XDECREF(ay); + NPY_cast_info_xfree(&x_cast_info); + NPY_cast_info_xfree(&y_cast_info); return NULL; } From 46cf47d1e23069c359f84f2f26db2ab075f45a8b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 16 May 2023 11:34:39 -0600 Subject: [PATCH 3/7] MAINT: attempt to speed up optimized path --- numpy/core/src/multiarray/multiarraymodule.c | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index bbd9b5089a81..a1dd1a5a4ef4 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -3453,16 +3453,19 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) npy_intp one = 1; - if (PyArray_GetDTypeTransferFunction( - x_is_aligned, xstrides[0], xstrides[1], dtx, common_dt, 0, - &x_cast_info, &x_transfer_flags) != NPY_SUCCEED) { - goto fail; - } + if (!native || ((itemsize != 16) && (itemsize != 8) && (itemsize != 4) && + (itemsize != 2) && (itemsize != 1))) { + if (PyArray_GetDTypeTransferFunction( + x_is_aligned, xstrides[0], xstrides[1], dtx, common_dt, 0, + &x_cast_info, &x_transfer_flags) != NPY_SUCCEED) { + goto fail; + } - if (PyArray_GetDTypeTransferFunction( - y_is_aligned, ystrides[0], ystrides[1], dty, common_dt, 0, - &y_cast_info, &y_transfer_flags) != NPY_SUCCEED) { - goto fail; + if (PyArray_GetDTypeTransferFunction( + y_is_aligned, ystrides[0], ystrides[1], dty, common_dt, 0, + &y_cast_info, &y_transfer_flags) != NPY_SUCCEED) { + goto fail; + } } NPY_BEGIN_THREADS_NDITER(iter); From efa004bb7af6a3908b6be37d20f448f4c2ee7357 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 16 May 2023 14:53:32 -0600 Subject: [PATCH 4/7] MAINT: use NPY_ITER_ALIGNED flag to simplify cast setup --- numpy/core/src/multiarray/multiarraymodule.c | 58 ++++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index a1dd1a5a4ef4..10938c70d994 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -49,6 +49,8 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "convert_datatype.h" #include "conversion_utils.h" #include "nditer_pywrap.h" +#define NPY_ITERATOR_IMPLEMENTATION_CODE +#include "nditer_impl.h" #include "methods.h" #include "_datetime.h" #include "datetime_strings.h" @@ -3383,8 +3385,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) return NULL; } - NPY_cast_info x_cast_info = {.func = NULL}; - NPY_cast_info y_cast_info = {.func = NULL}; + NPY_cast_info cast_info = {.func = NULL}; ax = (PyArrayObject*)PyArray_FROM_O(x); ay = (PyArrayObject*)PyArray_FROM_O(y); @@ -3399,7 +3400,9 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) }; npy_uint32 op_flags[4] = { NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE | NPY_ITER_NO_SUBTYPE, - NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_READONLY + NPY_ITER_READONLY, + NPY_ITER_READONLY | NPY_ITER_ALIGNED, + NPY_ITER_READONLY | NPY_ITER_ALIGNED }; PyArray_Descr * common_dt = PyArray_ResultType(2, &op_in[0] + 2, 0, NULL); @@ -3442,33 +3445,32 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) int ayswap = PyDataType_ISBYTESWAPPED(dty); int native = (axswap == ayswap) && (axswap == 0) && !needs_api; - NPY_ARRAYMETHOD_FLAGS x_transfer_flags; - NPY_ARRAYMETHOD_FLAGS y_transfer_flags; + NPY_ARRAYMETHOD_FLAGS transfer_flags = 0; - int x_is_aligned = IsAligned(ax); - int y_is_aligned = IsAligned(ay); - - npy_intp xstrides[2] = {xstride, itemsize}; - npy_intp ystrides[2] = {xstride, itemsize}; + npy_intp transfer_strides[2] = {xstride, itemsize}; npy_intp one = 1; if (!native || ((itemsize != 16) && (itemsize != 8) && (itemsize != 4) && (itemsize != 2) && (itemsize != 1))) { + // The iterator has NPY_ITER_ALIGNED flag so no need to check alignment + // of the input arrays. + // + // There's also no need to set up a cast for y, since the iterator + // ensures both casts are identical. if (PyArray_GetDTypeTransferFunction( - x_is_aligned, xstrides[0], xstrides[1], dtx, common_dt, 0, - &x_cast_info, &x_transfer_flags) != NPY_SUCCEED) { - goto fail; - } - - if (PyArray_GetDTypeTransferFunction( - y_is_aligned, ystrides[0], ystrides[1], dty, common_dt, 0, - &y_cast_info, &y_transfer_flags) != NPY_SUCCEED) { + 1, xstride, itemsize, dtx, common_dt, 0, + &cast_info, &transfer_flags) != NPY_SUCCEED) { goto fail; } } - NPY_BEGIN_THREADS_NDITER(iter); + transfer_flags = PyArrayMethod_COMBINED_FLAGS( + transfer_flags, NpyIter_GetTransferFlags(iter)); + + if (!(transfer_flags & NPY_METH_REQUIRES_PYAPI)) { + NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter)); + } if (NpyIter_GetIterSize(iter) != 0) { NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); @@ -3504,18 +3506,18 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) if (*csrc) { char *args[2] = {xsrc, dst}; - if (x_cast_info.func( - &x_cast_info.context, args, &one, strides, - x_cast_info.auxdata) < 0) { + if (cast_info.func( + &cast_info.context, args, &one, + transfer_strides, cast_info.auxdata) < 0) { goto fail; } } else { char *args[2] = {ysrc, dst}; - if (y_cast_info.func( - &y_cast_info.context, args, &one, strides, - y_cast_info.auxdata) < 0) { + if (cast_info.func( + &cast_info.context, args, &one, + transfer_strides, cast_info.auxdata) < 0) { goto fail; } } @@ -3534,8 +3536,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_DECREF(arr); Py_DECREF(ax); Py_DECREF(ay); - NPY_cast_info_xfree(&x_cast_info); - NPY_cast_info_xfree(&y_cast_info); + NPY_cast_info_xfree(&cast_info); if (NpyIter_Deallocate(iter) != NPY_SUCCEED) { Py_DECREF(ret); @@ -3549,8 +3550,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_DECREF(arr); Py_XDECREF(ax); Py_XDECREF(ay); - NPY_cast_info_xfree(&x_cast_info); - NPY_cast_info_xfree(&y_cast_info); + NPY_cast_info_xfree(&cast_info); return NULL; } From 20463eadc93bd0f8e063cb2289c1a102bd2a6b94 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 17 May 2023 09:24:47 -0600 Subject: [PATCH 5/7] MAINT: simplify where cast setup further --- numpy/core/src/multiarray/multiarraymodule.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 10938c70d994..f8403acc5d2f 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -3431,23 +3431,19 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) /* Get the result from the iterator object array */ ret = (PyObject*)NpyIter_GetOperandArray(iter)[0]; - PyArray_Descr **dts = NpyIter_GetDescrArray(iter); - PyArray_Descr *dtx = dts[2]; - PyArray_Descr *dty = dts[3]; - npy_intp itemsize = dts[0]->elsize; + npy_intp itemsize = common_dt->elsize; npy_intp *strides = NpyIter_GetInnerStrideArray(iter); npy_intp cstride = strides[1]; npy_intp xstride = strides[2]; npy_intp ystride = strides[3]; - int axswap = PyDataType_ISBYTESWAPPED(dtx); - int ayswap = PyDataType_ISBYTESWAPPED(dty); - int native = (axswap == ayswap) && (axswap == 0) && !needs_api; + int swap = PyDataType_ISBYTESWAPPED(common_dt); + int native = (swap == 0) && !needs_api; NPY_ARRAYMETHOD_FLAGS transfer_flags = 0; - npy_intp transfer_strides[2] = {xstride, itemsize}; + npy_intp transfer_strides[2] = {itemsize, itemsize}; npy_intp one = 1; @@ -3459,7 +3455,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) // There's also no need to set up a cast for y, since the iterator // ensures both casts are identical. if (PyArray_GetDTypeTransferFunction( - 1, xstride, itemsize, dtx, common_dt, 0, + 1, xstride, itemsize, common_dt, common_dt, 0, &cast_info, &transfer_flags) != NPY_SUCCEED) { goto fail; } From 2db272b5918606dca5a897b5c8bd21ea540a7b15 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 17 May 2023 09:59:29 -0600 Subject: [PATCH 6/7] move stride unpacking inside the iteration loop --- numpy/core/src/multiarray/multiarraymodule.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index f8403acc5d2f..8c830cd51fe3 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -3433,11 +3433,6 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) npy_intp itemsize = common_dt->elsize; - npy_intp *strides = NpyIter_GetInnerStrideArray(iter); - npy_intp cstride = strides[1]; - npy_intp xstride = strides[2]; - npy_intp ystride = strides[3]; - int swap = PyDataType_ISBYTESWAPPED(common_dt); int native = (swap == 0) && !needs_api; @@ -3455,7 +3450,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) // There's also no need to set up a cast for y, since the iterator // ensures both casts are identical. if (PyArray_GetDTypeTransferFunction( - 1, xstride, itemsize, common_dt, common_dt, 0, + 1, itemsize, itemsize, common_dt, common_dt, 0, &cast_info, &transfer_flags) != NPY_SUCCEED) { goto fail; } @@ -3472,6 +3467,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); npy_intp * innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); char **dataptrarray = NpyIter_GetDataPtrArray(iter); + npy_intp *strides = NpyIter_GetInnerStrideArray(iter); do { npy_intp n = (*innersizeptr); @@ -3480,6 +3476,12 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) char * xsrc = dataptrarray[2]; char * ysrc = dataptrarray[3]; + // the iterator might mutate these pointers, + // so need to update them every iteration + npy_intp cstride = strides[1]; + npy_intp xstride = strides[2]; + npy_intp ystride = strides[3]; + /* constant sizes so compiler replaces memcpy */ if (native && itemsize == 16) { INNER_WHERE_LOOP(16); From 01a251ba0d014ebf7f748b1766c5890e1caa8831 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 17 May 2023 21:13:17 +0200 Subject: [PATCH 7/7] MAINT: Use has_ref for trivial copy decision and remove unused needs_api The dtype is forced to be the same, swapping is impossible but even if it was possible, it wouldn't matter here. --- numpy/core/src/multiarray/multiarraymodule.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 8c830cd51fe3..0fa1af11d0ab 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -3409,7 +3409,6 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) PyArray_Descr * op_dt[4] = {common_dt, PyArray_DescrFromType(NPY_BOOL), common_dt, common_dt}; NpyIter * iter; - int needs_api; NPY_BEGIN_THREADS_DEF; if (common_dt == NULL || op_dt[1] == NULL) { @@ -3426,23 +3425,19 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) goto fail; } - needs_api = NpyIter_IterationNeedsAPI(iter); - /* Get the result from the iterator object array */ ret = (PyObject*)NpyIter_GetOperandArray(iter)[0]; npy_intp itemsize = common_dt->elsize; - int swap = PyDataType_ISBYTESWAPPED(common_dt); - int native = (swap == 0) && !needs_api; + int has_ref = PyDataType_REFCHK(common_dt); NPY_ARRAYMETHOD_FLAGS transfer_flags = 0; npy_intp transfer_strides[2] = {itemsize, itemsize}; - npy_intp one = 1; - if (!native || ((itemsize != 16) && (itemsize != 8) && (itemsize != 4) && + if (has_ref || ((itemsize != 16) && (itemsize != 8) && (itemsize != 4) && (itemsize != 2) && (itemsize != 1))) { // The iterator has NPY_ITER_ALIGNED flag so no need to check alignment // of the input arrays. @@ -3483,19 +3478,19 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) npy_intp ystride = strides[3]; /* constant sizes so compiler replaces memcpy */ - if (native && itemsize == 16) { + if (!has_ref && itemsize == 16) { INNER_WHERE_LOOP(16); } - else if (native && itemsize == 8) { + else if (!has_ref && itemsize == 8) { INNER_WHERE_LOOP(8); } - else if (native && itemsize == 4) { + else if (!has_ref && itemsize == 4) { INNER_WHERE_LOOP(4); } - else if (native && itemsize == 2) { + else if (!has_ref && itemsize == 2) { INNER_WHERE_LOOP(2); } - else if (native && itemsize == 1) { + else if (!has_ref && itemsize == 1) { INNER_WHERE_LOOP(1); } else {