diff --git a/doc/source/reference/c-api.iterator.rst b/doc/source/reference/c-api.iterator.rst
index b38c21390b44..b5d00f4be1a9 100644
--- a/doc/source/reference/c-api.iterator.rst
+++ b/doc/source/reference/c-api.iterator.rst
@@ -461,6 +461,33 @@ Construction and Destruction
         Then, call :c:func:`NpyIter_Reset` to allocate and fill the buffers
         with their initial values.
 
+    .. c:var:: NPY_ITER_COPY_IF_OVERLAP
+
+        If a write operand has overlap with a read operand, eliminate all
+        overlap by making temporary copies (with UPDATEIFCOPY for write
+        operands).
+
+        Overlapping means:
+
+        - For a (read, write) pair of operands, there is a memory address
+          that contains data common to both arrays, which can be reached
+          via *different* index/dtype/shape combinations.
+
+        - In particular, unless the arrays have the same shape, dtype,
+          strides, and start address, any common data byte reachable
+          by indexing implies overlap.
+
+        Because exact overlap detection has exponential runtime
+        in the number of dimensions, the decision is made based
+        on heuristics, which have false positives (needless copies in unusual
+        cases) but no false negatives.
+
+        If read/write overlap exists and write operands are modified
+        element-wise in the iterator loop, this flag ensures the result of the
+        operation is the same as if all operands were copied.
+        In cases where copies would need to be made, **the result of the
+        computation may be undefined without this flag!**
+
 Flags that may be passed in ``op_flags[i]``, where ``0 <= i < nop``:
 
     .. c:var:: NPY_ITER_READWRITE
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 41267b797349..5ffb411c6dd4 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -169,6 +169,11 @@
         with one per iteration dimension, to be tracked.
       * "common_dtype" causes all the operands to be converted to
         a common data type, with copying or buffering as necessary.
+      * "copy_if_overlap" causes the iterator to determine whether read
+        operands overlap with write operands (except when the
+        arrays are exactly the same), and to make temporary copies
+        as necessary to avoid any overlap. False positives (needless
+        copying) are possible in some cases.
       * "delay_bufalloc" delays allocation of the buffers until
         a reset() call is made. Allows "allocate" operands to
         be initialized before their values are copied into the buffers.
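For illustration only (not part of the patch): a minimal Python-level sketch of
the behaviour documented above, assuming a NumPy build that already includes
the ``copy_if_overlap`` flag introduced by this change. ``np.nditer``,
``np.shares_memory`` and ``np.arange`` are existing public APIs, and the
assertions mirror the tests added at the end of this diff::

    import numpy as np

    x = np.arange(10)
    a = x[1:]     # read operand
    b = x[:-1]    # write operand, overlaps `a` shifted by one element

    # With 'copy_if_overlap', the overlapping write operand is replaced by a
    # temporary copy, so the iterator's operands no longer alias each other.
    it = np.nditer([a, b], ['copy_if_overlap'],
                   [['readonly'], ['readwrite']])
    assert not np.shares_memory(*it.operands)

    # Operands that are exactly the same view (same data pointer, shape,
    # strides and dtype) are left alone; no copy is made.
    it2 = np.nditer([x, x], ['copy_if_overlap'],
                    [['readonly'], ['readwrite']])
    assert it2.operands[0] is x and it2.operands[1] is x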
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index a9848f43496e..06d351a6af77 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1008,6 +1008,12 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter,
 #define NPY_ITER_DELAY_BUFALLOC             0x00000800
 /* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */
 #define NPY_ITER_DONT_NEGATE_STRIDES        0x00001000
+/*
+ * If output operands overlap with other operands (based on heuristics that
+ * have false positives but no false negatives), make temporary copies to
+ * eliminate overlap.
+ */
+#define NPY_ITER_COPY_IF_OVERLAP            0x00002000
 
 /*** Per-operand flags that may be passed to the iterator constructors ***/
 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 0b055dba460d..c8198c7331d3 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -905,6 +905,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'private', 'templ_common.h.src'),
             join('src', 'umath', 'simd.inc.src'),
             join(codegen_dir, 'generate_ufunc_api.py'),
+            join('src', 'private', 'lowlevel_strided_loops.h'),
             join('src', 'private', 'ufunc_override.h')] + npymath_sources
 
     config.add_extension('umath',
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 3cbbb2b27605..f91e957ccdee 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -17,6 +17,8 @@
 
 #include "arrayobject.h"
 #include "templ_common.h"
 
+#include "mem_overlap.h"
+
 /* Internal helper functions private to this file */
 static int
@@ -2711,6 +2713,73 @@ npyiter_allocate_arrays(NpyIter *iter,
         bufferdata = NIT_BUFFERDATA(iter);
     }
 
+    if (flags & NPY_ITER_COPY_IF_OVERLAP) {
+        /* Perform operand memory overlap checks if requested */
+        for (iop = 0; iop < nop; ++iop) {
+            int may_share_memory = 0;
+            int iother;
+
+            if (op[iop] == NULL) {
+                /* Iterator will always allocate */
+                continue;
+            }
+
+            if (!(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)) {
+                /*
+                 * Copy output operands only, not inputs.
+                 * A more sophisticated heuristic could be
+                 * substituted here later.
+                 */
+                continue;
+            }
+
+            for (iother = 0; iother < nop; ++iother) {
+                if (iother == iop || op[iother] == NULL) {
+                    continue;
+                }
+
+                if (!(op_itflags[iother] & NPY_OP_ITFLAG_READ)) {
+                    /* No data dependence for arrays not read from */
+                    continue;
+                }
+
+                if (op_itflags[iother] & NPY_OP_ITFLAG_FORCECOPY) {
+                    /* Already copied */
+                    continue;
+                }
+
+                /*
+                 * If the arrays are views of exactly the same data, there is
+                 * no need to make copies, because the ufunc inner loops are
+                 * assumed to deal with that case.
+                 */
+                if (PyArray_BYTES(op[iop]) == PyArray_BYTES(op[iother]) &&
+                        PyArray_NDIM(op[iop]) == PyArray_NDIM(op[iother]) &&
+                        PyArray_CompareLists(PyArray_DIMS(op[iop]),
+                                             PyArray_DIMS(op[iother]),
+                                             PyArray_NDIM(op[iop])) &&
+                        PyArray_CompareLists(PyArray_STRIDES(op[iop]),
+                                             PyArray_STRIDES(op[iother]),
+                                             PyArray_NDIM(op[iop])) &&
+                        PyArray_DESCR(op[iop]) == PyArray_DESCR(op[iother])) {
+                    continue;
+                }
+
+                /*
+                 * Use max work = 1. If the arrays are large, it might
+                 * make sense to go further.
+                 */
+                may_share_memory = solve_may_share_memory(
+                    op[iop], op[iother], 1);
+
+                if (may_share_memory) {
+                    op_itflags[iop] |= NPY_OP_ITFLAG_FORCECOPY;
+                    break;
+                }
+            }
+        }
+    }
+
     for (iop = 0; iop < nop; ++iop) {
         /*
          * Check whether there are any WRITEMASKED REDUCE operands
@@ -2800,9 +2869,15 @@ npyiter_allocate_arrays(NpyIter *iter,
                 NBF_STRIDES(bufferdata)[iop] = 0;
             }
         }
-        /* If casting is required and permitted */
-        else if ((op_itflags[iop] & NPY_OP_ITFLAG_CAST) &&
-                     (op_flags[iop] & (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) {
+        /*
+         * Make a temporary copy if:
+         * 1. casting is required and permitted, or
+         * 2. a forced copy was requested.
+         */
+        else if (((op_itflags[iop] & NPY_OP_ITFLAG_CAST) &&
+                  (op_flags[iop] &
+                   (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) ||
+                 (op_itflags[iop] & NPY_OP_ITFLAG_FORCECOPY)) {
             PyArrayObject *temp;
             int ondim = PyArray_NDIM(op[iop]);
 
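For context (not part of the patch): the copy decision above delegates to
NumPy's bounded-effort overlap solver, whose Python-level counterparts are
``np.shares_memory`` (exact) and ``np.may_share_memory`` (cheap bounds check).
A rough sketch of the "false positives, but no false negatives" trade-off that
the ``max work = 1`` comment refers to::

    import numpy as np

    x = np.zeros(8)
    a, b = x[::2], x[1::2]   # interleaved views: bounds overlap, elements do not

    # The cheap, bounded check may report overlap that is not really there...
    assert np.may_share_memory(a, b)       # bounds-only check: false positive
    # ...while the exact (potentially expensive) solver proves there is none.
    assert not np.shares_memory(a, b)

    # Genuine overlap is never missed, whatever the work bound.
    c, d = x[1:], x[:-1]
    assert np.may_share_memory(c, d)
    assert np.shares_memory(c, d)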
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index ae24f46e6e61..7788d327b7bb 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -122,6 +122,8 @@
 #define NPY_OP_ITFLAG_WRITEMASKED  0x0080
 /* The operand's data pointer is pointing into its buffer */
 #define NPY_OP_ITFLAG_USINGBUFFER  0x0100
+/* The operand must be copied (with UPDATEIFCOPY if also ITFLAG_WRITE) */
+#define NPY_OP_ITFLAG_FORCECOPY    0x0200
 
 /*
  * The data layout of the iterator is fully specified by
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index c735e7ad10db..26756dfc1201 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -148,6 +148,11 @@ NpyIter_GlobalFlagsConverter(PyObject *flags_in, npy_uint32 *flags)
                     flag = NPY_ITER_C_INDEX;
                 }
                 break;
+            case 'i':
+                if (strcmp(str, "copy_if_overlap") == 0) {
+                    flag = NPY_ITER_COPY_IF_OVERLAP;
+                }
+                break;
             case 'n':
                 if (strcmp(str, "common_dtype") == 0) {
                     flag = NPY_ITER_COMMON_DTYPE;
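One more illustrative sketch (not part of the patch) of the case the tests
below exercise: a transposed view reads the same memory that a write operand
is about to modify, so ``copy_if_overlap`` copies the write operand, while
purely read-only overlap is left untouched. ``xt`` is just a local name for
the transposed view::

    import numpy as np

    x = np.ones((4, 4))
    xt = x.T    # same memory as `x`, reachable through different strides

    # Read/write overlap: the write operand is replaced by a temporary copy
    # (written back when the iterator is destroyed).
    it = np.nditer([xt, x], ['copy_if_overlap'],
                   [['readonly'], ['readwrite']])
    assert not np.shares_memory(*it.operands)

    # Overlap between read-only operands is harmless, so nothing is copied.
    it2 = np.nditer([x, xt], ['copy_if_overlap'],
                    [['readonly'], ['readonly']])
    assert it2.operands[0] is x and it2.operands[1] is xt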
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index 3b5aaa28d2fb..722c8d1a4dc8 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -1138,6 +1138,91 @@ def test_iter_common_dtype():
     assert_equal(i.dtypes[1], np.dtype('c16'))
     assert_equal(i.dtypes[2], np.dtype('c16'))
 
+def test_iter_copy_if_overlap():
+    # Ensure the iterator makes copies on read/write overlap, if requested
+
+    # Copy not needed, 1 op
+    for flag in ['readonly', 'writeonly', 'readwrite']:
+        a = arange(10)
+        i = nditer([a], ['copy_if_overlap'], [[flag]])
+        assert_(i.operands[0] is a)
+
+    # Copy needed, 2 ops, read-write overlap
+    x = arange(10)
+    a = x[1:]
+    b = x[:-1]
+    i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']])
+    assert_(not np.shares_memory(*i.operands))
+
+    # Copy not needed, 2 ops, exactly same arrays
+    x = arange(10)
+    a = x
+    b = x
+    i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']])
+    assert_(i.operands[0] is a and i.operands[1] is b)
+
+    # Copy not needed, 2 ops, no overlap
+    x = arange(10)
+    a = x[::2]
+    b = x[1::2]
+    i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']])
+    assert_(i.operands[0] is a and i.operands[1] is b)
+
+    # Copy needed, 2 ops, read-write overlap through a differing dtype view
+    x = arange(4, dtype=np.int8)
+    a = x[3:]
+    b = x.view(np.int32)[:1]
+    i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']])
+    assert_(not np.shares_memory(*i.operands))
+
+    # Copy needed, 3 ops, read-write overlap
+    for flag in ['writeonly', 'readwrite']:
+        x = np.ones([10, 10])
+        a = x
+        b = x.T
+        c = x
+        i = nditer([a, b, c], ['copy_if_overlap'],
+                   [['readonly'], ['readonly'], [flag]])
+        a2, b2, c2 = i.operands
+        assert_(not np.shares_memory(a2, c2))
+        assert_(not np.shares_memory(b2, c2))
+
+    # Copy not needed, 3 ops, read-only overlap
+    x = np.ones([10, 10])
+    a = x
+    b = x.T
+    c = x
+    i = nditer([a, b, c], ['copy_if_overlap'],
+               [['readonly'], ['readonly'], ['readonly']])
+    a2, b2, c2 = i.operands
+    assert_(a is a2)
+    assert_(b is b2)
+    assert_(c is c2)
+
+    # Copy not needed, 3 ops, read-only overlap; write operand is disjoint
+    x = np.ones([10, 10])
+    a = x
+    b = np.ones([10, 10])
+    c = x.T
+    i = nditer([a, b, c], ['copy_if_overlap'],
+               [['readonly'], ['writeonly'], ['readonly']])
+    a2, b2, c2 = i.operands
+    assert_(a is a2)
+    assert_(b is b2)
+    assert_(c is c2)
+
+    # Copy not needed, 3 ops, write-only overlap
+    x = np.arange(7)
+    a = x[:3]
+    b = x[3:6]
+    c = x[4:7]
+    i = nditer([a, b, c], ['copy_if_overlap'],
+               [['readonly'], ['writeonly'], ['writeonly']])
+    a2, b2, c2 = i.operands
+    assert_(a is a2)
+    assert_(b is b2)
+    assert_(c is c2)
+
 def test_iter_op_axes():
     # Check that custom axes work