From 06fe5592a86b801ab0b18b81b217e456e4ce18db Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 6 Sep 2016 00:08:58 +0200 Subject: [PATCH 1/2] MAINT: core: add a missing compilation dependency to setup.py --- numpy/core/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 0b055dba460d..c8198c7331d3 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -905,6 +905,7 @@ def generate_umath_c(ext, build_dir): join('src', 'private', 'templ_common.h.src'), join('src', 'umath', 'simd.inc.src'), join(codegen_dir, 'generate_ufunc_api.py'), + join('src', 'private', 'lowlevel_strided_loops.h'), join('src', 'private', 'ufunc_override.h')] + npymath_sources config.add_extension('umath', From a0630407d54395190ceeb8bc32fb09d8f778831e Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 5 Sep 2016 20:41:56 +0200 Subject: [PATCH 2/2] ENH: NpyIter: add a flag to handle read/write operand overlap Add a new NPY_ITER_COPY_IF_OVERLAP iterator flag to NpyIter, which instructs it to check if read operands overlap with write operands in memory, and make temporary copies to eliminate detected overlap. Thanks to Sebastian Berg. --- doc/source/reference/c-api.iterator.rst | 27 +++++++ numpy/add_newdocs.py | 5 ++ numpy/core/include/numpy/ndarraytypes.h | 6 ++ numpy/core/src/multiarray/nditer_constr.c | 81 ++++++++++++++++++++- numpy/core/src/multiarray/nditer_impl.h | 2 + numpy/core/src/multiarray/nditer_pywrap.c | 5 ++ numpy/core/tests/test_nditer.py | 85 +++++++++++++++++++++++ 7 files changed, 208 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/c-api.iterator.rst b/doc/source/reference/c-api.iterator.rst index b38c21390b44..b5d00f4be1a9 100644 --- a/doc/source/reference/c-api.iterator.rst +++ b/doc/source/reference/c-api.iterator.rst @@ -461,6 +461,33 @@ Construction and Destruction Then, call :c:func:`NpyIter_Reset` to allocate and fill the buffers with their initial values. + .. 
c:var:: NPY_ITER_COPY_IF_OVERLAP + + If a write operand has overlap with a read operand, eliminate all + overlap by making temporary copies (with UPDATEIFCOPY for write + operands). + + Overlapping means: + + - For a (read, write) pair of operands, there is a memory address + that contains data common to both arrays, which can be reached + via *different* index/dtype/shape combinations. + + - In particular, unless the arrays have the same shape, dtype, + strides, and start address, any shared common data byte accessible + by indexing implies overlap. + + Because exact overlap detection has exponential runtime + in the number of dimensions, the decision is made based + on heuristics, which have false positives (needless copies in unusual + cases) but have no false negatives. + + If read/write overlap exists and write operands are modified in the + iterator loop element-wise, this flag ensures the result of the + operation is the same as if all operands were copied. + In cases where copies would need to be made, **the result of the + computation may be undefined without this flag!** + Flags that may be passed in ``op_flags[i]``, where ``0 <= i < nop``: .. c:var:: NPY_ITER_READWRITE diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py index 41267b797349..5ffb411c6dd4 100644 --- a/numpy/add_newdocs.py +++ b/numpy/add_newdocs.py @@ -169,6 +169,11 @@ with one per iteration dimension, to be tracked. * "common_dtype" causes all the operands to be converted to a common data type, with copying or buffering as necessary. + * "copy_if_overlap" causes the iterator to determine if read + operands have overlap with write operands (except if + the arrays are exactly the same), and make temporary copies + as necessary to avoid overlap. False positives (needless + copying) are possible in some cases. * "delay_bufalloc" delays allocation of the buffers until a reset() call is made. Allows "allocate" operands to be initialized before their values are copied into the buffers. 
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index a9848f43496e..06d351a6af77 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -1008,6 +1008,12 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter, #define NPY_ITER_DELAY_BUFALLOC 0x00000800 /* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */ #define NPY_ITER_DONT_NEGATE_STRIDES 0x00001000 +/* + * If output operands overlap with other operands (based on heuristics that + * have false positives but no false negatives), make temporary copies to + * eliminate overlap. + */ +#define NPY_ITER_COPY_IF_OVERLAP 0x00002000 /*** Per-operand flags that may be passed to the iterator constructors ***/ diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index 3cbbb2b27605..f91e957ccdee 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -17,6 +17,8 @@ #include "arrayobject.h" #include "templ_common.h" +#include "mem_overlap.h" + /* Internal helper functions private to this file */ static int @@ -2711,6 +2713,73 @@ npyiter_allocate_arrays(NpyIter *iter, bufferdata = NIT_BUFFERDATA(iter); } + if (flags & NPY_ITER_COPY_IF_OVERLAP) { + /* Perform operand memory overlap checks if requested */ + for (iop = 0; iop < nop; ++iop) { + int may_share_memory = 0; + int iother; + + if (op[iop] == NULL) { + /* Iterator will always allocate */ + continue; + } + + if (!(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)) { + /* + * Copy output operands only, not inputs. + * A more sophisticated heuristic could be + * substituted here later. 
+ */ + continue; + } + + for (iother = 0; iother < nop; ++iother) { + if (iother == iop || op[iother] == NULL) { + continue; + } + + if (!(op_itflags[iother] & NPY_OP_ITFLAG_READ)) { + /* No data dependence for arrays not read from */ + continue; + } + + if (op_itflags[iother] & NPY_OP_ITFLAG_FORCECOPY) { + /* Already copied */ + continue; + } + + /* + * If the arrays are views to exactly the same data, no need + * to make copies, because ufunc inner loops are assumed to + * deal with that + */ + if (PyArray_BYTES(op[iop]) == PyArray_BYTES(op[iother]) && + PyArray_NDIM(op[iop]) == PyArray_NDIM(op[iother]) && + PyArray_CompareLists(PyArray_DIMS(op[iop]), + PyArray_DIMS(op[iother]), + PyArray_NDIM(op[iop])) && + PyArray_CompareLists(PyArray_STRIDES(op[iop]), + PyArray_STRIDES(op[iother]), + PyArray_NDIM(op[iop])) && + PyArray_DESCR(op[iop]) == PyArray_DESCR(op[iother])) { + continue; + } + + /* + * Use max work = 1. If the arrays are large, it might + * make sense to go further. + */ + may_share_memory = solve_may_share_memory( + op[iop], op[iother], 1); + + if (may_share_memory) { + op_itflags[iop] |= NPY_OP_ITFLAG_FORCECOPY; + break; + } + } + } + } + for (iop = 0; iop < nop; ++iop) { /* * Check whether there are any WRITEMASKED REDUCE operands @@ -2800,9 +2869,15 @@ npyiter_allocate_arrays(NpyIter *iter, NBF_STRIDES(bufferdata)[iop] = 0; } } - /* If casting is required and permitted */ - else if ((op_itflags[iop] & NPY_OP_ITFLAG_CAST) && - (op_flags[iop] & (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) { + /* + * Make a temporary copy if, + * 1. If casting is required and permitted, or, + * 2. 
If force-copy is requested + */ + else if (((op_itflags[iop] & NPY_OP_ITFLAG_CAST) && + (op_flags[iop] & + (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) || + (op_itflags[iop] & NPY_OP_ITFLAG_FORCECOPY)) { PyArrayObject *temp; int ondim = PyArray_NDIM(op[iop]); diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h index ae24f46e6e61..7788d327b7bb 100644 --- a/numpy/core/src/multiarray/nditer_impl.h +++ b/numpy/core/src/multiarray/nditer_impl.h @@ -122,6 +122,8 @@ #define NPY_OP_ITFLAG_WRITEMASKED 0x0080 /* The operand's data pointer is pointing into its buffer */ #define NPY_OP_ITFLAG_USINGBUFFER 0x0100 +/* The operand must be copied (with UPDATEIFCOPY if also ITFLAG_WRITE) */ +#define NPY_OP_ITFLAG_FORCECOPY 0x0200 /* * The data layout of the iterator is fully specified by diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index c735e7ad10db..26756dfc1201 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -148,6 +148,11 @@ NpyIter_GlobalFlagsConverter(PyObject *flags_in, npy_uint32 *flags) flag = NPY_ITER_C_INDEX; } break; + case 'i': + if (strcmp(str, "copy_if_overlap") == 0) { + flag = NPY_ITER_COPY_IF_OVERLAP; + } + break; case 'n': if (strcmp(str, "common_dtype") == 0) { flag = NPY_ITER_COMMON_DTYPE; diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py index 3b5aaa28d2fb..722c8d1a4dc8 100644 --- a/numpy/core/tests/test_nditer.py +++ b/numpy/core/tests/test_nditer.py @@ -1138,6 +1138,91 @@ def test_iter_common_dtype(): assert_equal(i.dtypes[1], np.dtype('c16')) assert_equal(i.dtypes[2], np.dtype('c16')) +def test_iter_copy_if_overlap(): + # Ensure the iterator makes copies on read/write overlap, if requested + + # Copy not needed, 1 op + for flag in ['readonly', 'writeonly', 'readwrite']: + a = arange(10) + i = nditer([a], ['copy_if_overlap'], [[flag]]) + assert_(i.operands[0] is a) + + # Copy needed, 
2 ops, read-write overlap + x = arange(10) + a = x[1:] + b = x[:-1] + i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']]) + assert_(not np.shares_memory(*i.operands)) + + # Copy not needed, 2 ops, exactly same arrays + x = arange(10) + a = x + b = x + i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']]) + assert_(i.operands[0] is a and i.operands[1] is b) + + # Copy not needed, 2 ops, no overlap + x = arange(10) + a = x[::2] + b = x[1::2] + i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']]) + assert_(i.operands[0] is a and i.operands[1] is b) + + # Copy needed, 2 ops, read-write overlap + x = arange(4, dtype=np.int8) + a = x[3:] + b = x.view(np.int32)[:1] + i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']]) + assert_(not np.shares_memory(*i.operands)) + + # Copy needed, 3 ops, read-write overlap + for flag in ['writeonly', 'readwrite']: + x = np.ones([10, 10]) + a = x + b = x.T + c = x + i = nditer([a, b, c], ['copy_if_overlap'], + [['readonly'], ['readonly'], [flag]]) + a2, b2, c2 = i.operands + assert_(not np.shares_memory(a2, c2)) + assert_(not np.shares_memory(b2, c2)) + + # Copy not needed, 3 ops, read-only overlap + x = np.ones([10, 10]) + a = x + b = x.T + c = x + i = nditer([a, b, c], ['copy_if_overlap'], + [['readonly'], ['readonly'], ['readonly']]) + a2, b2, c2 = i.operands + assert_(a is a2) + assert_(b is b2) + assert_(c is c2) + + # Copy not needed, 3 ops, read-only overlap + x = np.ones([10, 10]) + a = x + b = np.ones([10, 10]) + c = x.T + i = nditer([a, b, c], ['copy_if_overlap'], + [['readonly'], ['writeonly'], ['readonly']]) + a2, b2, c2 = i.operands + assert_(a is a2) + assert_(b is b2) + assert_(c is c2) + + # Copy not needed, 3 ops, write-only overlap + x = np.arange(7) + a = x[:3] + b = x[3:6] + c = x[4:7] + i = nditer([a, b, c], ['copy_if_overlap'], + [['readonly'], ['writeonly'], ['writeonly']]) + a2, b2, c2 = i.operands + assert_(a is a2) + assert_(b is b2) + 
assert_(c is c2) + def test_iter_op_axes(): # Check that custom axes work