From 0164256c4b3d3143c369e29530f4ece027d049b3 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 19 Feb 2020 11:46:14 -0800
Subject: [PATCH 1/4] TST: Adding test to validate np.maximum.accumulate and
 np.minimum.accumulate

---
 numpy/core/tests/test_umath.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index d1d4467d6d77..233a0b1d6ee1 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -3157,6 +3157,14 @@ def test_rint_big_int():
     # Rint should not change the value
     assert_equal(val, np.rint(val))
 
+@pytest.mark.parametrize('ftype', [np.float32, np.float64])
+def test_memoverlap_accumulate(ftype):
+    # Reproduces bug https://github.com/numpy/numpy/issues/15597
+    arr = np.array([0.61, 0.60, 0.77, 0.41, 0.19], dtype=ftype)
+    out_max = np.array([0.61, 0.61, 0.77, 0.77, 0.77], dtype=ftype)
+    out_min = np.array([0.61, 0.60, 0.60, 0.41, 0.19], dtype=ftype)
+    assert_equal(np.maximum.accumulate(arr), out_max)
+    assert_equal(np.minimum.accumulate(arr), out_min)
 
 def test_signaling_nan_exceptions():
     with assert_no_warnings():

From 085cdbe5e7755ed40684a1cb5b1f6de743fa7263 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 19 Feb 2020 13:52:20 -0800
Subject: [PATCH 2/4] BUG: Check for memory overlap in AVX-512F implementation
 of np.maximum and np.minimum

Fixes bug in np.maximum.accumulate and np.minimum.accumulate
See https://github.com/numpy/numpy/issues/15597
---
 numpy/core/src/umath/simd.inc.src | 41 ++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8db0f6ee6c3c..61321445f078 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,37 @@ abs_ptrdiff(char *a, char *b)
     return (a > b) ? (a - b) : (b - a);
 }
 
+/*
+ * nomemoverlap - returns true if two strided arrays do NOT share an
+ * overlapping region in memory. ip_size/op_size = signed byte extent of each
+ * array (step * count); a negative value indicates negative steps.
+ */
+static NPY_INLINE npy_bool
+nomemoverlap(char *ip,
+             npy_intp ip_size,
+             char *op,
+             npy_intp op_size)
+{
+    char *ip_start, *ip_end, *op_start, *op_end;
+    if (ip_size < 0) {
+        ip_start = ip + ip_size;
+        ip_end = ip;
+    }
+    else {
+        ip_start = ip;
+        ip_end = ip + ip_size;
+    }
+    if (op_size < 0) {
+        op_start = op + op_size;
+        op_end = op;
+    }
+    else {
+        op_start = op;
+        op_end = op + op_size;
+    }
+    return (ip_start > op_end) | (op_start > ip_end);
+}
+
 #define IS_BINARY_STRIDE_ONE(esize, vsize) \
     ((steps[0] == esize) && \
      (steps[1] == esize) && \
@@ -85,10 +116,12 @@ abs_ptrdiff(char *a, char *b)
  * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
  * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
  */
-#define IS_BINARY_SMALL_STEPS \
+#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
     ((abs(steps[0]) < MAX_STEP_SIZE)  && \
      (abs(steps[1]) < MAX_STEP_SIZE)  && \
-     (abs(steps[2]) < MAX_STEP_SIZE))
+     (abs(steps[2]) < MAX_STEP_SIZE)  && \
+     (nomemoverlap(args[0], steps[0]*dimensions[0], args[2], steps[2]*dimensions[0])) && \
+     (nomemoverlap(args[1], steps[1]*dimensions[0], args[2], steps[2]*dimensions[0])))
 
 /*
  * output should be contiguous, can handle strided input data
@@ -252,7 +285,7 @@ static NPY_INLINE int
 run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_BINARY_SMALL_STEPS) {
+    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
         AVX512F_@func@_@TYPE@(args, dimensions, steps);
         return 1;
     }
@@ -1942,7 +1975,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
      * will fit in an int32 as a precondition for this function via
-     * IS_BINARY_SMALL_STEPS
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
      */
 
     npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];

From 735fb999059d809007217b4e84321a63c63c8406 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Fri, 21 Feb 2020 11:42:22 -0800
Subject: [PATCH 3/4] BUG: Update IS_OUTPUT_BLOCKABLE_UNARY to use the
 nomemoverlap check

abs_ptrdiff(args[1], args[0]) >= (vsize) does not accommodate strides,
especially when the strides are negative.
---
 numpy/core/src/umath/simd.inc.src | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 61321445f078..137fdaa71ae6 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -129,8 +129,7 @@ nomemoverlap(char *ip,
  */
 #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
     (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
-     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
-      ((abs_ptrdiff(args[1], args[0]) == 0))))
+     (nomemoverlap(args[1], steps[1]*dimensions[0], args[0], steps[0]*dimensions[0])))
 
 #define IS_BLOCKABLE_REDUCE(esize, vsize) \
     (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \

From 629f980d5287e281e263ca4029f6cefab2e18136 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Sat, 22 Feb 2020 12:57:23 -0800
Subject: [PATCH 4/4] MAINT: Improve formatting and update comments

---
 numpy/core/src/umath/simd.inc.src | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 137fdaa71ae6..4265476b5c3a 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -114,22 +114,25 @@ nomemoverlap(char *ip,
  *    cross page boundaries.
  *
  * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
+ * ensures this. The condition also requires that the input and output arrays
+ * should have no overlap in memory.
  */
 #define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
     ((abs(steps[0]) < MAX_STEP_SIZE)  && \
      (abs(steps[1]) < MAX_STEP_SIZE)  && \
      (abs(steps[2]) < MAX_STEP_SIZE)  && \
-     (nomemoverlap(args[0], steps[0]*dimensions[0], args[2], steps[2]*dimensions[0])) && \
-     (nomemoverlap(args[1], steps[1]*dimensions[0], args[2], steps[2]*dimensions[0])))
+     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+     (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
 
 /*
- * output should be contiguous, can handle strided input data
- * Input step should be smaller than MAX_STEP_SIZE for performance
+ * 1) Output should be contiguous, can handle strided input data
+ * 2) Input step should be smaller than MAX_STEP_SIZE for performance
+ * 3) Input and output arrays should have no overlap in memory
  */
 #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
     (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
-     (nomemoverlap(args[1], steps[1]*dimensions[0], args[0], steps[0]*dimensions[0])))
+     (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
 
 #define IS_BLOCKABLE_REDUCE(esize, vsize) \
     (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \