From 2ebabb78f314e6ed706499e81099132921d32dea Mon Sep 17 00:00:00 2001
From: mattip <matti.picus@gmail.com>
Date: Wed, 11 Jun 2025 19:43:52 +0300
Subject: [PATCH 1/3] BUG: fix matmul with transposed out arg

---
 numpy/_core/src/umath/matmul.c.src   | 2 +-
 numpy/_core/tests/test_multiarray.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/numpy/_core/src/umath/matmul.c.src b/numpy/_core/src/umath/matmul.c.src
index d9be7b1d6826..02c4fde56bf2 100644
--- a/numpy/_core/src/umath/matmul.c.src
+++ b/numpy/_core/src/umath/matmul.c.src
@@ -596,7 +596,7 @@ NPY_NO_EXPORT void
              * Use transpose equivalence:
              * matmul(a, b, o) == matmul(b.T, a.T, o.T)
              */
-            if (o_f_blasable) {
+            if (o_transpose) {
                 @TYPE@_matmul_matrixmatrix(
                     ip2_, is2_p_, is2_n_,
                     ip1_, is1_n_, is1_m_,
diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py
index 7603449ba28e..34740963f6d8 100644
--- a/numpy/_core/tests/test_multiarray.py
+++ b/numpy/_core/tests/test_multiarray.py
@@ -7272,6 +7272,10 @@ def test_out_contiguous(self):
         assert_array_equal(c, tgt_mv)
         c = self.matmul(v, a.T, out=out[:, 0, 0])
         assert_array_equal(c, tgt_mv)
+        # issue 29164
+        out_f = np.zeros((10, 4), dtype=float)
+        c = self.matmul(a, b, out=out_f[::-2, ::-2])
+        assert_array_equal(c, tgt)
 
         # test out contiguous in only last dim
         out = np.ones((10, 2), dtype=float)

From ece0bdb1f8ef91b6716777aa65b5c1c2ce659d60 Mon Sep 17 00:00:00 2001
From: mattip <matti.picus@gmail.com>
Date: Wed, 11 Jun 2025 19:58:24 +0300
Subject: [PATCH 2/3] DOC: add release note

---
 doc/release/upcoming_changes/23752.performance.rst | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 doc/release/upcoming_changes/23752.performance.rst

diff --git a/doc/release/upcoming_changes/23752.performance.rst b/doc/release/upcoming_changes/23752.performance.rst
new file mode 100644
index 000000000000..37ed1ee34dd8
--- /dev/null
+++ b/doc/release/upcoming_changes/23752.performance.rst
@@ -0,0 +1,6 @@
+Improve matmul performance when operands are non-contiguous
+-----------------------------------------------------------
+
+Enable using BLAS for matmul even when operands are non-contiguous by copying
+if needed. This performance enhancement's original implementation had a bug
+that was fixed for v2.3.1

From 6100fba37b5f3c9584437d91d5c8c3c219021622 Mon Sep 17 00:00:00 2001
From: mattip <matti.picus@gmail.com>
Date: Thu, 12 Jun 2025 08:35:22 +0300
Subject: [PATCH 3/3] fixes from review

---
 doc/release/upcoming_changes/23752.performance.rst |  6 ------
 doc/release/upcoming_changes/29179.change.rst      |  4 ++++
 doc/source/release/2.3.0-notes.rst                 |  6 ++++++
 numpy/_core/tests/test_multiarray.py               | 10 ++++++----
 4 files changed, 16 insertions(+), 10 deletions(-)
 delete mode 100644 doc/release/upcoming_changes/23752.performance.rst
 create mode 100644 doc/release/upcoming_changes/29179.change.rst

diff --git a/doc/release/upcoming_changes/23752.performance.rst b/doc/release/upcoming_changes/23752.performance.rst
deleted file mode 100644
index 37ed1ee34dd8..000000000000
--- a/doc/release/upcoming_changes/23752.performance.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Improve matmul performance when operands are non-contiguous
------------------------------------------------------------
-
-Enable using BLAS for matmul even when operands are non-contiguous by copying
-if needed. This performance enhancement's original implementation had a bug
-that was fixed for v2.3.1
diff --git a/doc/release/upcoming_changes/29179.change.rst b/doc/release/upcoming_changes/29179.change.rst
new file mode 100644
index 000000000000..12eb6804d3dd
--- /dev/null
+++ b/doc/release/upcoming_changes/29179.change.rst
@@ -0,0 +1,4 @@
+Fix bug in ``matmul`` for non-contiguous out kwarg parameter
+------------------------------------------------------------
+In some cases, a non-contiguous ``out`` made ``np.matmul`` corrupt memory or
+trip a C-level assertion. This bug was introduced in v2.3.0 and fixed in v2.3.1.
diff --git a/doc/source/release/2.3.0-notes.rst b/doc/source/release/2.3.0-notes.rst
index faad9ffcc8eb..4c3c923b3b5e 100644
--- a/doc/source/release/2.3.0-notes.rst
+++ b/doc/source/release/2.3.0-notes.rst
@@ -414,6 +414,12 @@ the best performance.
 
 (`gh-28769 <https://github.com/numpy/numpy/pull/28769>`__)
 
+Performance improvements for ``np.matmul``
+------------------------------------------
+BLAS is now used for ``matmul`` even when the operands are non-contiguous,
+copying them if needed.
+
+(`gh-23752 <https://github.com/numpy/numpy/pull/23752>`__)
 
 Changes
 =======
diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py
index 34740963f6d8..acf053b41490 100644
--- a/numpy/_core/tests/test_multiarray.py
+++ b/numpy/_core/tests/test_multiarray.py
@@ -7272,10 +7272,6 @@ def test_out_contiguous(self):
         assert_array_equal(c, tgt_mv)
         c = self.matmul(v, a.T, out=out[:, 0, 0])
         assert_array_equal(c, tgt_mv)
-        # issue 29164
-        out_f = np.zeros((10, 4), dtype=float)
-        c = self.matmul(a, b, out=out_f[::-2, ::-2])
-        assert_array_equal(c, tgt)
 
         # test out contiguous in only last dim
         out = np.ones((10, 2), dtype=float)
@@ -7321,6 +7317,12 @@ def test_dot_equivalent(self, args):
         r3 = np.matmul(args[0].copy(), args[1].copy())
         assert_equal(r1, r3)
 
+        # matrix matrix, issue 29164
+        if [len(args[0].shape), len(args[1].shape)] == [2, 2]:
+            out_f = np.zeros((r2.shape[0] * 2, r2.shape[1] * 2), order='F')
+            r4 = np.matmul(*args, out=out_f[::2, ::2])
+            assert_equal(r2, r4)
+
     def test_matmul_object(self):
         import fractions