From d77dcba35f3795432e795d1699a50086bfc77b0d Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Fri, 12 Nov 2010 10:52:19 -0500
Subject: [PATCH 01/15] ENH: Allow user to pass in output array for dot()

This avoids allocating a new array for the result. The output array is
checked very strictly (type, number of dimensions, shape, C-contiguity),
but since this is intended as an optimisation, it should only be used
when the user knows what they are doing.
---
 numpy/core/blasdot/_dotblas.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 1dc24d8ea889..771c2bf5388f 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -216,7 +216,7 @@ static PyObject *
 dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
     PyObject *op1, *op2;
-    PyArrayObject *ap1 = NULL, *ap2 = NULL, *ret = NULL;
+    PyArrayObject *ap1 = NULL, *ap2 = NULL, *out = NULL, *ret = NULL;
     int j, l, lda, ldb, ldc;
     int typenum, nd;
     npy_intp ap1stride = 0;
@@ -231,7 +231,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     PyArray_Descr *dtype;
     MatrixShape ap1shape, ap2shape;
 
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+    if (!PyArg_ParseTuple(args, "OO|O", &op1, &op2, &out)) {
         return NULL;
     }
 
@@ -418,10 +418,27 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         subtype = Py_TYPE(ap1);
     }
 
-    ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                       typenum, NULL, NULL, 0, 0,
-                                       (PyObject *)
-                                       (prior2 > prior1 ? ap2 : ap1));
+    if (out) {
+        /* verify that it is usable */
+        if (Py_Type(out) != subtype || PyArray_NDIM(out) != nd || PyArray_TYPE(out) != typenum || !PyArray_ISCARRAY(out)) {
+            PyErr_SetString(PyExc_ValueError, "output array is not acceptable (must have the right type, nr dimensions, and be a C-Array(");
+            goto fail;
+        }
+        int d;
+        for (d = 0; d != nd; ++d) {
+            if (dimensions[d] != PyArray_DIM(out, d)) {
+                PyErr_SetString(PyExc_ValueError, "output array has wrong dimensions");
+                goto fail;
+            }
+        }
+        Py_INCREF(out);
+        ret = out;
+    } else {
+        ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
+                                           typenum, NULL, NULL, 0, 0,
+                                           (PyObject *)
+                                           (prior2 > prior1 ? ap2 : ap1));
+    }
 
     if (ret == NULL) {
         goto fail;

From f1283a5bba43fb02430559bbf512829e25837e91 Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Fri, 12 Nov 2010 11:09:10 -0500
Subject: [PATCH 02/15] DOC+TST: Document&test new output array for dot()

Using the toy example in the test as a benchmark, I clocked 63ms with
an `out` parameter, versus 91ms without. Here is the full
benchmark (requires ipython)::

    import numpy as np
    np.random.seed(22)
    f = np.random.rand(1024*128, 16)
    v = np.random.rand(16,32)
    r = np.empty((1024*128, 32))
    _ip.magic("timeit np.dot(f,v,r)")
    _ip.magic("timeit np.dot(f,v)")
---
 numpy/add_newdocs.py             |  9 +++++++++
 numpy/core/tests/test_blasdot.py | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index baf5285f0859..81643cbeedec 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -1193,6 +1193,7 @@
 add_newdoc('numpy.core', 'dot',
     """
     dot(a, b)
+    dot(a, b, out)
 
     Dot product of two arrays.
 
@@ -1209,6 +1210,13 @@
         First argument.
     b : array_like
         Second argument.
+    out : ndarray, optional
+        Output argument. This must have the exact kind that would be returned
+        if it was not used. In particular, it must have the right type, must be
+        C-contiguous, and its dtype must be the dtype that would be returned
+        for `dot(a,b)`. This is a performance feature. Therefore, if these
+        conditions are not met, an exception is raised, instead of attempting
+        to be flexible.
 
     Returns
     -------
@@ -1216,6 +1224,7 @@
         Returns the dot product of `a` and `b`.  If `a` and `b` are both
         scalars or both 1-D arrays then a scalar is returned; otherwise
         an array is returned.
+        If `out` is given, then it is returned.
 
     Raises
     ------
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 3c04759d55bd..04e670b76335 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -26,3 +26,19 @@ def test_blasdot_used():
     assert_(inner is _dotblas.inner)
     assert_(alterdot is _dotblas.alterdot)
     assert_(restoredot is _dotblas.restoredot)
+
+
+def test_dot_3args():
+    import numpy as np
+    import sys
+    np.random.seed(22)
+    f = np.random.rand(1024*128, 16)
+    v = np.random.rand(16,32)
+
+    r = np.empty((1024*128, 32))
+    for i in xrange(12):
+        np.dot(v,f,r)
+    assert sys.getrefcount(r) == 2
+    r2 = np.dot(f,v)
+    assert np.all(r2 == r)
+    assert r is np.dot(f,v, r)

From dfe809aab895fec278dcbee177ed4ea77506da3c Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Tue, 16 Nov 2010 11:30:26 -0500
Subject: [PATCH 03/15] ENH: Limit lines to 80 chars.

---
 numpy/core/blasdot/_dotblas.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 771c2bf5388f..0be34a36217e 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -420,14 +420,21 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     if (out) {
         /* verify that it is usable */
-        if (Py_Type(out) != subtype || PyArray_NDIM(out) != nd || PyArray_TYPE(out) != typenum || !PyArray_ISCARRAY(out)) {
-            PyErr_SetString(PyExc_ValueError, "output array is not acceptable (must have the right type, nr dimensions, and be a C-Array(");
+        if (Py_Type(out) != subtype ||
+            PyArray_NDIM(out) != nd ||
+            PyArray_TYPE(out) != typenum ||
+            !PyArray_ISCARRAY(out)) {
+
+            PyErr_SetString(PyExc_ValueError,
+                "output array is not acceptable "
+                "(must have the right type, nr dimensions, and be a C-Array)");
             goto fail;
         }
         int d;
         for (d = 0; d != nd; ++d) {
             if (dimensions[d] != PyArray_DIM(out, d)) {
-                PyErr_SetString(PyExc_ValueError, "output array has wrong dimensions");
+                PyErr_SetString(PyExc_ValueError,
+                    "output array has wrong dimensions");
                 goto fail;
             }
         }

From 221ed686335819867374adc3734c6e5597088a3f Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Tue, 16 Nov 2010 13:24:18 -0500
Subject: [PATCH 04/15] TST: Test matrix x vector dot() with output array

---
 numpy/core/tests/test_blasdot.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 04e670b76335..c9745ab84f82 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -32,13 +32,20 @@ def test_dot_3args():
     import numpy as np
     import sys
     np.random.seed(22)
-    f = np.random.rand(1024*128, 16)
-    v = np.random.rand(16,32)
+    f = np.random.random_sample((1024, 16))
+    v = np.random.random_sample((16, 32))
 
-    r = np.empty((1024*128, 32))
+    r = np.empty((1024, 32))
     for i in xrange(12):
-        np.dot(v,f,r)
+        np.dot(f,v,r)
     assert sys.getrefcount(r) == 2
     r2 = np.dot(f,v)
     assert np.all(r2 == r)
-    assert r is np.dot(f,v, r)
+    assert r is np.dot(f,v,r)
+
+    v = v[:,0].copy() # v.shape == (16,)
+    r = r[:,0].copy() # r.shape == (1024,)
+    r2 = np.dot(f,v)
+    assert r is np.dot(f,v,r)
+    assert np.all(r2 == r)
+

From 9cbbeffd1d3b1e76c0fa0bd42c801a6263a912be Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Tue, 16 Nov 2010 13:53:48 -0500
Subject: [PATCH 05/15] TST: Test that error conditions raise ValueError

---
 numpy/core/tests/test_blasdot.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index c9745ab84f82..a30b8b86db02 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,5 +1,5 @@
 from numpy.core import zeros, float64
-from numpy.testing import dec, TestCase, assert_almost_equal, assert_
+from numpy.testing import dec, TestCase, assert_almost_equal, assert_, assert_raises
 from numpy.core.multiarray import inner as inner_
 
 DECPREC = 14
@@ -49,3 +49,33 @@ def test_dot_3args():
     assert r is np.dot(f,v,r)
     assert np.all(r2 == r)
 
+def test_dot_3args_errors():
+    import numpy as np
+    np.random.seed(22)
+    f = np.random.random_sample((1024, 16))
+    v = np.random.random_sample((16, 32))
+
+    r = np.empty((1024, 31))
+    assert_raises(ValueError, np.dot, f, v, r)
+
+    r = np.empty((1024,))
+    assert_raises(ValueError, np.dot, f, v, r)
+
+    r = np.empty((32,))
+    assert_raises(ValueError, np.dot, f, v, r)
+
+    r = np.empty((32, 1024))
+    assert_raises(ValueError, np.dot, f, v, r)
+    assert_raises(ValueError, np.dot, f, v, r.T)
+
+    r = np.empty((1024, 64))
+    assert_raises(ValueError, np.dot, f, v, r[:,::2])
+    assert_raises(ValueError, np.dot, f, v, r[:,:32])
+
+    r = np.empty((1024, 32), dtype=np.float32)
+    assert_raises(ValueError, np.dot, f, v, r)
+
+    r = np.empty((1024, 32), dtype=int)
+    assert_raises(ValueError, np.dot, f, v, r)
+
+

From 82e9dadaeca3136b574b69e84c3e81c1a1f1659d Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Sat, 20 Nov 2010 20:03:45 -0500
Subject: [PATCH 06/15] ENH: Have non-BLAS dot also support out parameter

---
 numpy/core/src/multiarray/multiarraymodule.c | 36 +++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index d4dba719c2ba..5dbdffea3af1 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -578,7 +578,7 @@ PyArray_CanCoerceScalar(int thistype, int neededtype,
  * priority of ap1 and ap2 into account.
  */
 static PyArrayObject *
-new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2,
+new_array_for_sum(PyArrayObject* out, PyArrayObject *ap1, PyArrayObject *ap2,
                   int nd, intp dimensions[], int typenum)
 {
     PyArrayObject *ret;
@@ -597,6 +597,26 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2,
         prior1 = prior2 = 0.0;
         subtype = Py_TYPE(ap1);
     }
+    if (out) {
+        if (Py_TYPE(out) != subtype ||
+            PyArray_NDIM(out) != nd ||
+            PyArray_TYPE(out) != typenum ||
+            !PyArray_ISCARRAY(out)) {
+            PyErr_SetString(PyExc_ValueError,
+                "output array is not acceptable "
+                "(must have the right type, nr dimensions, and be a C-Array)");
+            return 0;
+        }
+        int d;
+        for (d = 0; d != nd; ++d) {
+            if (dimensions[d] != PyArray_DIM(out, d)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "output array has wrong dimensions");
+                return 0;
+            }
+        }
+        return out;
+    }
 
     ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
                                        typenum, NULL, NULL, 0, 0,
@@ -666,7 +686,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
      * Need to choose an output array that can hold a sum
      * -- use priority to determine which subtype.
      */
-    ret = new_array_for_sum(ap1, ap2, nd, dimensions, typenum);
+    ret = new_array_for_sum(NULL, ap1, ap2, nd, dimensions, typenum);
     if (ret == NULL) {
         goto fail;
     }
@@ -719,7 +739,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
  * just like inner product but does the swapaxes stuff on the fly
  */
 NPY_NO_EXPORT PyObject *
-PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
+PyArray_MatrixProduct(PyObject *op1, PyObject *op2, PyObject* out)
 {
     PyArrayObject *ap1, *ap2, *ret = NULL;
     PyArrayIterObject *it1, *it2;
@@ -788,7 +808,7 @@ PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
 
     is1 = ap1->strides[ap1->nd-1]; is2 = ap2->strides[matchDim];
     /* Choose which subtype to return */
-    ret = new_array_for_sum(ap1, ap2, nd, dimensions, typenum);
+    ret = new_array_for_sum(out, ap1, ap2, nd, dimensions, typenum);
     if (ret == NULL) {
         goto fail;
     }
@@ -968,7 +988,7 @@ _pyarray_correlate(PyArrayObject *ap1, PyArrayObject *ap2, int typenum,
      * Need to choose an output array that can hold a sum
      * -- use priority to determine which subtype.
      */
-    ret = new_array_for_sum(ap1, ap2, 1, &length, typenum);
+    ret = new_array_for_sum(NULL, ap1, ap2, 1, &length, typenum);
     if (ret == NULL) {
         return NULL;
     }
@@ -1852,12 +1872,12 @@ array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 static PyObject *
 array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
-    PyObject *v, *a;
+    PyObject *v, *a, *o = NULL;
 
-    if (!PyArg_ParseTuple(args, "OO", &a, &v)) {
+    if (!PyArg_ParseTuple(args, "OO|O", &a, &v, &o)) {
         return NULL;
     }
-    return _ARET(PyArray_MatrixProduct(a, v));
+    return _ARET(PyArray_MatrixProduct(a, v, o));
 }
 
 static PyObject *

From 9ea97d931054432a3270099c2535f56e7e50b2b4 Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Sat, 20 Nov 2010 20:05:20 -0500
Subject: [PATCH 07/15] ENH: Make code C89

(Pointed out by pv on github)
---
 numpy/core/blasdot/_dotblas.c                | 4 ++--
 numpy/core/src/multiarray/multiarraymodule.c | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 0be34a36217e..4b0d9b755ea6 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -419,7 +419,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     }
 
     if (out) {
-        /* verify that it is usable */
+        int d;
+        /* verify that out is usable */
         if (Py_Type(out) != subtype ||
             PyArray_NDIM(out) != nd ||
             PyArray_TYPE(out) != typenum ||
@@ -430,7 +431,6 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
                 "(must have the right type, nr dimensions, and be a C-Array)");
             goto fail;
         }
-        int d;
         for (d = 0; d != nd; ++d) {
             if (dimensions[d] != PyArray_DIM(out, d)) {
                 PyErr_SetString(PyExc_ValueError,
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5dbdffea3af1..bb20ab96b5e7 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -598,6 +598,8 @@ new_array_for_sum(PyArrayObject* out, PyArrayObject *ap1, PyArrayObject *ap2,
         subtype = Py_TYPE(ap1);
     }
     if (out) {
+        int d;
+        /* verify that out is usable */
         if (Py_TYPE(out) != subtype ||
             PyArray_NDIM(out) != nd ||
             PyArray_TYPE(out) != typenum ||
@@ -607,7 +609,6 @@ new_array_for_sum(PyArrayObject* out, PyArrayObject *ap1, PyArrayObject *ap2,
                 "(must have the right type, nr dimensions, and be a C-Array)");
             return 0;
         }
-        int d;
         for (d = 0; d != nd; ++d) {
             if (dimensions[d] != PyArray_DIM(out, d)) {
                 PyErr_SetString(PyExc_ValueError,

From 5f064eddcbcf203a157e2551e3eed79b8993436b Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Mon, 22 Nov 2010 12:32:22 -0500
Subject: [PATCH 08/15] BUG: Fix typo. Call PyArray_MatrixProduct with 3 args

---
 numpy/core/blasdot/_dotblas.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 4b0d9b755ea6..92a0fd04a5e3 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -246,7 +246,10 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     /* This function doesn't handle other types */
     if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE &&
          typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(op1, op2));
+        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(
+                                                    (PyObject *)op1,
+                                                    (PyObject *)op2,
+                                                    (PyObject *)out));
     }
 
     dtype = PyArray_DescrFromType(typenum);
@@ -280,7 +283,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
             Py_DECREF(tmp2);
         }
         ret = (PyArrayObject *)PyArray_MatrixProduct((PyObject *)ap1,
-                                                     (PyObject *)ap2);
+                                                     (PyObject *)ap2,
+                                                     (PyObject *)out);
         Py_DECREF(ap1);
         Py_DECREF(ap2);
         return PyArray_Return(ret);
@@ -421,7 +425,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (out) {
         int d;
         /* verify that out is usable */
-        if (Py_Type(out) != subtype ||
+        if (Py_TYPE(out) != subtype ||
             PyArray_NDIM(out) != nd ||
             PyArray_TYPE(out) != typenum ||
             !PyArray_ISCARRAY(out)) {

From b42f326306136c02ab9f4d430d56764e6602a1ea Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Mon, 22 Nov 2010 12:39:09 -0500
Subject: [PATCH 09/15] ENH: Named arguments

This allows `np.dot(a, b, out=r)`.

Suggested by pv.
---
 numpy/core/blasdot/_dotblas.c                | 8 +++++---
 numpy/core/src/multiarray/multiarraymodule.c | 7 ++++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 92a0fd04a5e3..f019ce2488f0 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -213,7 +213,7 @@ _bad_strides(PyArrayObject *ap)
  * NB: The first argument is not conjugated.;
  */
 static PyObject *
-dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwargs)
 {
     PyObject *op1, *op2;
     PyArrayObject *ap1 = NULL, *ap2 = NULL, *out = NULL, *ret = NULL;
@@ -230,8 +230,10 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     PyTypeObject *subtype;
     PyArray_Descr *dtype;
     MatrixShape ap1shape, ap2shape;
+    char* kwords[] = {"a", "b", "out", NULL };
 
-    if (!PyArg_ParseTuple(args, "OO|O", &op1, &op2, &out)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O", kwords,
+                                    &op1, &op2, &out)) {
         return NULL;
     }
 
@@ -1195,7 +1197,7 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
 }
 
 static struct PyMethodDef dotblas_module_methods[] = {
-    {"dot",  (PyCFunction)dotblas_matrixproduct, 1, NULL},
+    {"dot",  (PyCFunction)dotblas_matrixproduct, METH_VARARGS|METH_KEYWORDS, NULL},
     {"inner",   (PyCFunction)dotblas_innerproduct,  1, NULL},
     {"vdot", (PyCFunction)dotblas_vdot, 1, NULL},
     {"alterdot", (PyCFunction)dotblas_alterdot, 1, NULL},
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index bb20ab96b5e7..286843aad130 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -1871,11 +1871,12 @@ array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 }
 
 static PyObject *
-array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
 {
     PyObject *v, *a, *o = NULL;
+    char* kwlist[] = {"a", "b", "out", NULL };
 
-    if (!PyArg_ParseTuple(args, "OO|O", &a, &v, &o)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O", kwlist, &a, &v, &o)) {
         return NULL;
     }
     return _ARET(PyArray_MatrixProduct(a, v, o));
@@ -2787,7 +2788,7 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS, NULL},
     {"dot",
         (PyCFunction)array_matrixproduct,
-        METH_VARARGS, NULL},
+        METH_VARARGS | METH_KEYWORDS, NULL},
     {"_fastCopyAndTranspose",
         (PyCFunction)array_fastCopyAndTranspose,
         METH_VARARGS, NULL},

From bc7478d83c004af3f09b3eeffb8ea3a52be9fa8b Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 11 Dec 2010 19:46:12 +0100
Subject: [PATCH 10/15] BUG: core: restore PyArray_MatrixProduct signature, put
 the new signature in PyArray_MatrixProduct2

---
 numpy/core/blasdot/_dotblas.c                |  8 +--
 numpy/core/code_generators/numpy_api.py      |  1 +
 numpy/core/src/multiarray/multiarraymodule.c | 17 +++--
 numpy/core/tests/test_blasdot.py             | 17 +++--
 numpy/core/tests/test_multiarray.py          | 76 ++++++++++++++++----
 5 files changed, 90 insertions(+), 29 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index f019ce2488f0..abcfb9422827 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -248,7 +248,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     /* This function doesn't handle other types */
     if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE &&
          typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(
+        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(
                                                     (PyObject *)op1,
                                                     (PyObject *)op2,
                                                     (PyObject *)out));
@@ -284,9 +284,9 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             Py_DECREF(tmp1);
             Py_DECREF(tmp2);
         }
-        ret = (PyArrayObject *)PyArray_MatrixProduct((PyObject *)ap1,
-                                                     (PyObject *)ap2,
-                                                     (PyObject *)out);
+        ret = (PyArrayObject *)PyArray_MatrixProduct2((PyObject *)ap1,
+                                                      (PyObject *)ap2,
+                                                      (PyObject *)out);
         Py_DECREF(ap1);
         Py_DECREF(ap2);
         return PyArray_Return(ret);
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 9474a131a5c1..62319363250f 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -252,6 +252,7 @@
     'PyArray_TimedeltaToTimedeltaStruct':   218,
     'PyArray_DatetimeStructToDatetime':     219,
     'PyArray_TimedeltaStructToTimedelta':   220,
+    'PyArray_MatrixProduct2':               221,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 286843aad130..d106fad48963 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -734,13 +734,12 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     return NULL;
 }
 
-
 /*NUMPY_API
- *Numeric.matrixproduct(a,v)
+ * Numeric.matrixproduct(a,v,out)
  * just like inner product but does the swapaxes stuff on the fly
  */
 NPY_NO_EXPORT PyObject *
-PyArray_MatrixProduct(PyObject *op1, PyObject *op2, PyObject* out)
+PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyObject* out)
 {
     PyArrayObject *ap1, *ap2, *ret = NULL;
     PyArrayIterObject *it1, *it2;
@@ -866,6 +865,16 @@ PyArray_MatrixProduct(PyObject *op1, PyObject *op2, PyObject* out)
     return NULL;
 }
 
+/*NUMPY_API
+ *Numeric.matrixproduct(a,v)
+ * just like inner product but does the swapaxes stuff on the fly
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
+{
+    return PyArray_MatrixProduct2(op1, op2, NULL);
+}
+
 /*NUMPY_API
  * Fast Copy and Transpose
  */
@@ -1879,7 +1888,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O", kwlist, &a, &v, &o)) {
         return NULL;
     }
-    return _ARET(PyArray_MatrixProduct(a, v, o));
+    return _ARET(PyArray_MatrixProduct2(a, v, o));
 }
 
 static PyObject *
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index a30b8b86db02..28ee614259de 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,5 +1,7 @@
+import numpy as np
 from numpy.core import zeros, float64
-from numpy.testing import dec, TestCase, assert_almost_equal, assert_, assert_raises
+from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
+     assert_raises, assert_array_equal
 from numpy.core.multiarray import inner as inner_
 
 DECPREC = 14
@@ -29,8 +31,6 @@ def test_blasdot_used():
 
 
 def test_dot_3args():
-    import numpy as np
-    import sys
     np.random.seed(22)
     f = np.random.random_sample((1024, 16))
     v = np.random.random_sample((16, 32))
@@ -38,19 +38,18 @@ def test_dot_3args():
     r = np.empty((1024, 32))
     for i in xrange(12):
         np.dot(f,v,r)
-    assert sys.getrefcount(r) == 2
+    assert_equal(sys.getrefcount(r), 2)
     r2 = np.dot(f,v)
-    assert np.all(r2 == r)
-    assert r is np.dot(f,v,r)
+    assert_array_equal(r2, r)
+    assert_(r is np.dot(f,v,r))
 
     v = v[:,0].copy() # v.shape == (16,)
     r = r[:,0].copy() # r.shape == (1024,)
     r2 = np.dot(f,v)
-    assert r is np.dot(f,v,r)
-    assert np.all(r2 == r)
+    assert_(r is np.dot(f,v,r))
+    assert_array_equal(r2, r)
 
 def test_dot_3args_errors():
-    import numpy as np
     np.random.seed(22)
     f = np.random.random_sample((1024, 16))
     v = np.random.random_sample((16, 32))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b3bf209a15d9..2f8f90ddb405 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1274,6 +1274,58 @@ def __array_finalize__(self, obj):
         res = dat.var(1)
         assert res.info == dat.info
 
+class TestDot(TestCase):
+    def test_dot_3args(self):
+        from numpy.core.multiarray import dot
+
+        np.random.seed(22)
+        f = np.random.random_sample((1024, 16))
+        v = np.random.random_sample((16, 32))
+
+        r = np.empty((1024, 32))
+        for i in xrange(12):
+            dot(f,v,r)
+        assert_equal(sys.getrefcount(r), 2)
+        r2 = dot(f,v)
+        assert_array_equal(r2, r)
+        assert_(r is dot(f,v,r))
+
+        v = v[:,0].copy() # v.shape == (16,)
+        r = r[:,0].copy() # r.shape == (1024,)
+        r2 = dot(f,v)
+        assert_(r is dot(f,v,r))
+        assert_array_equal(r2, r)
+
+    def test_dot_3args_errors(self):
+        from numpy.core.multiarray import dot
+
+        np.random.seed(22)
+        f = np.random.random_sample((1024, 16))
+        v = np.random.random_sample((16, 32))
+
+        r = np.empty((1024, 31))
+        assert_raises(ValueError, dot, f, v, r)
+
+        r = np.empty((1024,))
+        assert_raises(ValueError, dot, f, v, r)
+
+        r = np.empty((32,))
+        assert_raises(ValueError, dot, f, v, r)
+
+        r = np.empty((32, 1024))
+        assert_raises(ValueError, dot, f, v, r)
+        assert_raises(ValueError, dot, f, v, r.T)
+
+        r = np.empty((1024, 64))
+        assert_raises(ValueError, dot, f, v, r[:,::2])
+        assert_raises(ValueError, dot, f, v, r[:,:32])
+
+        r = np.empty((1024, 32), dtype=np.float32)
+        assert_raises(ValueError, dot, f, v, r)
+
+        r = np.empty((1024, 32), dtype=int)
+        assert_raises(ValueError, dot, f, v, r)
+
 
 class TestSummarization(TestCase):
     def test_1d(self):
@@ -1329,23 +1381,23 @@ class TestNeighborhoodIter(TestCase):
     def _test_simple2d(self, dt):
         # Test zero and one padding for simple data type
         x = np.array([[0, 1], [2, 3]], dtype=dt)
-        r = [np.array([[0, 0, 0], [0, 0, 1]], dtype=dt), 
-             np.array([[0, 0, 0], [0, 1, 0]], dtype=dt), 
-             np.array([[0, 0, 1], [0, 2, 3]], dtype=dt), 
+        r = [np.array([[0, 0, 0], [0, 0, 1]], dtype=dt),
+             np.array([[0, 0, 0], [0, 1, 0]], dtype=dt),
+             np.array([[0, 0, 1], [0, 2, 3]], dtype=dt),
              np.array([[0, 1, 0], [2, 3, 0]], dtype=dt)]
         l = test_neighborhood_iterator(x, [-1, 0, -1, 1], x[0], NEIGH_MODE['zero'])
         assert_array_equal(l, r)
 
-        r = [np.array([[1, 1, 1], [1, 0, 1]], dtype=dt), 
-             np.array([[1, 1, 1], [0, 1, 1]], dtype=dt), 
-             np.array([[1, 0, 1], [1, 2, 3]], dtype=dt), 
+        r = [np.array([[1, 1, 1], [1, 0, 1]], dtype=dt),
+             np.array([[1, 1, 1], [0, 1, 1]], dtype=dt),
+             np.array([[1, 0, 1], [1, 2, 3]], dtype=dt),
              np.array([[0, 1, 1], [2, 3, 1]], dtype=dt)]
         l = test_neighborhood_iterator(x, [-1, 0, -1, 1], x[0], NEIGH_MODE['one'])
         assert_array_equal(l, r)
 
-        r = [np.array([[4, 4, 4], [4, 0, 1]], dtype=dt), 
-             np.array([[4, 4, 4], [0, 1, 4]], dtype=dt), 
-             np.array([[4, 0, 1], [4, 2, 3]], dtype=dt), 
+        r = [np.array([[4, 4, 4], [4, 0, 1]], dtype=dt),
+             np.array([[4, 4, 4], [0, 1, 4]], dtype=dt),
+             np.array([[4, 0, 1], [4, 2, 3]], dtype=dt),
              np.array([[0, 1, 4], [2, 3, 4]], dtype=dt)]
         l = test_neighborhood_iterator(x, [-1, 0, -1, 1], 4, NEIGH_MODE['constant'])
         assert_array_equal(l, r)
@@ -1362,9 +1414,9 @@ def test_simple2d_object(self):
 
     def _test_mirror2d(self, dt):
         x = np.array([[0, 1], [2, 3]], dtype=dt)
-        r = [np.array([[0, 0, 1], [0, 0, 1]], dtype=dt), 
-             np.array([[0, 1, 1], [0, 1, 1]], dtype=dt), 
-             np.array([[0, 0, 1], [2, 2, 3]], dtype=dt), 
+        r = [np.array([[0, 0, 1], [0, 0, 1]], dtype=dt),
+             np.array([[0, 1, 1], [0, 1, 1]], dtype=dt),
+             np.array([[0, 0, 1], [2, 2, 3]], dtype=dt),
              np.array([[0, 1, 1], [2, 3, 3]], dtype=dt)]
         l = test_neighborhood_iterator(x, [-1, 0, -1, 1], x[0], NEIGH_MODE['mirror'])
         assert_array_equal(l, r)

From 34a713095dbe415d4e79dffd2a41af696ab15666 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 11 Dec 2010 20:13:05 +0100
Subject: [PATCH 11/15] TST: core: some new tests for multiarray/blas dot()

---
 numpy/core/tests/test_blasdot.py    | 12 +++++++++++-
 numpy/core/tests/test_multiarray.py | 10 ++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 28ee614259de..1acddb3270eb 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,7 +1,7 @@
 import numpy as np
 from numpy.core import zeros, float64
 from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
-     assert_raises, assert_array_equal
+     assert_raises, assert_array_equal, assert_allclose
 from numpy.core.multiarray import inner as inner_
 
 DECPREC = 14
@@ -30,6 +30,16 @@ def test_blasdot_used():
     assert_(restoredot is _dotblas.restoredot)
 
 
+def test_dot_2args(self):
+    from numpy.core import dot
+
+    a = np.array([[1, 2], [3, 4]], dtype=float)
+    b = np.array([[1, 0], [1, 1]], dtype=float)
+    c = np.array([[3, 2], [7, 4]], dtype=float)
+
+    d = dot(a, b)
+    assert_allclose(c, d)
+
 def test_dot_3args():
     np.random.seed(22)
     f = np.random.random_sample((1024, 16))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 2f8f90ddb405..85a4a06916a6 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1275,6 +1275,16 @@ def __array_finalize__(self, obj):
         assert res.info == dat.info
 
 class TestDot(TestCase):
+    def test_dot_2args(self):
+        from numpy.core.multiarray import dot
+
+        a = np.array([[1, 2], [3, 4]], dtype=float)
+        b = np.array([[1, 0], [1, 1]], dtype=float)
+        c = np.array([[3, 2], [7, 4]], dtype=float)
+
+        d = dot(a, b)
+        assert_allclose(c, d)
+
     def test_dot_3args(self):
         from numpy.core.multiarray import dot
 

From a774d9675bbd40f24ee3af53acb20efb785c427b Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 11 Dec 2010 20:17:18 +0100
Subject: [PATCH 12/15] BUG: core: fix refcount error in
 multiarraymodule.c:new_array_for_sum

---
 numpy/core/src/multiarray/multiarraymodule.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index d106fad48963..d30e56ab6e74 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -616,6 +616,7 @@ new_array_for_sum(PyArrayObject* out, PyArrayObject *ap1, PyArrayObject *ap2,
                 return 0;
             }
         }
+        Py_INCREF(out);
         return out;
     }
 

From bcebd314dcf35ee8699d0c39a89f3124ecec5a2a Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 11 Dec 2010 20:18:28 +0100
Subject: [PATCH 13/15] TMP: fix some issues in tests

---
 numpy/core/tests/test_blasdot.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 1acddb3270eb..93a78c0bf5b1 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,7 +1,8 @@
 import numpy as np
+import sys
 from numpy.core import zeros, float64
 from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
-     assert_raises, assert_array_equal, assert_allclose
+     assert_raises, assert_array_equal, assert_allclose, assert_equal
 from numpy.core.multiarray import inner as inner_
 
 DECPREC = 14
@@ -30,7 +31,7 @@ def test_blasdot_used():
     assert_(restoredot is _dotblas.restoredot)
 
 
-def test_dot_2args(self):
+def test_dot_2args():
     from numpy.core import dot
 
     a = np.array([[1, 2], [3, 4]], dtype=float)

From 48eecd5f3739e155101b703898829a190923939d Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Sat, 8 Jan 2011 16:20:50 -0500
Subject: [PATCH 14/15] ENH: Use the name PyArray_MatrixProduct3

The new name indicates that the function takes three arguments (op1, op2, out).
---
 numpy/core/blasdot/_dotblas.c                | 4 ++--
 numpy/core/code_generators/numpy_api.py      | 2 +-
 numpy/core/src/multiarray/multiarraymodule.c | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index abcfb9422827..6f22e3a9b5be 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -248,7 +248,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     /* This function doesn't handle other types */
     if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE &&
          typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(
+        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct3(
                                                     (PyObject *)op1,
                                                     (PyObject *)op2,
                                                     (PyObject *)out));
@@ -284,7 +284,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             Py_DECREF(tmp1);
             Py_DECREF(tmp2);
         }
-        ret = (PyArrayObject *)PyArray_MatrixProduct2((PyObject *)ap1,
+        ret = (PyArrayObject *)PyArray_MatrixProduct3((PyObject *)ap1,
                                                       (PyObject *)ap2,
                                                       (PyObject *)out);
         Py_DECREF(ap1);
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 62319363250f..a031ca36e827 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -252,7 +252,7 @@
     'PyArray_TimedeltaToTimedeltaStruct':   218,
     'PyArray_DatetimeStructToDatetime':     219,
     'PyArray_TimedeltaStructToTimedelta':   220,
-    'PyArray_MatrixProduct2':               221,
+    'PyArray_MatrixProduct3':               221,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index d30e56ab6e74..f5a6b6d79434 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -740,7 +740,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
  * just like inner product but does the swapaxes stuff on the fly
  */
 NPY_NO_EXPORT PyObject *
-PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyObject* out)
+PyArray_MatrixProduct3(PyObject *op1, PyObject *op2, PyObject* out)
 {
     PyArrayObject *ap1, *ap2, *ret = NULL;
     PyArrayIterObject *it1, *it2;
@@ -873,7 +873,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyObject* out)
 NPY_NO_EXPORT PyObject *
 PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
 {
-    return PyArray_MatrixProduct2(op1, op2, NULL);
+    return PyArray_MatrixProduct3(op1, op2, NULL);
 }
 
 /*NUMPY_API
@@ -1889,7 +1889,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O", kwlist, &a, &v, &o)) {
         return NULL;
     }
-    return _ARET(PyArray_MatrixProduct2(a, v, o));
+    return _ARET(PyArray_MatrixProduct3(a, v, o));
 }
 
 static PyObject *

From c6c62e7ec48c18fc6be6989c88fc1df5a836c00b Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <lpc@cmu.edu>
Date: Sat, 8 Jan 2011 16:23:55 -0500
Subject: [PATCH 15/15] BUG: position 221 was taken for PyHalfArrType_Type

---
 numpy/core/code_generators/numpy_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index a031ca36e827..f89aebec318d 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -252,7 +252,7 @@
     'PyArray_TimedeltaToTimedeltaStruct':   218,
     'PyArray_DatetimeStructToDatetime':     219,
     'PyArray_TimedeltaStructToTimedelta':   220,
-    'PyArray_MatrixProduct3':               221,
+    'PyArray_MatrixProduct3':               222,
 }
 
 ufunc_types_api = {