diff --git a/MANIFEST.in b/MANIFEST.in index 6f2e8025f118..e3c3316283a4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -21,3 +21,6 @@ include doc/Makefile doc/postprocess.py recursive-include doc/release * recursive-include doc/source * recursive-include doc/sphinxext * +recursive-include doc/cython * +recursive-include doc/pyrex * +recursive-include doc/swig * diff --git a/distnumpy/__init__.py b/distnumpy/__init__.py new file mode 100644 index 000000000000..b05be908d7bf --- /dev/null +++ b/distnumpy/__init__.py @@ -0,0 +1,21 @@ +""" +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ +""" +from setup import build diff --git a/distnumpy/include/distnumpy.h b/distnumpy/include/distnumpy.h new file mode 100644 index 000000000000..1335be46b21c --- /dev/null +++ b/distnumpy/include/distnumpy.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef DISTNUMPY_H +#define DISTNUMPY_H +#ifdef __cplusplus +extern "C" { +#endif + +//Only import when compiling distnumpymodule.c +#ifdef DISTNUMPY_MODULE +#include "numpy/ndarraytypes.h" +#include "numpy/arrayobject.h" +#endif + +//Flag indicating that it is a distributed array +#define DNPY_DIST 0x2000 +//Flag indicating that it is a distributed array on one node +#define DNPY_DIST_ONENODE 0x4000 + +//Easy attribute retrievals. +#define PyDistArray_WANT_DIST(m) PyArray_CHKFLAGS(m,DNPY_DIST) +#define PyDistArray_WANT_ONENODE(m) PyArray_CHKFLAGS(m,DNPY_DIST_ONENODE) +#define PyDistArray_ARRAY(obj) (((PyArrayObject *)(obj))->distary) + +//Import the API. +#include "distnumpy_api.h" + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DISTNUMPY_H) */ diff --git a/distnumpy/include/distnumpy_api.h b/distnumpy/include/distnumpy_api.h new file mode 100644 index 000000000000..9a1c1818a2f9 --- /dev/null +++ b/distnumpy/include/distnumpy_api.h @@ -0,0 +1,159 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef DISTNUMPY_API_H +#define DISTNUMPY_API_H +#ifdef __cplusplus +extern "C" { +#endif + +/* C API functions */ + +#define PyDistArray_Init_NUM 0 +#define PyDistArray_Init_RETURN int +#define PyDistArray_Init_PROTO (void) + +#define PyDistArray_Exit_NUM 1 +#define PyDistArray_Exit_RETURN void +#define PyDistArray_Exit_PROTO (void) + +#define PyDistArray_MasterSlaveSplit_NUM 2 +#define PyDistArray_MasterSlaveSplit_RETURN PyObject * +#define PyDistArray_MasterSlaveSplit_PROTO (PyObject *self, PyObject *args) + +#define PyDistArray_NewBaseArray_NUM 3 +#define PyDistArray_NewBaseArray_RETURN int +#define PyDistArray_NewBaseArray_PROTO (PyArrayObject *ary, npy_intp one_node_dist_rank) + +#define PyDistArray_DelViewArray_NUM 4 +#define PyDistArray_DelViewArray_RETURN int +#define PyDistArray_DelViewArray_PROTO (PyArrayObject *array) + +#define PyDistArray_GetItem_NUM 5 +#define PyDistArray_GetItem_RETURN int +#define PyDistArray_GetItem_PROTO (PyArrayObject *ary, char *retdata, npy_intp coord[NPY_MAXDIMS]) + +#define PyDistArray_PutItem_NUM 6 +#define PyDistArray_PutItem_RETURN int +#define PyDistArray_PutItem_PROTO (PyArrayObject *ary, npy_intp coord[NPY_MAXDIMS], PyObject *item) + +#define PyDistArray_ProcGridSet_NUM 7 +#define PyDistArray_ProcGridSet_RETURN PyObject * +#define PyDistArray_ProcGridSet_PROTO (PyArrayObject *self, PyObject *args) + +#define PyDistArray_UnDist_NUM 8 +#define PyDistArray_UnDist_RETURN int +#define PyDistArray_UnDist_PROTO (dndarray *ary) + +#define PyDistArray_IsDist_NUM 9 +#define PyDistArray_IsDist_RETURN int +#define PyDistArray_IsDist_PROTO (PyArrayObject *ary) + +#define PyDistArray_NewViewArray_NUM 10 +#define PyDistArray_NewViewArray_RETURN int +#define PyDistArray_NewViewArray_PROTO (PyArrayObject *orig_ary, PyArrayObject *new_ary, int nslice, dndslice slice[NPY_MAXDIMS]) + +/* Total number of C API pointers 
*/ +#define DistNumPy_API_pointers 11 + + +#ifdef DISTNUMPY_MODULE +/* This section is used when compiling distnumpymodule.c */ + +static PyDistArray_Init_RETURN PyDistArray_Init PyDistArray_Init_PROTO; +static PyDistArray_Exit_RETURN PyDistArray_Exit PyDistArray_Exit_PROTO; +static PyDistArray_MasterSlaveSplit_RETURN PyDistArray_MasterSlaveSplit PyDistArray_MasterSlaveSplit_PROTO; +static PyDistArray_NewBaseArray_RETURN PyDistArray_NewBaseArray PyDistArray_NewBaseArray_PROTO; +static PyDistArray_DelViewArray_RETURN PyDistArray_DelViewArray PyDistArray_DelViewArray_PROTO; +static PyDistArray_GetItem_RETURN PyDistArray_GetItem PyDistArray_GetItem_PROTO; +static PyDistArray_PutItem_RETURN PyDistArray_PutItem PyDistArray_PutItem_PROTO; +static PyDistArray_ProcGridSet_RETURN PyDistArray_ProcGridSet PyDistArray_ProcGridSet_PROTO; +static PyDistArray_UnDist_RETURN PyDistArray_UnDist PyDistArray_UnDist_PROTO; +static PyDistArray_IsDist_RETURN PyDistArray_IsDist PyDistArray_IsDist_PROTO; +static PyDistArray_NewViewArray_RETURN PyDistArray_NewViewArray PyDistArray_NewViewArray_PROTO; + +#else +/* This section is used in modules that use distnumpy's API */ + +static void **DistNumPy_API; + +#define PyDistArray_Init \ + (*(PyDistArray_Init_RETURN (*)PyDistArray_Init_PROTO) DistNumPy_API[PyDistArray_Init_NUM]) + +#define PyDistArray_Exit \ + (*(PyDistArray_Exit_RETURN (*)PyDistArray_Exit_PROTO) DistNumPy_API[PyDistArray_Exit_NUM]) + +#define PyDistArray_MasterSlaveSplit \ + (*(PyDistArray_MasterSlaveSplit_RETURN (*)PyDistArray_MasterSlaveSplit_PROTO) DistNumPy_API[PyDistArray_MasterSlaveSplit_NUM]) + +#define PyDistArray_NewBaseArray \ + (*(PyDistArray_NewBaseArray_RETURN (*)PyDistArray_NewBaseArray_PROTO) DistNumPy_API[PyDistArray_NewBaseArray_NUM]) + +#define PyDistArray_DelViewArray \ + (*(PyDistArray_DelViewArray_RETURN (*)PyDistArray_DelViewArray_PROTO) DistNumPy_API[PyDistArray_DelViewArray_NUM]) + +#define PyDistArray_GetItem \ + (*(PyDistArray_GetItem_RETURN 
(*)PyDistArray_GetItem_PROTO) DistNumPy_API[PyDistArray_GetItem_NUM]) + +#define PyDistArray_PutItem \ + (*(PyDistArray_PutItem_RETURN (*)PyDistArray_PutItem_PROTO) DistNumPy_API[PyDistArray_PutItem_NUM]) + +#define PyDistArray_ProcGridSet \ + (*(PyDistArray_ProcGridSet_RETURN (*)PyDistArray_ProcGridSet_PROTO) DistNumPy_API[PyDistArray_ProcGridSet_NUM]) + +#define PyDistArray_UnDist \ + (*(PyDistArray_UnDist_RETURN (*)PyDistArray_UnDist_PROTO) DistNumPy_API[PyDistArray_UnDist_NUM]) + + #define PyDistArray_IsDist \ + (*(PyDistArray_IsDist_RETURN (*)PyDistArray_IsDist_PROTO) DistNumPy_API[PyDistArray_IsDist_NUM]) + +#define PyDistArray_NewViewArray \ + (*(PyDistArray_NewViewArray_RETURN (*)PyDistArray_NewViewArray_PROTO) DistNumPy_API[PyDistArray_NewViewArray_NUM]) + +/* Return -1 and set exception on error, 0 on success. */ +static int +import_distnumpy(void) +{ + PyObject *c_api_object; + PyObject *module; + + module = PyImport_ImportModule("distnumpy"); + if (module == NULL) + return -1; + + c_api_object = PyObject_GetAttrString(module, "_C_API"); + if (c_api_object == NULL) { + Py_DECREF(module); + return -1; + } + if (PyCObject_Check(c_api_object)) + DistNumPy_API = (void **)PyCObject_AsVoidPtr(c_api_object); + + Py_DECREF(c_api_object); + Py_DECREF(module); + return 0; +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DISTNUMPY_API_H) */ diff --git a/distnumpy/include/distnumpy_priv.h b/distnumpy/include/distnumpy_priv.h new file mode 100644 index 000000000000..ef99b858f18b --- /dev/null +++ b/distnumpy/include/distnumpy_priv.h @@ -0,0 +1,217 @@ +#ifndef DISTNUMPY_PRIV_H +#define DISTNUMPY_PRIV_H +#include "mpi.h" +#include +#include "distnumpy_types.h" +#include "numpy/ufuncobject.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DISTNUMPY_DEBUG +//#define DNPY_STATISTICS +//#define DNDY_TIME +//#define DNPY_SPMD + +//Minimum jobsize for an OpenMP thread. >blocksize means no OpenMP. 
+#define DNPY_MIN_THREAD_JOBSIZE 10 + +//Maximum message size (in bytes) +#define DNPY_MAX_MSG_SIZE 1024*4 + +//Maximum number of memory allocations in the memory pool. +#define DNPY_MAX_MEM_POOL 10 + +//Maximum number of view block operations in the sub-view-block DAG. +#define DNPY_MAX_VB_IN_SVB_DAG 100 + +//Disable Lazy Evaluation by definding this macro. +#undef DNPY_NO_LAZY_EVAL + +//Maximum number of allocated arrays +#define DNPY_MAX_NARRAYS 1024 + +//Maximum number of operation merged together. +#define DNPY_MAX_OP_MERGES 10 + +//Default blocksize +#define DNPY_BLOCKSIZE 2 + +//Maximum number of nodes in the ready queue. +#define DNPY_RDY_QUEUE_MAXSIZE 1024*10 + +//Maximum MPI tag. +#define DNPY_MAX_MPI_TAG 1048576 + +//The maximum size of the work buffer in bytes (should be power of 2). +#define DNPY_WORK_BUFFER_MAXSIZE 536870912 //½GB + +//The work buffer memory alignment. +#define DNPY_WORK_BUFFER_MEM_ALIGNMENT 32 + +//Operation types +enum opt {DNPY_MSG_END, DNPY_CREATE_ARRAY, DNPY_DESTROY_ARRAY, + DNPY_CREATE_VIEW, DNPY_SHUTDOWN, DNPY_PUT_ITEM, DNPY_GET_ITEM, + DNPY_UFUNC, DNPY_UFUNC_REDUCE, DNPY_ZEROFILL, DNPY_DATAFILL, + DNPY_DATADUMP, DNPY_DIAGONAL, DNPY_MATMUL, + DNPY_RECV, DNPY_SEND, DNPY_BUF_RECV, DNPY_BUF_SEND, DNPY_APPLY, + DNPY_EVALFLUSH, DNPY_READ, DNPY_WRITE, DNPY_COMM, DNPY_NONCOMM, + DNPY_REDUCE_SEND, DNPY_REDUCE_RECV, DNPY_INIT_BLOCKSIZE, + DNPY_TIME_RESET, DNPY_TIME_GETDICT, DNPY_INIT_PGRID, + DNPY_COPY_INTO, DNPY_UNDIST}; + + +//Macro that increases the work buffer pointer. +#define WORKBUF_INC(bytes_taken) \ +{ \ + workbuf_nextfree += bytes_taken; \ + workbuf_nextfree += DNPY_WORK_BUFFER_MEM_ALIGNMENT - \ + (((npy_intp)workbuf_nextfree) \ + % DNPY_WORK_BUFFER_MEM_ALIGNMENT); \ + if(workbuf_nextfree >= workbuf_max) \ + { \ + fprintf(stderr, "Work buffer overflow - increase the maximum " \ + "work buffer size or decrease the maximum DAG size. 
" \ + "The current values are %dMB and %d nodes," \ + "respectively.\n", DNPY_WORK_BUFFER_MAXSIZE / 1048576, \ + DNPY_MAX_VB_IN_SVB_DAG); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } \ + assert(((npy_intp) workbuf_nextfree) % \ + DNPY_WORK_BUFFER_MEM_ALIGNMENT == 0); \ +} + + +//Variables for statistics. +#ifdef DNPY_STATISTICS + static int node_uid_count = 0; + static int op_uid_count = 0; +#endif + + +//The Super-type of a operation. +//refcount - number of dependency nodes in the svb DAG. +//op - the operation, e.g. DNPY_RECV and DNPY_UFUNC. +//optype - the operation type, e.g. DNPY_COMM/_NONCOMM. +//narys & views - list of array views involved. +//svbs - list of sub-view-blocks involved (one per array), +// NULL when whole arrays are involved. +//accesstype - access type e.g. DNPY_READ (one per array) +//uid - unique identification - only used for statistics. +#define DNDOP_HEAD_BASE \ + npy_intp refcount; \ + char op; \ + char optype; \ + char narys; \ + dndview *views[NPY_MAXARGS]; \ + dndsvb *svbs[NPY_MAXARGS]; \ + char accesstypes[NPY_MAXARGS]; +#ifdef DNPY_STATISTICS + #define DNDOP_HEAD DNDOP_HEAD_BASE npy_intp uid; +#else + #define DNDOP_HEAD DNDOP_HEAD_BASE +#endif +typedef struct dndop_struct dndop; +struct dndop_struct {DNDOP_HEAD}; + +//Type describing a communication DAG node. +typedef struct +{ + DNDOP_HEAD + //The MPI tag used for the communication. + npy_intp mpi_tag; + //The MPI rank of the process that is the remote communication peer. + int remote_rank; +} dndop_comm; + +//Type describing an apply-sub-view-block, which is a subsection of a +//sub-view-block that is used in apply. +typedef struct +{ + npy_intp dims[NPY_MAXDIMS]; + npy_intp stride[NPY_MAXDIMS]; + npy_intp offset; +} dndasvb; + +//Type describing a universal function DAG node. +typedef struct +{ + DNDOP_HEAD + //List of apply-sub-view-block. + dndasvb asvb[NPY_MAXARGS]; + //Number of output array views. 
+ char nout; + //The operation described as a function, a data and a Python pointer. + PyUFuncGenericFunction func; + void *funcdata; + PyObject *PyOp; +} dndop_ufunc; + +//Type describing a DAG node. +struct dndnode_struct +{ + //The operation associated with this dependency. + dndop *op; + //The index to use when accessing op->views[] and op->svbs[]. + int op_ary_idx; + //Next node in the dependency list. + dndnode *next; + //Unique identification used for statistics. + #ifdef DNPY_STATISTICS + npy_intp uid; + #endif +}; + +//MPI process variables. +static int myrank, worldsize; +static npy_intp blocksize; +#ifndef DNPY_SPMD +static npy_intp msg[DNPY_MAX_MSG_SIZE]; +#endif +static npy_intp initmsg_not_handled=1; +//The work buffer and its next free slot. +static void *workbuf; +static void *workbuf_nextfree; +static void *workbuf_max; +//Unique identification counter +static npy_intp uid_count=0; +//Cartesian dimension information - one for every dimension-order. +static int *cart_dim_strides[NPY_MAXDIMS]; +static int *cart_dim_sizes[NPY_MAXDIMS]; +//Pointer to the python module who has the ufunc operators. +static PyObject *ufunc_module; +//The ready queue for operations and its current size. +static dndop *ready_queue[DNPY_RDY_QUEUE_MAXSIZE]; +static npy_intp ready_queue_size=0; +//Unique MPI tag. +static int mpi_tag=0; +//Pointer to the PyUFunc_Reduce function in umath_ufunc_object.inc +typedef PyObject* (reduce_func_type)(PyUFuncObject *self, + PyArrayObject *arr, + PyArrayObject *out, + int axis, int otype, + void *threadlock); +static reduce_func_type *reduce_func = NULL; + + +//Variables for timing. 
+struct timeval tv; +struct timezone tz; +static dndtime dndt; +unsigned long long totaldelta; + +#define DNDTIME(output) \ + gettimeofday(&tv, &tz); \ + output = (unsigned long long) tv.tv_usec + \ + (unsigned long long) tv.tv_sec * 1000000; +#define DNDTIME_SUM(in,sum) \ + gettimeofday(&tv, &tz); \ + sum += ((unsigned long long) tv.tv_usec + \ + (unsigned long long) tv.tv_sec * 1000000) - in; + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DISTNUMPY_PRIV_H) */ diff --git a/distnumpy/include/distnumpy_prototypes.h b/distnumpy/include/distnumpy_prototypes.h new file mode 100644 index 000000000000..fa7ab34912fe --- /dev/null +++ b/distnumpy/include/distnumpy_prototypes.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef DISTNUMPY_PROTOTYPES_H +#define DISTNUMPY_PROTOTYPES_H +#ifdef __cplusplus +extern "C" { +#endif + + +//Datatype prototypes. 
+typedef struct dndnode_struct dndnode; +typedef struct dndarray_struct dndarray; +typedef struct dndview_struct dndview; + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DISTNUMPY_PROTOTYPES_H) */ diff --git a/distnumpy/include/distnumpy_types.h b/distnumpy/include/distnumpy_types.h new file mode 100644 index 000000000000..2f6d1a264fed --- /dev/null +++ b/distnumpy/include/distnumpy_types.h @@ -0,0 +1,213 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef DISTNUMPY_TYPES_H +#define DISTNUMPY_TYPES_H +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +//Type describing a distributed array. +struct dndarray_struct +{ + //Unique identification. + npy_intp uid; + //Reference count. + int refcount; + //Number of dimensions. + int ndims; + //Size of dimensions. + npy_intp dims[NPY_MAXDIMS]; + //Size of block-dimensions. + npy_intp blockdims[NPY_MAXDIMS]; + //Number of blocks (global). + npy_intp nblocks; + //Data type of elements in array. + int dtype; + //Size of an element in bytes. + int elsize; + //Pointer to local data. + char *data; + //Number of elements (global). + npy_intp nelem; + //Number of local elements (local to the MPI-process). + npy_intp localsize; + //Size of local dimensions (local to the MPI-process). 
+ npy_intp localdims[NPY_MAXDIMS]; + //Size of local block-dimensions (local to the MPI-process). + npy_intp localblockdims[NPY_MAXDIMS]; + //MPI-datatype that correspond to an array element. + MPI_Datatype mpi_dtype; + //Root nodes (one per block). + dndnode **rootnodes; + //Next and prev are used for traversing all arrays. + dndarray *next; + dndarray *prev; + //When onerank is positiv this array is only distributed on that + //MPI-process rank. + npy_intp onerank; + //Flag indicating whether the array is distributed or not. + int isdist; + //Memory protected start address (incl.). + npy_intp mprotected_start; + //memory protected end address (excl.). + npy_intp mprotected_end; + //Pointer to the NumPy array that created this dndarray. + PyArrayObject *pyary; +}; + +//dndslice constants. +#define PseudoIndex -1//Adds a extra 1-dim - 'A[1,newaxis]' +#define RubberIndex -2//A[1,2,...] (Not used in DistNumPy) +#define SingleIndex -3//Dim not visible - 'A[1]' + +//Type describing a slice of a dimension. +typedef struct +{ + //Start index. + npy_intp start; + //Elements between index. + npy_intp step; + //Number of steps (Length of the dimension). + npy_intp nsteps; +} dndslice; + +//View-alteration flags. +#define DNPY_NDIMS 0x001 +#define DNPY_STEP 0x002 +#define DNPY_NSTEPS 0x004 +#define DNPY_NONALIGNED 0x008 + +//Type describing a view of a distributed array. +struct dndview_struct +{ + //Unique identification. + npy_intp uid; + //The array this view is a view of. + dndarray *base; + //Number of viewable dimensions. + int ndims; + //Number of sliceses. NB: nslice >= base->ndims. + int nslice; + //Sliceses - the global view of the base-array. + dndslice slice[NPY_MAXDIMS]; + //A bit mask specifying which alterations this view represents. + //Possible flags: + //Zero - no alterations. + //DNPY_NDIMS - number of dimensions altered. + //DNPY_STEP - 'step' altered. + //DNPY_NSTEPS - 'nsteps' altered. + //DNPY_NONALIGNED - 'start % blocksize != 0' or 'step != 1'. 
+ int alterations; + //Number of view-blocks. + npy_intp nblocks; + //Number of view-blocks in each viewable dimension. + npy_intp blockdims[NPY_MAXDIMS]; +}; + + +//Type describing a sub-section of a view block. +typedef struct +{ + //The rank of the MPI-process that owns this sub-block. + int rank; + //Start index (one per base-dimension). + npy_intp start[NPY_MAXDIMS]; + //Number of elements (one per base-dimension). + npy_intp nsteps[NPY_MAXDIMS]; + //Number of elements to next dimension (one per base-dimension). + npy_intp stride[NPY_MAXDIMS]; + //The MPI communication offset (in bytes). + npy_intp comm_offset; + //Number of elements in this sub-view-block. + npy_intp nelem; + //This sub-view-block's root node. + dndnode **rootnode; + //Pointer to data. NULL if data needs to be fetched. + char *data; + //The rank of the MPI process that have received this svb. + //A negative value means that nothing has been received. + int comm_received_by; +} dndsvb; + +//Type describing a view block. +typedef struct +{ + //The id of the view block. + npy_intp uid; + //All sub-view-blocks in this view block (Row-major). + dndsvb *sub; + //Number of sub-view-blocks. + npy_intp nsub; + //Number of sub-view-blocks in each dimension. + npy_intp svbdims[NPY_MAXDIMS]; +} dndvb; + +//PyObject for the block iterator. +typedef struct +{ + PyObject_HEAD + //The view that is iterated. + dndview *view; + //Current block coordinate. + npy_intp curblock[NPY_MAXDIMS]; + //Slice of the blocks in the iterator. + dndslice slice[NPY_MAXDIMS]; + //Strides for the Python array object. + npy_intp strides[NPY_MAXDIMS]; + //Dimensions for the Python array object. + npy_intp dims[NPY_MAXDIMS]; +} dndblock_iter; + +//Type describing the timing data. 
+typedef struct +{ + unsigned long long total; + unsigned long long dag_svb_flush; + unsigned long long dag_svb_rm; + unsigned long long apply_ufunc; + unsigned long long ufunc_comm; + unsigned long long comm_init; + unsigned long long arydata_free; + unsigned long long reduce_1d; + unsigned long long reduce_nd; + unsigned long long reduce_nd_apply; + unsigned long long zerofill; + unsigned long long ufunc_svb; + unsigned long long dag_svb_add; + unsigned long long calc_vblock; + unsigned long long arydata_malloc; + unsigned long long msg2slaves; + unsigned long long final_barrier; + npy_intp mem_reused; + npy_intp nconnect; + npy_intp nconnect_max; + npy_intp napply; + npy_intp nflush; +} dndtime; + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DISTNUMPY_TYPES_H) */ diff --git a/distnumpy/setup.py b/distnumpy/setup.py new file mode 100644 index 000000000000..ffcfaefcd7f4 --- /dev/null +++ b/distnumpy/setup.py @@ -0,0 +1,55 @@ +""" +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . 
+ */ +""" + +from distutils.core import setup, Extension +from os.path import join + +def build(build_path): + print "build_path: ", build_path + setup(name='DistNumPy', + version='1.0', + ext_modules=[Extension(name='distnumpymodule', + sources=[join('distnumpy','src','distnumpymodule.c')], + include_dirs=[join('distnumpy','include'), + join('distnumpy','private'), + join('numpy','core','include'), + join(build_path, 'numpy','core','include','numpy')], + extra_compile_args=[], + extra_link_args=[], + depends=[join('distnumpy','src','helpers.c'), + join('distnumpy','src','helpers.h'), + join('distnumpy','src','array_database.c'), + join('distnumpy','src','array_database.h'), + join('distnumpy','src','arrayobject.c'), + join('distnumpy','src','arrayobject.h'), + join('distnumpy','src','dependency_system.c'), + join('distnumpy','src','dependency_system.h'), + join('distnumpy','src','process_grid.c'), + join('distnumpy','src','process_grid.h'), + join('distnumpy','src','arraydata.c'), + join('distnumpy','src','arraydata.h'), + join('distnumpy','src','memory.c'), + join('distnumpy','src','memory.h')] + )], + ) + +if __name__ == '__main__': + print('This is the wrong setup.py file to run') diff --git a/distnumpy/src/array_database.c b/distnumpy/src/array_database.c new file mode 100644 index 000000000000..3b49e3d2e90f --- /dev/null +++ b/distnumpy/src/array_database.c @@ -0,0 +1,157 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DistNumPy. If not, see .
+ */
+
+//Array-bases belonging to local MPI process
+static dndarray dndarrays[DNPY_MAX_NARRAYS];
+static npy_intp dndarrays_uid[DNPY_MAX_NARRAYS];
+
+//Array-views belonging to local MPI process
+static dndview dndviews[DNPY_MAX_NARRAYS];
+static npy_intp dndviews_uid[DNPY_MAX_NARRAYS];
+
+/*===================================================================
+ *
+ * Put, get & remove array from the local array database.
+ */
+dndarray *get_dndarray(npy_intp uid)
+{
+    npy_intp i;
+    if(uid)
+        for(i=0; i < DNPY_MAX_NARRAYS; i++)
+            if(dndarrays_uid[i] == uid)
+                return &dndarrays[i];
+    fprintf(stderr, "get_dndarray, uid %ld does not exist\n", (long) uid);
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return NULL;
+}
+dndarray *put_dndarray(dndarray *ary)
+{
+    npy_intp i;
+
+    for(i=0; i < DNPY_MAX_NARRAYS; i++)
+        if(dndarrays_uid[i] == 0)
+        {
+            memcpy(&dndarrays[i], ary, sizeof(dndarray));
+            dndarrays_uid[i] = ary->uid;
+            return &dndarrays[i];
+        }
+    fprintf(stderr, "put_dndarray, MAX_NARRAYS is exceeded\n");
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return NULL;
+}
+void rm_dndarray(npy_intp uid)
+{
+    npy_intp i;
+    if(uid)
+        for(i=0; i < DNPY_MAX_NARRAYS; i++)
+            if(dndarrays_uid[i] == uid)
+            {
+                dndarray *ary = &dndarrays[i];
+                //Cleanup base.
+                dndarrays_uid[i] = 0;
+                //Remove the array from the linked list.
+                if(ary->next != NULL)
+                    ary->next->prev = ary->prev;
+                if(ary->prev != NULL)
+                    ary->prev->next = ary->next;
+                else
+                    rootarray = ary->next;
+
+                MPI_Type_free(&ary->mpi_dtype);
+                if(ary->data != NULL)
+                {
+                    mem_pool_put((dndmem*) (ary->data-sizeof(dndmem)));
+                }
+                free(ary->rootnodes);
+                --ndndarrays;
+                assert(ndndarrays >= 0);
+                return;
+            }
+    fprintf(stderr, "rm_dndarray, uid %ld does not exist\n", (long)uid);
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return;
+}/* Put, get & rm dndarray */
+
+/*===================================================================
+ *
+ * Put, get & remove views from the local array database.
+ */
+dndview *get_dndview(npy_intp uid)
+{
+    npy_intp i;
+    if(uid)
+        for(i=0; i < DNPY_MAX_NARRAYS; i++)
+            if(dndviews_uid[i] == uid)
+                return &dndviews[i];
+    fprintf(stderr, "get_dndview, uid %ld does not exist\n", (long) uid);
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return NULL;
+}
+dndview *put_dndview(dndview *view)
+{
+    npy_intp i;
+
+    for(i=0; i < DNPY_MAX_NARRAYS; i++)
+        if(dndviews_uid[i] == 0)
+        {
+            memcpy(&dndviews[i], view, sizeof(dndview));
+            dndviews_uid[i] = view->uid;
+            return &dndviews[i];
+        }
+    fprintf(stderr, "put_dndview, MAX_NARRAYS is exceeded\n");
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return NULL;
+}
+//NB: rm_dndview will also free memory allocated for the dndarray
+//if it is the last reference to the dndarray.
+void rm_dndview(npy_intp uid)
+{
+    npy_intp i;
+    if(uid)
+        for(i=0; i < DNPY_MAX_NARRAYS; i++)
+            if(dndviews_uid[i] == uid)
+            {
+                dndview *view = &dndviews[i];
+                //Cleanup base.
+                dndviews_uid[i] = 0;
+                if(view->base->ndims == 0)//Dummy Scalar.
+                {
+                    //Remove the array from the linked list.
+                    if(view->base->next != NULL)
+                        view->base->next->prev = view->base->prev;
+                    if(view->base->prev != NULL)
+                        view->base->prev->next = view->base->next;
+                    else
+                        rootarray = view->base->next;
+                    free(view->base->rootnodes);
+                    free(view->base->data);
+                    free(view->base);
+                }
+                else if(--view->base->refcount == 0)
+                {
+                    //Remove the array.
+                    rm_dndarray(view->base->uid);
+                }
+                return;
+            }
+    fprintf(stderr, "rm_dndview, uid %ld does not exist\n", (long)uid);
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return;
+}/* Put, get & rm dndview */
+
diff --git a/distnumpy/src/array_database.h b/distnumpy/src/array_database.h
new file mode 100644
index 000000000000..5de3d1742a41
--- /dev/null
+++ b/distnumpy/src/array_database.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011 Mads R. B. Kristensen
+ *
+ * This file is part of DistNumPy .
+ *
+ * DistNumPy is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * DistNumPy is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DistNumPy. If not, see .
+ */
+
+/*
+ * There is a local array database on each MPI-process.
+ * The database consists of all array-views distributed.
+ */
+
+
+#ifndef ARRAY_DATABASE_H
+#define ARRAY_DATABASE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+//Current number of dndarrays allocated.
+npy_intp ndndarrays=0;
+//Root when traversing all arrays.
+dndarray *rootarray = NULL;
+
+/*===================================================================
+ *
+ * Put, get & remove arrays from the local array database.
+ */
+dndarray *get_dndarray(npy_intp uid);
+dndarray *put_dndarray(dndarray *ary);
+void rm_dndarray(npy_intp uid);
+
+/*===================================================================
+ *
+ * Put, get & remove views from the local array database.
+ */
+dndview *get_dndview(npy_intp uid);
+dndview *put_dndview(dndview *view);
+void rm_dndview(npy_intp uid);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined(ARRAY_DATABASE_H) */
diff --git a/distnumpy/src/arraydata.c b/distnumpy/src/arraydata.c
new file mode 100644
index 000000000000..94aed3d44a10
--- /dev/null
+++ b/distnumpy/src/arraydata.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2011 Mads R. B. Kristensen
+ *
+ * This file is part of DistNumPy .
+ *
+ * DistNumPy is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * DistNumPy is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DistNumPy. If not, see .
+ */
+
+/*
+ * The Array Data Protection handles the event when NumPy or external
+ * libraries access the array data directly. Since DistNumPy distributes
+ * this data, the result of such direct array data access is a
+ * segmentation fault. To handle this access, we allocate protected
+ * memory and make the local array data pointer point to this memory.
+ */
+
+#include
+#include
+#include
+
+/*
+ *===================================================================
+ * Signal handler for SIGSEGV.
+ * Private.
+ */
+static void
+sighandler(int signal_number, siginfo_t *info, void *context)
+{
+    //Iterate through all arrays.
+    dndarray *tary = rootarray;
+    while(tary != NULL)
+    {
+        npy_uintp addr = (npy_uintp)info->si_addr;
+        if(tary->mprotected_start <= addr && addr < tary->mprotected_end)
+            break;
+
+        //Go to the next ary.
+        tary = tary->next;
+    }
+
+    if(tary == NULL)//Normal segfault.
+ { + signal(signal_number, SIG_DFL); + } + else//Segfault triggered by accessing the protected data pointer. + { + printf("Warning - un-distributing array(%ld) because of " + "direct data access(%p).\n", tary->uid, info->si_addr); + PyDistArray_UnDist(tary); + } +} + +/* + *=================================================================== + * Initialization of the Array Data Protection. + */ +int arydat_init(void) +{ + // Install Signal handler + struct sigaction sact; + + sigfillset(&(sact.sa_mask)); + sact.sa_flags = SA_SIGINFO | SA_ONSTACK; + sact.sa_sigaction = sighandler; + sigaction (SIGSEGV, &sact, &sact); + + + return 0; +} /* arydat_init */ + +/* + *=================================================================== + * Finalization of the Array Data Protection. + */ +int arydat_finalize(void) +{ + + return 0; +} /* arydat_finalize */ + +/* + *=================================================================== + * Allocate protected data memory for the 'ary'. + * Return -1 and set exception on error, 0 on success. + */ +int arydat_malloc(PyArrayObject *ary) +{ + dndview *view = PyDistArray_ARRAY(ary); + npy_int size = view->base->nelem * view->base->elsize; + + //Allocate page-size aligned memory. + //The MAP_PRIVATE and MAP_ANONYMOUS flags is not 100% portable. See: + // + void *addr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if(addr == MAP_FAILED) + { + int errsv = errno;//mmap() sets the errno. + PyErr_Format(PyExc_RuntimeError, "The Array Data Protection " + "could not mmap a data region. " + "Returned error code by mmap: %s.", strerror(errsv)); + return -1; + } + + //Protect the memory. + if(mprotect(addr, size, PROT_NONE) == -1) + { + int errsv = errno;//mprotect() sets the errno. + PyErr_Format(PyExc_RuntimeError, "The Array Data Protection " + "could not mmap a data region. " + "Returned error code by mmap: %s.", strerror(errsv)); + return -1; + } + + //Update the ary data pointer. 
+ PyArray_BYTES(ary) = addr; + //We also need to save the start and end address. + view->base->mprotected_start = (npy_uintp)addr; + view->base->mprotected_end = view->base->mprotected_start + size; + + return 0; +}/* arydat_malloc */ + +/* + *=================================================================== + * Free protected memory. + */ +int arydat_free(PyArrayObject *ary) +{ + void *addr = PyArray_DATA(ary); + dndview *view = PyDistArray_ARRAY(ary); + npy_int size = view->base->nelem * view->base->elsize; + + if(munmap(addr, size) == -1) + { + int errsv = errno;//munmmap() sets the errno. + PyErr_Format(PyExc_RuntimeError, "The Array Data Protection " + "could not mummap a data region. " + "Returned error code by mmap: %s.", strerror(errsv)); + return -1; + } + return 0; +} diff --git a/distnumpy/src/arraydata.h b/distnumpy/src/arraydata.h new file mode 100644 index 000000000000..4ef28861998e --- /dev/null +++ b/distnumpy/src/arraydata.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef ARRAYDATA_H +#define ARRAYDATA_H +#ifdef __cplusplus +extern "C" { +#endif + +/* + *=================================================================== + * Initialization of the Array Data Protection. 
+ */ +int arydat_init(void); + +/* + *=================================================================== + * Finalization of the Array Data Protection. + */ +int arydat_finalize(void); + +/* + *=================================================================== + * Allocate data memory for the 'ary'. + * Return -1 and set exception on error, 0 on success. + */ +int arydat_malloc(PyArrayObject *ary); + +/* + *=================================================================== + * Free protected memory. + */ +int arydat_free(PyArrayObject *ary); + + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(ARRAYDATA_H) */ diff --git a/distnumpy/src/arrayobject.c b/distnumpy/src/arrayobject.c new file mode 100644 index 000000000000..20df6e7a8cde --- /dev/null +++ b/distnumpy/src/arrayobject.c @@ -0,0 +1,740 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#include +#include +#include + +/* + *=================================================================== + * Check whether the array distributed or not. 
+ */ +static int +PyDistArray_IsDist(PyArrayObject *ary) +{ + if(PyDistArray_ARRAY(ary) != NULL) + return PyDistArray_ARRAY(ary)->base->isdist; + return 0;//False +}/* PyDistArray_IsDist */ + +/* + *=================================================================== + * Create a new base array and updates the PyArrayObject. + * If 'one_node_dist_rank' is positive it specifies the rank of an + * one-node-distribution. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_NewBaseArray(PyArrayObject *ary, npy_intp one_node_dist_rank) +{ + int i; + + //Make sure that the init message has been handled. + if(initmsg_not_handled) + PyDistArray_ProcGridSet(NULL,NULL); + + //Create dndarray. + dndarray newarray; + newarray.dtype = PyArray_TYPE(ary); + newarray.elsize = PyArray_ITEMSIZE(ary); + newarray.ndims = PyArray_NDIM(ary); + newarray.nelem = PyArray_SIZE(ary); + newarray.isdist = 1; + newarray.refcount = 1; + newarray.onerank = one_node_dist_rank; + for(i=0; ibase->pyary = ary; + + //Protect the original NumPy data pointer. + //This is only done by the Master MPI Process. + return arydat_malloc(ary); +} /* PyDistArray_NewBaseArray */ + + +/*=================================================================== + * + * Handler for PyDistArray_NewBaseArray. + * Return NULL and set exception on error. + * Return a pointer to the new dndview on success. + */ +dndview *handle_NewBaseArray(dndarray *array, dndview *view) +{ + int ndims = array->ndims; + int *cdims = cart_dim_sizes[ndims-1]; + npy_intp i; + int cartcoord[NPY_MAXDIMS]; + + ++ndndarrays; + + //Save array uid. + array->uid = view->uid; + + //Save the new array-base. + dndarray *ary = put_dndarray(array); + view->base = ary; + + //Append the array to the linked list. + ary->prev = NULL; + ary->next = rootarray; + rootarray = ary; + if(ary->next != NULL) + { + assert(ary->next->prev == NULL); + ary->next->prev = rootarray; + } + + //Get cartesian coords. 
+ rank2cart(ndims, myrank, cartcoord); + + //Accumulate the total number of local sizes and save it. + npy_intp localsize = 1; + ary->nblocks = 1; + for(i=0; i < ary->ndims; i++) + { + if(ary->onerank < 0) + ary->localdims[i] = dnumroc(ary->dims[i], blocksize, + cartcoord[i], cdims[i], 0); + else + ary->localdims[i] = (ary->onerank==myrank)?ary->dims[i]:0; + + localsize *= ary->localdims[i]; + ary->localblockdims[i] = ceil(ary->localdims[i] / + (double) blocksize); + ary->blockdims[i] = ceil(ary->dims[i] / (double) blocksize); + ary->nblocks *= ary->blockdims[i]; + } + ary->localsize = localsize; + if(ary->localsize == 0) + { + memset(ary->localdims, 0, ary->ndims * sizeof(npy_intp)); + memset(ary->localblockdims, 0, ary->ndims * sizeof(npy_intp)); + } + if(ary->nblocks == 0) + memset(ary->blockdims, 0, ary->ndims * sizeof(npy_intp)); + + //Allocate the root nodes array. + ary->rootnodes = malloc(ary->nblocks * sizeof(dndnode*)); + if(ary->rootnodes == NULL) + { + PyErr_NoMemory(); + return NULL; + } + for(i=0; inblocks; i++) + ary->rootnodes[i] = NULL; + + //The memory allocation is delayed to the point where it is used. + ary->data = NULL; + + //Create a MPI-datatype that correspond to an array element. + MPI_Type_contiguous(ary->elsize, MPI_BYTE, &ary->mpi_dtype); + MPI_Type_commit(&ary->mpi_dtype); + + //Compute number of blocks. + view->nblocks = 1; + for(i=0; iblockdims[i] = ceil(ary->dims[i] / (double) blocksize); + view->nblocks *= view->blockdims[i]; + } + if(view->nblocks == 0) + memset(view->blockdims, 0, ndims * sizeof(npy_intp)); + + //Save and return the new view. + return put_dndview(view); +} /* handle_NewBaseArray */ + + +/* + *=================================================================== + * Create a new view of an array and updates the PyArrayObject. + * Return -1 and set exception on error, 0 on success. 
+ */ +static int +PyDistArray_NewViewArray(PyArrayObject *orig_ary, PyArrayObject *new_ary, + int nslice, dndslice slice[NPY_MAXDIMS]) +{ + dndview *orgview = PyDistArray_ARRAY(orig_ary); + + //Create new view based on 'org_view' and the 'slice'. + dndview newview; + newview.uid = ++uid_count; + newview.ndims = 0; + newview.alterations = 0; + newview.nblocks = 1; + + //Merging the two views. + int si = 0; //slice index. + int ni = 0; //new index. + int oi = 0; //old index. + int di = 0; //dim index. + while(si < nslice || oi < orgview->nslice) + { + //If we come to the end of the slices, that happens + //if not all dimensions is included in the 'slices', we will + //use the whole dimension. + int vs = (si < nslice)?1:0;//Valid slice. + + //If dimension is invisible we will just copy it to 'newview'. + if(oi < orgview->nslice && + orgview->slice[oi].nsteps == SingleIndex) + { + memcpy(&newview.slice[ni], &orgview->slice[oi], + sizeof(dndslice)); + ni++; oi++; di++; + newview.alterations |= DNPY_NDIMS; + } + //A single index makes the dimension invisible. + else if(vs && slice[si].nsteps == SingleIndex) + { + //If dimension is a Pseudo-dimension then just go to next + //dimension. + if(orgview->slice[oi].nsteps == PseudoIndex) + { + si++; oi++; + } + else + {//Copy single index to 'newview'. + newview.slice[ni].step = 0; + newview.slice[ni].nsteps = SingleIndex; + newview.slice[ni].start = orgview->slice[oi].start + + (vs?slice[si].start:0) * + orgview->slice[oi].step; + si++; ni++; oi++; di++; + newview.alterations |= DNPY_NDIMS; + } + } + //If a extra pseudo index should be added we just copy the + //slice to 'newview'. 
+ else if(vs && slice[si].nsteps == PseudoIndex) + { + memcpy(&newview.slice[ni], &slice[si], sizeof(dndslice)); + ni++; si++; + newview.ndims++; + newview.alterations |= DNPY_NDIMS; + } + else if(orgview->slice[oi].nsteps == PseudoIndex) + { + memcpy(&newview.slice[ni], &orgview->slice[oi], + sizeof(dndslice)); + + if(slice[si].start > 0) + { + //This is a special case where the user indexes the + //PseudoIndex, which is legal and will return []. + newview.nblocks = 0; + } + + ni++; oi++; si++; + newview.ndims++; + newview.alterations |= DNPY_NDIMS; + } + //If no special slices we just merge the two views. + else + { + if(vs) + { + newview.slice[ni].start = orgview->slice[oi].start + + slice[si].start * + orgview->slice[oi].step; + newview.slice[ni].step = slice[si].step * + orgview->slice[oi].step; + newview.slice[ni].nsteps = slice[si].nsteps; + } + else + memcpy(&newview.slice[ni], &orgview->slice[oi], + sizeof(dndslice)); + + if(newview.slice[ni].step > 1) + newview.alterations |= DNPY_STEP | DNPY_NSTEPS; + else if(newview.slice[ni].nsteps < orgview->base->dims[di]) + { + newview.alterations |= DNPY_NSTEPS; + } + newview.ndims++; + si++; ni++; oi++; di++; + } + } + //Save the total number of sliceses for the new view. + newview.nslice = ni; + + //Check if the view is not block alligned + for(si=0; siuid; + memcpy(&msg[2], &newview, sizeof(dndview)); + *(((char *) &msg[2])+sizeof(dndview)) = DNPY_MSG_END; + + msg2slaves(msg, 3 * sizeof(npy_intp) + sizeof(dndview)); +#endif + + PyDistArray_ARRAY(new_ary) = handle_NewViewArray(orgview, &newview); + + return 0; + +}/* PyDistArray_NewViewArray */ + + +/*=================================================================== + * + * Handler for PyDistArray_NewViewArray. + * Return NULL and set exception on error. + * Return a pointer to the new dndview on success. + */ +dndview *handle_NewViewArray(dndview *orgview, dndview *newview) +{ + npy_intp n, i, j; + + //Add the base to the new view. 
+ assert(orgview->base->refcount > 0); + newview->base = orgview->base; + newview->base->refcount++; + + //Compute size of view-blocks. + if(newview->nblocks == 0)//The view is empty + { + memset(newview->blockdims, 0, newview->ndims * sizeof(npy_intp)); + } + else + { + newview->nblocks = 1; + n=0; + for(i=0; i < newview->nslice; i++) + { + if(newview->slice[i].nsteps != SingleIndex) + { + j = 1;//SingleIndex has length one. + if(newview->slice[i].nsteps != PseudoIndex) + j = newview->slice[i].nsteps; + + newview->blockdims[n] = ceil(j / (double) blocksize); + newview->nblocks *= newview->blockdims[n]; + n++; + } + } + } + //Save and return the view. + return put_dndview(newview); +}/* handle_NewViewArray */ + +/* + *=================================================================== + * Delete array view. + * When it is the last view of the base array, the base array is de- + * allocated. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_DelViewArray(PyArrayObject *array) +{ + //Get arrray structs. + dndview *ary = PyDistArray_ARRAY(array); + +#ifndef DNPY_SPMD + //Tell slaves about the destruction + msg[0] = DNPY_DESTROY_ARRAY; + msg[1] = ary->uid; + msg[2] = DNPY_MSG_END; + msg2slaves(msg,3 * sizeof(npy_intp)); +#endif + + if(handle_DelViewArray(ary->uid) == -1) + return -1; + + //We have to free the protected data pointer when the NumPy array + //is not a view. + if((array->flags & NPY_OWNDATA) && array->data != NULL) + return arydat_free(array); + + return 0; + +} /* PyDistArray_DelViewArray */ + +/*=================================================================== + * + * Handler for PyDistArray_NewBaseArray. + * Return -1 and set exception on error, 0 on success. 
+ */ +int handle_DelViewArray(npy_intp uid) +{ + dndview *view = get_dndview(uid); + + dndop *op = workbuf_nextfree; + WORKBUF_INC(sizeof(dndop)); + op->op = DNPY_DESTROY_ARRAY; + op->optype = DNPY_NONCOMM; + op->narys = 1; + op->refcount = 0; + op->views[0] = view; + op->svbs[0] = NULL;//Whole array. + op->accesstypes[0] = DNPY_WRITE; + + dndnode *node = workbuf_nextfree; + WORKBUF_INC(sizeof(dndnode)); + node->op = op; + node->op_ary_idx = 0; + dep_add(node, 1, 0); + + return 0; +} /* handle_DelViewArray */ + +/* + *=================================================================== + * Assign the value to array at coordinate. + * 'coord' size must be the same as view->ndims. + * Steals all reference to item. (Item is lost). + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_PutItem(PyArrayObject *ary, npy_intp coord[NPY_MAXDIMS], + PyObject *item) +{ + //Get arrray structs. + dndview *view = PyDistArray_ARRAY(ary); + + //Convert item to a compatible type. + PyObject *item2 = PyArray_FROM_O(item); + PyObject *citem2 = PyArray_Cast((PyArrayObject*)item2, + view->base->dtype); + + //Cleanup and return error if the cast failed. + if(citem2 == NULL) + { + Py_DECREF(item2); + return -1; + } + +#ifndef DNPY_SPMD + int ndims = view->ndims; + int elsize = view->base->elsize; + //Tell slaves about the new item. + msg[0] = DNPY_PUT_ITEM; + msg[1] = view->uid; + memcpy(&msg[2], PyArray_DATA(citem2), elsize); + memcpy(((char *) &msg[2]) + elsize, coord, + sizeof(npy_intp) * ndims); + *(((char *) &msg[2]) + elsize + sizeof(npy_intp) * ndims) = DNPY_MSG_END; + + msg2slaves(msg, 3 * sizeof(npy_intp) + elsize + + ndims * sizeof(npy_intp)); +#endif + + handle_PutGetItem(1, view, PyArray_DATA(citem2), coord); + + //Clean up. 
+ Py_DECREF(citem2); + Py_DECREF(item2); + + return 0;//Succes +} /* PyDistArray_PutItem */ + +/* + *=================================================================== + * Get a single value specified by coordinate from the array. + * 'coord' size must be the same as view->ndims. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_GetItem(PyArrayObject *ary, char *retdata, + npy_intp coord[NPY_MAXDIMS]) +{ + //Get arrray structs. + dndview *view = PyDistArray_ARRAY(ary); + +#ifndef DNPY_SPMD + //Tell slaves to send item. + msg[0] = DNPY_GET_ITEM; + msg[1] = view->uid; + memcpy(&msg[2], coord, sizeof(npy_intp)*view->ndims); + *(((char *) &msg[2]) + sizeof(npy_intp)*view->ndims) = DNPY_MSG_END; + + msg2slaves(msg, 3*sizeof(npy_intp) + view->ndims*sizeof(npy_intp)); +#endif + + handle_PutGetItem(0, view, retdata, coord); + + return 0; +} /* PyDistArray_GetItem */ + +/*=================================================================== + * + * Handler for PyDistArray_PutItem and PyDistArray_GetItem. + * Direction: 0=Get, 1=Put. + * Return -1 and set exception on error, 0 on success. + */ +int handle_PutGetItem(int Direction, dndview *view, char* item, + npy_intp coord[NPY_MAXDIMS]) +{ + npy_intp i,j,b,offset; + npy_intp n, s; + npy_intp tcoord[NPY_MAXDIMS]; + npy_intp rcoord[NPY_MAXDIMS]; + npy_intp nsteps[NPY_MAXDIMS]; + npy_intp step[NPY_MAXDIMS]; + char *data; + dndvb vblock; + + dep_flush(1); + + //Convert to block coordinates. + for(i=0; indims; i++) + tcoord[i] = coord[i] / blocksize; + + //Get view block info. + calc_vblock(view, tcoord, &vblock); + + //Convert PseudoIndex and SingleIndex. + j=0;n=0; + for(i=0; i < view->nslice; i++) + { + if(view->slice[i].nsteps == PseudoIndex) + { + assert(view->slice[i].start == 0); + n++; + } + else if(view->slice[i].nsteps == SingleIndex) + { + rcoord[j] = 0;//The offset is already incl. in the svb. 
+ nsteps[j] = 1; + step[j] = 1; + j++; + } + else + { + rcoord[j] = coord[n]; + nsteps[j] = view->slice[i].nsteps; + step[j] = view->slice[i].step; + j++; n++; + } + } + + //Convert global coordinate to index coordinates + //relative to the view. + for(i=0; ibase->ndims; i++) + tcoord[i] = rcoord[i] % blocksize; + + //Find sub view block and convert icoord to coordinate + //relative to the sub view block. + s=1;b=0; + for(i=view->base->ndims-1; i>=0; i--)//Row-major. + { + j = vblock.sub[b].nsteps[i]; + while(tcoord[i] >= vblock.sub[b].nsteps[i]) + { + tcoord[i] -= vblock.sub[b].nsteps[i]; + j += vblock.sub[b].nsteps[i]; + b += s; + } + while(j < MIN(blocksize, nsteps[i] - rcoord[i])) + { + j += vblock.sub[b].nsteps[i]; + } + s *= vblock.svbdims[i]; + } + + //Compute offset. + offset = 0; + for(i=view->base->ndims-1; i>=0; i--)//Row-major. + offset += (vblock.sub[b].start[i] + tcoord[i] * step[i]) * + vblock.sub[b].stride[i]; + delayed_array_allocation(view->base); + data = view->base->data + offset*view->base->elsize; + +#ifndef DNPY_SPMD + if(vblock.sub[b].rank == 0)//Local copying. + { + if(myrank == 0) + { + if(Direction) + memcpy(data, item, view->base->elsize); + else + memcpy(item, data, view->base->elsize); + } + } + else if(myrank == 0) + { + if(Direction) + MPI_Ssend(item, 1, view->base->mpi_dtype, vblock.sub[b].rank, + 0, MPI_COMM_WORLD); + else + MPI_Recv(item, 1, view->base->mpi_dtype, vblock.sub[b].rank, + 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + else if(myrank == vblock.sub[b].rank) + { + if(Direction) + MPI_Recv(data, 1, view->base->mpi_dtype, 0, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + else + MPI_Ssend(data, 1, view->base->mpi_dtype, 0, 0, + MPI_COMM_WORLD); + } +#else + if(Direction) + { + if(vblock.sub[b].rank == 0)//Local copying. 
+ { + if(myrank == 0) + memcpy(data, item, view->base->elsize); + } + else if(myrank == 0) + { + MPI_Ssend(item, 1, view->base->mpi_dtype, vblock.sub[b].rank, + 0, MPI_COMM_WORLD); + } + else if(myrank == vblock.sub[b].rank) + { + MPI_Recv(data, 1, view->base->mpi_dtype, 0, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + } + else + { + if(vblock.sub[b].rank == myrank)//Local copying. + memcpy(item, data, view->base->elsize); + + MPI_Bcast(item, view->base->elsize, MPI_BYTE, vblock.sub[b].rank, + MPI_COMM_WORLD); + } +#endif + + dep_flush(1);//Will cleanup the used sub-view-blocks. + + return 0; +} /* handle_PutGetItem */ + + +/*=================================================================== + * + * Un-distributes the array by transferring all data to the master + * MPI-process. + * Return -1 and set exception on error, 0 on success. + */ +int PyDistArray_UnDist(dndarray *ary) +{ + #ifndef DNPY_SPMD + msg[0] = DNPY_UNDIST; + msg[1] = ary->uid; + msg[2] = DNPY_MSG_END; + msg2slaves(msg, 3*sizeof(npy_intp)); + #endif + + if(ary->isdist) + { + //Un-protect the memory. + if(mprotect(PyArray_DATA(ary->pyary), ary->nelem * ary->elsize, + PROT_READ|PROT_WRITE) == -1) + { + int errsv = errno;//mprotect() sets the errno. + PyErr_Format(PyExc_RuntimeError, "PyDistArray_UnDist: " + "could not un-protect a data region. " + "Returned error code by mprotect: %s.", + strerror(errsv)); + return -1; + } + + //Transfer all items to the pyary. + npy_intp coord[NPY_MAXDIMS]; + memset(coord, 0, ary->ndims * sizeof(npy_intp)); + int notfinished = 1; + char *data = PyArray_DATA(ary->pyary); + while(notfinished) + { + PyDistArray_GetItem(ary->pyary, data, coord); + data += ary->elsize; + //Go to next coordinate. + int i; + for(i=ary->ndims-1; i >= 0; i--) + { + coord[i]++; + if(coord[i] >= ary->dims[i]) + { + //We are finished, if wrapping around. + if(i == 0) + { + notfinished = 0; + break; + } + coord[i] = 0;//Start coord. 
+ } + else + break; + } + } + } + ary->isdist = 0;//Not distributed anymore. + return 0; +} /* PyDistArray_UnDist */ + + +/*=================================================================== + * + * Handler for PyDistArray_UnDist. + * Return -1 and set exception on error, 0 on success. + */ +int handle_UnDist(dndarray *ary) +{ + + + return 0; +} /* handle_UnDist */ diff --git a/distnumpy/src/arrayobject.h b/distnumpy/src/arrayobject.h new file mode 100644 index 000000000000..4cdd46c0b207 --- /dev/null +++ b/distnumpy/src/arrayobject.h @@ -0,0 +1,137 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +#ifndef ARRAYOBJECT_H +#define ARRAYOBJECT_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "distnumpy_types.h" + +/* + *=================================================================== + * Check whether the array distributed or not. + */ +static int +PyDistArray_IsDist(PyArrayObject *ary); + +/* + *=================================================================== + * Create a new base array and updates the PyArrayObject. + * If 'one_node_dist_rank' is positive it specifies the rank of an + * one-node-distribution. + * Return -1 and set exception on error, 0 on success. 
+ */ +static int +PyDistArray_NewBaseArray(PyArrayObject *ary, npy_intp one_node_dist_rank); + +/*=================================================================== + * + * Handler for PyDistArray_NewBaseArray. + * Return NULL and set exception on error. + * Return a pointer to the new dndview on success. + */ +dndview *handle_NewBaseArray(dndarray *ary, dndview *view); + +/* + *=================================================================== + * Create a new view of an array and updates the PyArrayObject. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_NewViewArray(PyArrayObject *orig_ary, PyArrayObject *new_ary, + int nslice, dndslice slice[NPY_MAXDIMS]); + +/*=================================================================== + * + * Handler for PyDistArray_NewViewArray. + * Return NULL and set exception on error. + * Return a pointer to the new dndview on success. + */ +dndview *handle_NewViewArray(dndview *orgview, dndview *newview); + +/* + *=================================================================== + * Delete array view. + * When it is the last view of the base array, the base array is de- + * allocated. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_DelViewArray(PyArrayObject *array); + +/*=================================================================== + * + * Handler for PyDistArray_NewBaseArray. + * Return -1 and set exception on error, 0 on success. + */ +int handle_DelViewArray(npy_intp uid); + +/* + *=================================================================== + * Assign the value to array at coordinate. + * 'coord' size must be the same as view->ndims. + * Steals all reference to item. (Item is lost). + * Return -1 and set exception on error, 0 on success. 
+ */ +static int +PyDistArray_PutItem(PyArrayObject *ary, npy_intp coord[NPY_MAXDIMS], + PyObject *item); + +/* + *=================================================================== + * Get a single value specified by coordinate from the array. + * 'coord' size must be the same as view->ndims. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_GetItem(PyArrayObject *ary, char *retdata, + npy_intp coord[NPY_MAXDIMS]); + +/*=================================================================== + * + * Handler for PyDistArray_PutItem and PyDistArray_GetItem. + * Direction: 0=Get, 1=Put. + * Return -1 and set exception on error, 0 on success. + */ +int handle_PutGetItem(int Direction, dndview *view, char* item, + npy_intp coord[NPY_MAXDIMS]); + +/*=================================================================== + * + * Un-distributes the array by transferring all data to the master + * MPI-process. + * Return -1 and set exception on error, 0 on success. + */ +int PyDistArray_UnDist(dndarray *ary); + +/*=================================================================== + * + * Handler for PyDistArray_UnDist. + * Return -1 and set exception on error, 0 on success. + */ +int handle_UnDist(dndarray *ary); + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(ARRAYOBJECT_H) */ diff --git a/distnumpy/src/dependency_system.c b/distnumpy/src/dependency_system.c new file mode 100644 index 000000000000..648ed60d3926 --- /dev/null +++ b/distnumpy/src/dependency_system.c @@ -0,0 +1,574 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * DistNumPy is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with DistNumPy. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/*===================================================================
+ *
+ * Returns True if there is data conflict/overlap.
+ */
+static char dndnode_conflict(const dndop *A, const int Aidx,
+                             const dndop *B, const int Bidx)
+{
+    npy_intp d;
+    char Aaccesstype = A->accesstypes[Aidx];
+    dndarray *Abase = A->views[Aidx]->base;
+    char Baccesstype = B->accesstypes[Bidx];
+    dndarray *Bbase = B->views[Bidx]->base;
+    dndsvb *Asvb = A->svbs[Aidx];
+    dndsvb *Bsvb = B->svbs[Bidx];
+
+    if(Abase->uid == Bbase->uid)
+        if(Aaccesstype == DNPY_WRITE || Baccesstype == DNPY_WRITE)
+        {
+            if(Asvb == NULL || Bsvb == NULL)
+                return 1;//Depend on the whole array.
+
+            char conflict = 1;
+            for(d=0; d<Abase->ndims; d++)
+            {
+                if(Bsvb->start[d] >=
+                   Asvb->start[d] + Asvb->nsteps[d]
+                   ||
+                   Asvb->start[d] >=
+                   Bsvb->start[d] + Bsvb->nsteps[d])
+                {
+                    conflict = 0;
+                    break;//No conflict at the svb level.
+                }
+            }
+            if(conflict)
+                return 1;
+        }
+    return 0;
+}/* dndnode_conflict */
+
+/*===================================================================
+ *
+ * Add the list of nodes to the dependency system.
+ * Note that all nodes in the list must relate to the same operation.
+ */
+void dep_add(dndnode *nodes, int nnodes, int force_laziness)
+{
+    npy_intp i;
+    int n, idx;
+
+    #ifdef DNDY_TIME
+    unsigned long long tdelta;
+    DNDTIME(tdelta);
+    #endif
+
+    //Init values for statistics.
+    #ifdef DNPY_STATISTICS
+    for(n=0; n<nnodes; n++)
+        nodes[n].op->uid = ++op_uid_count;
+    #endif
+    //Handle one node at a time.
+ for(n=0; nop != NULL); + assert(node->op_ary_idx >= 0); + assert(node->op_ary_idx < node->op->narys); + assert(nodes[0].op == node->op); + node->next = NULL; + idx = node->op_ary_idx; + //We append the new node to the linked list and for each + //conflict we increase the refcount of the new node. + if(node->op->svbs[idx] == NULL)//Whole array. + { + dndarray *ary = node->op->views[idx]->base; + for(i=0; inblocks; i++) + { + if(ary->rootnodes[i] == NULL) + ary->rootnodes[i] = node; + else + { + dndnode *tnode = ary->rootnodes[i]; + while(1) + { + if(tnode->op != node->op) + node->op->refcount++; + if(tnode->next == NULL)//We are finished. + break; + tnode = tnode->next;//Go to next node. + } + tnode->next = node; + assert(tnode->next->next == NULL); + } + //Need to clone the new node to get it "spanning" over + //the whole array. + memcpy(workbuf_nextfree, node, sizeof(dndnode)); + node = workbuf_nextfree; + WORKBUF_INC(sizeof(dndnode)); + + #ifdef DNPY_STATISTICS + node->uid = ++node_uid_count; + #endif + } + } + else + { + assert(node->op->svbs[idx]->rootnode != NULL); + dndnode *tnode = *node->op->svbs[idx]->rootnode; + if(tnode == NULL) + *node->op->svbs[idx]->rootnode = node; + else + { + while(1) + { + if(tnode->op != node->op && + dndnode_conflict(node->op, idx, tnode->op, + tnode->op_ary_idx)) + node->op->refcount++; + + if(tnode->next == NULL)//We are finished. + break; + tnode = tnode->next;//Go to next node. + } + tnode->next = node; + } + } + } + + //Place the operation in the ready queue when no dependency was + //found. 
+ if(nodes[0].op->refcount == 0) + { + assert(ready_queue_size+1 <= DNPY_RDY_QUEUE_MAXSIZE); + ready_queue[ready_queue_size++] = nodes[0].op; + } + + assert(ready_queue_size > 0); + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta, dndt.dag_svb_add) + #endif + + #ifdef DNPY_NO_LAZY_EVAL + if(!force_laziness) + dag_svb_flush(0);//Note that we do not free the work buffer + #endif +} /* dep_add */ + + +/*=================================================================== + * + * Removes a operation from the dependency system. + * op2apply is the operations that should be applyed. + * If op2apply is NULL then op2apply is ignored. + * Returns number of operations in op2apply. + */ +npy_intp dep_remove(dndop *op, dndop *op2apply[]) +{ + int j; + npy_intp b, i, nops=1, mnops=DNPY_MAX_OP_MERGES; + + #ifdef DNDY_TIME + unsigned long long tdelta; + DNDTIME(tdelta); + #endif + + if(op2apply != NULL) + op2apply[0] = op; + else + mnops = 1; + + for(i=0; inarys; j++) + { + if(op->svbs[j] == NULL)//Whole array. + { + dndarray *ary = op->views[j]->base; + assert(ary != NULL); + for(b=0; bnblocks; b++) + { + assert(ary->rootnodes[b] != NULL); + while(ary->rootnodes[b] != NULL && + ary->rootnodes[b]->op == op) + ary->rootnodes[b] = ary->rootnodes[b]->next; + + dndnode *n1 = ary->rootnodes[b]; + + //We are finished if the list has become empty. + if(n1 == NULL) + continue; + + //Handle the first node in the list. + if(--n1->op->refcount == 0) + { + if(nops < mnops && n1->op->optype == DNPY_NONCOMM) + op2apply[nops++] = n1->op; + else + ready_queue[ready_queue_size++] = n1->op; + } + //Handle the rest of the nodes in the list. + dndnode *n2 = n1->next; + while(n2 != NULL) + { + assert(n1->next == n2); + if(n2->op == op)//Remove the node. 
+ n1->next = n2->next; + else + { + if(--n2->op->refcount == 0) + { + if(nops < mnops && n2->op->optype == DNPY_NONCOMM) + op2apply[nops++] = n2->op; + else + ready_queue[ready_queue_size++] = n2->op; + } + n1 = n2; + } + n2 = n2->next; + } + } + } + else + { + dndsvb *svb = op->svbs[j]; + while((*svb->rootnode) != NULL && + (*svb->rootnode)->op == op) + *svb->rootnode = (*svb->rootnode)->next; + dndnode *n1 = *svb->rootnode; + + //We are finished if the list has become empty. + if(n1 == NULL) + continue; + + //Handle the first node in the list. + if(dndnode_conflict(op, j, n1->op, n1->op_ary_idx)) + if(--n1->op->refcount == 0) + { + if(nops < mnops && n1->op->optype == DNPY_NONCOMM) + op2apply[nops++] = n1->op; + else + ready_queue[ready_queue_size++] = n1->op; + } + + //Handle the rest of the nodes in the list. + dndnode *n2 = n1->next; + while(n2 != NULL) + { + assert(n1->next == n2); + if(n2->op == op)//Remove the node. + n1->next = n2->next; + else + { + if(dndnode_conflict(op, j, n2->op, n2->op_ary_idx)) + if(--n2->op->refcount == 0) + { + if(nops < mnops && n2->op->optype == DNPY_NONCOMM) + op2apply[nops++] = n2->op; + else + ready_queue[ready_queue_size++] = n2->op; + } + n1 = n2; + } + n2 = n2->next;//Go to next node. + } + } + } + } + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta, dndt.dag_svb_rm) + #endif + + return nops; +}/* dep_remove */ + + +/*=================================================================== + * + * Flush the dependency system. + * Frees the work buffer when 'free_workbuf' is true. 
+ */ +void dep_flush(int free_workbuf) +{ + npy_intp i, j, f, commsize, ncommsize; + int fcomm[DNPY_RDY_QUEUE_MAXSIZE]; + int fcommsize; + MPI_Request reqs[DNPY_RDY_QUEUE_MAXSIZE]; + MPI_Status reqstatus[DNPY_RDY_QUEUE_MAXSIZE]; + dndop *comm[DNPY_RDY_QUEUE_MAXSIZE]; + dndop *ncomm[DNPY_RDY_QUEUE_MAXSIZE]; + MPI_Datatype dtype[DNPY_RDY_QUEUE_MAXSIZE]; + npy_intp dtypesize=0; + + #ifdef DNDY_TIME + ++dndt.nflush; + unsigned long long tdelta; + DNDTIME(tdelta); + #endif + #ifdef DNPY_STATISTICS + dag_svb_dump(); + #endif + + commsize=0; ncommsize=0; + while(ready_queue_size + commsize + ncommsize > 0) + { + #ifdef DNDY_TIME + unsigned long long tdelta2; + DNDTIME(tdelta2); + #endif + + assert(ready_queue_size <= DNPY_RDY_QUEUE_MAXSIZE); + //Sort the queue into two queues - one for communication and + //one for non-communication nodes. + //Furthermore, initiate the communication nodes. + for(i=0; irefcount == 0); + //Init. all communication nodes in the ready queue. + if(ready_queue[i]->optype == DNPY_COMM) + { + dndop_comm *C = (dndop_comm*) ready_queue[i]; + MPI_Datatype comm_dtype; + assert(C->refcount == 0); + switch(C->op) + { + case DNPY_RECV: + assert(C->narys == 1); + comm_dtype = calc_svb_MPIdatatype(C->views[0], + C->svbs[0]); + delayed_array_allocation(C->views[0]->base); + MPI_Irecv(C->views[0]->base->data + + C->svbs[0]->comm_offset, + 1, comm_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + MPI_Type_free(&comm_dtype); + break; + case DNPY_BUF_RECV: + assert(C->narys == 1); + assert(C->svbs[0]->data == NULL); + C->svbs[0]->data = workbuf_nextfree; + WORKBUF_INC(C->svbs[0]->nelem * + C->views[0]->base->elsize); + assert(C->svbs[0]->data != NULL); + MPI_Irecv(C->svbs[0]->data, C->svbs[0]->nelem, + C->views[0]->base->mpi_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + break; + case DNPY_SEND: + assert(C->narys == 1); + comm_dtype = calc_svb_MPIdatatype(C->views[0], + C->svbs[0]); + 
delayed_array_allocation(C->views[0]->base); + MPI_Isend(C->views[0]->base->data + + C->svbs[0]->comm_offset, + 1, comm_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + dtype[dtypesize++] = comm_dtype; + //At the moment we have to delay this freeing to + //the end of the flush. I’m not sure if this is + //a bug in DistNumPy or the MPICH-2 implementa- + //tion. + //MPI_Type_free(&comm_dtype); + break; + case DNPY_BUF_SEND: + assert(C->narys == 1); + assert(C->svbs[0]->data != NULL); + MPI_Isend(C->svbs[0]->data, C->svbs[0]->nelem, + C->views[0]->base->mpi_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + break; + case DNPY_REDUCE_SEND: + assert(C->narys == 1); + comm_dtype = calc_svb_MPIdatatype(C->views[0], + C->svbs[0]); + assert(C->svbs[0]->data != NULL); + MPI_Isend(C->svbs[0]->data + + C->svbs[0]->comm_offset, + 1, comm_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + dtype[dtypesize++] = comm_dtype; + //At the moment we have to delay this freeing to + //the end of the flush. I’m not sure if this is + //a bug in DistNumPy or the MPICH-2 implementa- + //tion. 
+ //MPI_Type_free(&comm_dtype); + break; + case DNPY_COPY_INTO: + if(C->narys == 1) + { + if(C->accesstypes[0] == DNPY_WRITE) + { + comm_dtype = calc_svb_MPIdatatype(C->views[0], + C->svbs[0]); + delayed_array_allocation(C->views[0]->base); + MPI_Irecv(C->views[0]->base->data + + C->svbs[0]->comm_offset, + 1, comm_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + MPI_Type_free(&comm_dtype); + } + else + { + assert(C->accesstypes[0] == DNPY_READ); + comm_dtype = calc_svb_MPIdatatype(C->views[0], + C->svbs[0]); + delayed_array_allocation(C->views[0]->base); + MPI_Isend(C->views[0]->base->data + + C->svbs[0]->comm_offset, + 1, comm_dtype, + C->remote_rank, C->mpi_tag, + MPI_COMM_WORLD, &reqs[commsize]); + dtype[dtypesize++] = comm_dtype; + //At the moment we have to delay this freeing to + //the end of the flush. I’m not sure if this is + //a bug in DistNumPy or the MPICH-2 implementa- + //tion. + //MPI_Type_free(&comm_dtype); + } + } + break; + default: + fprintf(stderr, "Unknown DAG operation: %s.\n", + optype2str(C->op)); + MPI_Abort(MPI_COMM_WORLD, -1); + } + comm[commsize++] = ready_queue[i]; + } + else + { + assert(ready_queue[i]->optype == DNPY_NONCOMM); + ncomm[ncommsize++] = ready_queue[i]; + } + } + //The ready queue is now empty. + ready_queue_size = 0; + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta2, dndt.comm_init) + #endif + + //Apply one non-communication node and move new non-depend + //nodes to the ready queue. + //Instead of moving new non-depended non-communication nodes + //to the ready queue they are directly applied. + if(ncommsize > 0) + { + dndop *op = ncomm[--ncommsize];//Using a FILO order. + dndop **op2apply = workbuf_nextfree; + npy_intp nops = dep_remove(op, op2apply); + npy_intp cur_op; + npy_intp mreserved = nops * DNPY_WORK_BUFFER_MEM_ALIGNMENT; + WORKBUF_INC(mreserved);//Reserve memory. 
+ + for(cur_op=0; cur_op < nops; cur_op++) + { + op = op2apply[cur_op]; + switch(op->op) + { + case DNPY_UFUNC: + //apply_ufunc((dndop_ufunc*)op); + Py_DECREF(((dndop_ufunc*)op)->PyOp); + break; + case DNPY_DESTROY_ARRAY: + assert(op->narys == 1); + assert(op->views[0] != NULL); + rm_dndview(op->views[0]->uid); + break; + default: + fprintf(stderr, "Unknown DAG operation: %s.\n", + optype2str(op->op)); + MPI_Abort(MPI_COMM_WORLD, -1); + } + } + workbuf_nextfree -= mreserved;//Unreserve memory. + + ++dndt.napply; + dndt.nconnect += commsize; + dndt.nconnect_max = MAX(dndt.nconnect_max, commsize); + } + //Test for ready communication and possibly move new non-depend + //nodes to the ready queue. Furthermore, if there is nothing + //else to do (no operations that are ready) we wait. + if(commsize > 0) + { + #ifdef DNDY_TIME + unsigned long long tdelta2; + DNDTIME(tdelta2); + #endif + + if(ncommsize > 0) + { + MPI_Testsome(commsize, reqs, &fcommsize, fcomm, + reqstatus); + assert(fcommsize != MPI_UNDEFINED); + } + else if(ready_queue_size == 0) + { + MPI_Waitsome(commsize, reqs, &fcommsize, fcomm, + reqstatus); + assert(fcommsize != MPI_UNDEFINED); + } + else + fcommsize = 0; + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta2, dndt.ufunc_comm) + #endif + + for(f=0; f + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . 
+ */ + +#ifndef DEPENDENCY_SYSTEM_H +#define DEPENDENCY_SYSTEM_H +#ifdef __cplusplus +extern "C" { +#endif + +/*=================================================================== + * + * Add the list of node to the dependency system. + * Node that all nodes in the list must relate to the same operation. + */ +void dep_add(dndnode *nodes, int nnodes, int force_laziness); + +/*=================================================================== + * + * Removes a operation from the dependency system. + * op2apply is the operations that should be applyed. + * If op2apply is NULL then op2apply is ignored. + * Returns number of operations in op2apply. + */ +npy_intp dep_remove(dndop *op, dndop *op2apply[]); + +/*=================================================================== + * + * Flush the dependency system. + * Frees the work buffer when 'free_workbuf' is true. + */ +void dep_flush(int free_workbuf); + + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(DEPENDENCY_SYSTEM_H) */ diff --git a/distnumpy/src/distnumpymodule.c b/distnumpy/src/distnumpymodule.c new file mode 100644 index 000000000000..e23211efdb63 --- /dev/null +++ b/distnumpy/src/distnumpymodule.c @@ -0,0 +1,362 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . 
+ */ + +#include +#define DISTNUMPY_MODULE +#include "distnumpy.h" +//Tells numpy that this file initiate the module. +#define PY_ARRAY_UNIQUE_SYMBOL DISTNUMPY_ARRAY_API +#include "numpy/arrayobject.h" +#include "distnumpy_priv.h" +#include + +//We include all .h and .c files. +//NumPy distutil complains when having multiple module files. +#include "helpers.h" +#include "array_database.h" +#include "memory.h" +#include "arrayobject.h" +#include "dependency_system.h" +#include "process_grid.h" +#include "arraydata.h" +#include "helpers.c" +#include "array_database.c" +#include "memory.c" +#include "arrayobject.c" +#include "dependency_system.c" +#include "process_grid.c" +#include "arraydata.c" + +/* + * =================================================================== + * Initialization of distnumpy. + * Return -1 and set exception on error, 0 on success. + */ +static int +PyDistArray_Init(void) +{ + int provided; + int flag; + int i; + + //Make sure we only initialize once. + MPI_Initialized(&flag); + if (flag) + { + PyErr_SetString(PyExc_RuntimeError, + "DistNumPy error - multiple " + "initialization attempts."); + return -1; + } + + //We make use of MPI_Init_thread even though we only ask for + //a MPI_THREAD_SINGLE level thread-safety because MPICH2 only + //supports MPICH_ASYNC_PROGRESS when MPI_Init_thread is used. + //Note that when MPICH_ASYNC_PROGRESS is defined the thread-safety + //level will automatically be set to MPI_THREAD_MULTIPLE. + MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &provided); + + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &worldsize); + + //Allocate buffers. + workbuf = malloc(DNPY_WORK_BUFFER_MAXSIZE); + workbuf_nextfree = workbuf; + assert(workbuf != NULL); + + //We subtract one MB to avoid segmentation faults when the workbuf + //is used before the call to WORKBUF_INC() + workbuf_max = ((char*)workbuf) + DNPY_WORK_BUFFER_MAXSIZE - 1048576; + + //Lets make sure that the memory is aligned. 
+ WORKBUF_INC(1); + + //Allocate cart_dim_sizes and cart_dim_strides. + for(i=0; i 0) + printf("DistNumPy - Warning %d distributed arrays didn't get " + "deallocated.\n", nleaks); + + //De-allocate the memory pool. + mem_pool_finalize(); + + //Finalize the Array Data Protection. + arydat_finalize(); + + MPI_Finalize(); +} /* PyDistArray_Exit */ + + +/* + * =================================================================== + * From this point on the master will continue with the pyton code + * and the slaves will stay in C. + * If returning False the Python must call sys.exit(0) immediately. + */ +static PyObject * +PyDistArray_MasterSlaveSplit(PyObject *self, PyObject *args) +{ + //Initiate timers to zero. + memset(&dndt, 0, sizeof(dndtime)); + DNDTIME(totaldelta) + +#ifdef DNPY_SPMD + return Py_True; +#else + + if(myrank == 0) + return Py_True; + + int shutdown = 0; + while(shutdown == 0)//Work loop + { + char *t1, *t2, *t3; + npy_intp d1, d2, d3, d4, d5; + long l1; + dndview *ary, *ary2, *ary3; + //Receive message from master. 
+ MPI_Bcast(msg, DNPY_MAX_MSG_SIZE, MPI_BYTE, 0, MPI_COMM_WORLD); + char *msg_data = (char *) &msg[1]; + #ifdef DISTNUMPY_DEBUG + printf("Rank %d received msg: ", myrank); + #endif + switch(msg[0]) + { + case DNPY_INIT_PGRID: + handle_ProcGridSet((int*)msg_data); + break; + case DNPY_INIT_BLOCKSIZE: + //blocksize = *((npy_intp*)msg_data); + break; + case DNPY_CREATE_ARRAY: + t1 = msg_data + sizeof(dndarray); + handle_NewBaseArray((dndarray*) msg_data, (dndview*) t1); + break; + case DNPY_DESTROY_ARRAY: + handle_DelViewArray(*((npy_intp*)msg_data)); + break; + case DNPY_CREATE_VIEW: + { + dndview *v1 = get_dndview(*((npy_intp*)msg_data)); + dndview *v2 = (dndview *)(msg_data+sizeof(npy_intp)); + handle_NewViewArray(v1,v2); + break; + } + case DNPY_SHUTDOWN: + shutdown = 1; + break; + case DNPY_EVALFLUSH: + dep_flush(1); + break; + case DNPY_PUT_ITEM: + ary = get_dndview(*((npy_intp*)msg_data)); + t1 = msg_data+sizeof(npy_intp); + t2 = t1+ary->base->elsize; + handle_PutGetItem(1, ary, t1, (npy_intp*) t2); + break; + case DNPY_GET_ITEM: + ary = get_dndview(*((npy_intp*)msg_data)); + t1 = msg_data+sizeof(npy_intp); + handle_PutGetItem(0, ary, NULL, (npy_intp*) t1); + break; + case DNPY_UNDIST: + { + dndarray *a = get_dndarray(*((npy_intp*)msg_data)); + handle_UnDist(a); + break; + } + case DNPY_COPY_INTO: + d1 = *((npy_intp*)msg_data); + d2 = *(((npy_intp*)msg_data)+1); + //do_COPY_INTO(d1,d2); + break; + case DNPY_UFUNC: + d1 = *((npy_intp*)msg_data); + d2 = *(((npy_intp*)msg_data)+1); + d3 = *(((npy_intp*)msg_data)+2); + d4 = *(((npy_intp*)msg_data)+3); + d5 = *(((npy_intp*)msg_data)+4); + t1 = msg_data+sizeof(npy_intp)*5; + t2 = t1+d5; + t3 = t2+d1*sizeof(npy_intp); + //do_UFUNC((npy_intp *)t2,d1,d2,d3,d4,d5,t1,t3); + break; + case DNPY_UFUNC_REDUCE: + d1 = *((npy_intp*)msg_data); + d2 = *(((npy_intp*)msg_data)+1); + d3 = *(((npy_intp*)msg_data)+2); + d4 = *(((npy_intp*)msg_data)+3); + d5 = *(((npy_intp*)msg_data)+4); + t1 = msg_data+sizeof(npy_intp)*5; + 
//do_UFUNC_REDUCE(d1, d2, d3, d4, NULL, d5, t1); + break; + case DNPY_ZEROFILL: + //do_ZEROFILL(get_dndview(*((npy_intp*)msg_data))); + break; + case DNPY_DATAFILL: + d1 = ((npy_intp*)msg_data)[0]; // view uid + l1 = (long) ((npy_intp*)msg_data)[1]; // filepos + t1 = msg_data+sizeof(npy_intp)+sizeof(long); // get filename + //do_FILEIO(get_dndview(d1), t1, l1, DNPY_DATAFILL); + break; + case DNPY_DATADUMP: + d1 = ((npy_intp*)msg_data)[0]; // view uid + l1 = (long) ((npy_intp*)msg_data)[1]; // filepos + t1 = msg_data+sizeof(npy_intp)+sizeof(long); // get filename + //do_FILEIO(get_dndview(d1), t1, l1, DNPY_DATADUMP); + break; + case DNPY_DIAGONAL: + ary = get_dndview(((npy_intp*)msg_data)[0]); + ary2 = get_dndview(((npy_intp*)msg_data)[1]); + d1 = ((npy_intp*)msg_data)[2]; + d2 = ((npy_intp*)msg_data)[3]; + d3 = ((npy_intp*)msg_data)[4]; + //do_DIAGONAL(ary, ary2, d1, d2, d3); + break; + case DNPY_MATMUL: + ary = get_dndview(((npy_intp*)msg_data)[0]); + ary2 = get_dndview(((npy_intp*)msg_data)[1]); + ary3 = get_dndview(((npy_intp*)msg_data)[2]); + //do_MATMUL(ary, ary2, ary3); + break; + case DNPY_TIME_RESET: + //do_TIME_RESET(); + break; + case DNPY_TIME_GETDICT: + //do_TIME_GETDICT(); + break; + default: + fprintf(stderr, "Unknown msg: %ld\n", (long)msg[0]); + MPI_Abort(MPI_COMM_WORLD, -1); + } + } + return Py_False; +#endif +} /* PyDistArray_MasterSlaveSplit */ + + +static PyMethodDef DistNumPyMethods[] = { + {"MasterSlaveSplit", PyDistArray_MasterSlaveSplit, METH_VARARGS, + "From this point on the master will continue with the pyton code"\ + " and the slaves will stay in C"}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +PyMODINIT_FUNC +initdistnumpy(void) +{ + PyObject *m; + static void *DistNumPy_API[DistNumPy_API_pointers]; + PyObject *c_api_object; + + m = Py_InitModule("distnumpy", DistNumPyMethods); + if (m == NULL) + return; + + /* Initialize the C API pointer array */ + DistNumPy_API[PyDistArray_Init_NUM] = (void *)PyDistArray_Init; + 
DistNumPy_API[PyDistArray_Exit_NUM] = (void *)PyDistArray_Exit; + DistNumPy_API[PyDistArray_MasterSlaveSplit_NUM] = (void *)PyDistArray_MasterSlaveSplit; + DistNumPy_API[PyDistArray_NewBaseArray_NUM] = (void *)PyDistArray_NewBaseArray; + DistNumPy_API[PyDistArray_DelViewArray_NUM] = (void *)PyDistArray_DelViewArray; + DistNumPy_API[PyDistArray_GetItem_NUM] = (void *)PyDistArray_GetItem; + DistNumPy_API[PyDistArray_PutItem_NUM] = (void *)PyDistArray_PutItem; + DistNumPy_API[PyDistArray_ProcGridSet_NUM] = (void *)PyDistArray_ProcGridSet; + DistNumPy_API[PyDistArray_UnDist_NUM] = (void *)PyDistArray_UnDist; + DistNumPy_API[PyDistArray_IsDist_NUM] = (void *)PyDistArray_IsDist; + DistNumPy_API[PyDistArray_NewViewArray_NUM] = (void *)PyDistArray_NewViewArray; + + /* Create a CObject containing the API pointer array's address */ + c_api_object = PyCObject_FromVoidPtr((void *)DistNumPy_API, NULL); + + if (c_api_object != NULL) + PyModule_AddObject(m, "_C_API", c_api_object); + + // Import NumPy + import_array(); +} diff --git a/distnumpy/src/helpers.c b/distnumpy/src/helpers.c new file mode 100644 index 000000000000..30d9e5c29e17 --- /dev/null +++ b/distnumpy/src/helpers.c @@ -0,0 +1,445 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . 
+ */ + +/*=================================================================== + * + * Computes the number of elements in a dimension of a distributed + * array owned by the MPI-process indicated by proc_dim_rank. + * From Fortran source: http://www.cs.umu.se/~dacke/ngssc/numroc.f +*/ +npy_intp dnumroc(npy_intp nelem_in_dim, npy_intp block_size, + int proc_dim_rank, int nproc_in_dim, + int first_process) +{ + //Figure process's distance from source process. + int mydist = (nproc_in_dim + proc_dim_rank - first_process) % + nproc_in_dim; + + //Figure the total number of whole NB blocks N is split up into. + npy_intp nblocks = nelem_in_dim / block_size; + + //Figure the minimum number of elements a process can have. + npy_intp numroc = nblocks / nproc_in_dim * block_size; + + //See if there are any extra blocks + npy_intp extrablocks = nblocks % nproc_in_dim; + + //If I have an extra block. + if(mydist < extrablocks) + numroc += block_size; + + //If I have last block, it may be a partial block. + else if(mydist == extrablocks) + numroc += nelem_in_dim % block_size; + + return numroc; +} /* dnumroc */ + +/*=================================================================== + * + * Process cartesian coords <-> MPI rank. + */ +int cart2rank(int ndims, const int coords[NPY_MAXDIMS]) +{ + int *strides = cart_dim_strides[ndims-1]; + int rank = 0; + int i; + for(i=0; i DNPY_MAX_MSG_SIZE) + { + fprintf(stderr, "msg2slaves, the messages is greater " + "than DNPY_MAX_MSG_SIZE\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + #ifdef DNDY_TIME + unsigned long long tdelta; + DNDTIME(tdelta); + #endif + + MPI_Bcast(msg, DNPY_MAX_MSG_SIZE, MPI_BYTE, 0, MPI_COMM_WORLD); + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta, dndt.msg2slaves) + #endif + +} /* msg2slaves */ +#endif + + +/*=================================================================== + * + * Returns a string describing the operation type. 
+ */ +char *optype2str(int optype) +{ + switch(optype) + { + case DNPY_CREATE_ARRAY: + return "DNPY_CREATE_ARRAY"; + case DNPY_DESTROY_ARRAY: + return "del"; + case DNPY_CREATE_VIEW: + return "DNPY_CREATE_VIEW"; + case DNPY_PUT_ITEM: + return "DNPY_PUT_ITEM"; + case DNPY_GET_ITEM: + return "DNPY_GET_ITEM"; + case DNPY_UFUNC: + return "ufunc"; + case DNPY_RECV: + return "recv"; + case DNPY_SEND: + return "send"; + case DNPY_BUF_RECV: + return "Brecv"; + case DNPY_BUF_SEND: + return "Bsend"; + case DNPY_APPLY: + return "apply"; + case DNPY_UFUNC_REDUCE: + return "DNPY_UFUNC_REDUCE"; + case DNPY_ZEROFILL: + return "DNPY_ZEROFILL"; + case DNPY_DATAFILL: + return "DNPY_DATAFILL"; + case DNPY_DIAGONAL: + return "DNPY_DIAGONAL"; + case DNPY_MATMUL: + return "DNPY_MATMUL"; + case DNPY_REDUCE_SEND: + return "reduce_send"; + case DNPY_REDUCE_RECV: + return "DNPY_REDUCE_RECV"; + default: + return "\"Unknown data type\""; + } +} /* optype2str */ + + +/*=================================================================== + * Returns a MPI data type that match the specified sub-view-block. + */ +static MPI_Datatype +calc_svb_MPIdatatype(const dndview *view, dndsvb *svb) +{ + + npy_intp i,j,stride; + MPI_Datatype comm_viewOLD, comm_viewNEW; + npy_intp start[NPY_MAXDIMS]; + npy_intp step[NPY_MAXDIMS]; + npy_intp nsteps[NPY_MAXDIMS]; + + //Convert vcoord to coord, which have length view->base->ndims. + j=0; + for(i=0; i < view->nslice; i++) + { + if(view->slice[i].nsteps == PseudoIndex) + { + continue; + } + if(view->slice[i].nsteps == SingleIndex) + { + nsteps[j] = 1; + step[j] = 1; + } + else + { + nsteps[j] = view->slice[i].nsteps; + step[j] = view->slice[i].step; + } + start[j++] = view->slice[i].start; + } + + //Compute the MPI datatype for communication. + MPI_Type_dup(view->base->mpi_dtype, &comm_viewOLD); + for(i=view->base->ndims-1; i >= 0; i--)//Row-major. + { + //Compute the MPI datatype for the view. 
/*===================================================================
 *
 * Calculate the view block at the specified block-coordinate.
 * Fills in *vblock with the list of sub-view-blocks (one per owning
 * process block touched by the view block), allocating the list from
 * the work buffer (workbuf_nextfree / WORKBUF_INC).
 * NB: vcoord is the visible coordinates and must therefore have
 * length view->ndims.
 */
void calc_vblock(const dndview *view, const npy_intp vcoord[NPY_MAXDIMS],
                 dndvb *vblock)
{
    npy_intp i, j, B, item_idx, s, offset, goffset, voffset, boffset;
    npy_intp notfinished, stride, vitems, vvitems, vblocksize;
    npy_intp comm_offset, nelem;
    npy_intp coord[NPY_MAXDIMS];
    npy_intp scoord[NPY_MAXDIMS];
    npy_intp ncoord[NPY_MAXDIMS];
    int pcoord[NPY_MAXDIMS];
    //Cartesian process-grid sizes for this dimensionality.
    int *cdims = cart_dim_sizes[view->base->ndims-1];
    npy_intp start[NPY_MAXDIMS];
    npy_intp step[NPY_MAXDIMS];
    npy_intp nsteps[NPY_MAXDIMS];
    dndsvb *svb;

    //Convert vcoord to coord, which have length view->base->ndims.
    //PseudoIndex slices exist only in the visible view (must be
    //coordinate 0); SingleIndex slices collapse to one element.
    j=0;s=0;
    for(i=0; i < view->nslice; i++)
    {
        if(view->slice[i].nsteps == PseudoIndex)
        {
            assert(vcoord[s] == 0);
            s++;
            continue;
        }
        if(view->slice[i].nsteps == SingleIndex)
        {
            nsteps[j] = 1;
            step[j] = 1;
            coord[j] = 0;
        }
        else
        {
            coord[j] = vcoord[s];
            nsteps[j] = view->slice[i].nsteps;
            step[j] = view->slice[i].step;
            s++;
        }
        assert(nsteps[j] > 0);
        start[j++] = view->slice[i].start;
    }
    assert(j == view->base->ndims);

    //The sub-view-block list lives in the work buffer.
    vblock->sub = workbuf_nextfree;
    svb = vblock->sub;

    //Init number of sub-view-block in each dimension.
    memset(vblock->svbdims, 0, view->base->ndims * sizeof(npy_intp));

    //Sub-vblocks coordinate.
    memset(scoord, 0, view->base->ndims * sizeof(npy_intp));
    //Compute all sub-vblocks associated with the n'th vblock.
    notfinished=1; s=0;
    while(notfinished)
    {
        dndnode **rootnode = view->base->rootnodes;
        stride = 1;
        for(i=view->base->ndims-1; i >= 0; i--)//Row-major.
        {
            //Non-block coordinates.
            ncoord[i] = coord[i] * blocksize;
            //View offset relative to array-view (non-block offset).
            voffset = ncoord[i] + scoord[i];
            //Global offset relative to array-base.
            goffset = voffset * step[i] + start[i];
            //Global block offset relative to array-base.
            B = goffset / blocksize;
            //Compute this sub-view-block's root node.
            rootnode += B * stride;
            stride *= view->base->blockdims[i];
            //Process rank of the owner in the i'th dimension.
            pcoord[i] = B % cdims[i];
            //Local block offset relative to array-base.
            boffset = B / cdims[i];
            //Item index local to the block.
            item_idx = goffset % blocksize;
            //Local offset relative to array-base.
            offset = boffset * blocksize + item_idx;
            //Save offset.
            svb[s].start[i] = offset;
            //Viewable items left in the block.
            vitems = MAX((blocksize - item_idx) / step[i], 1);
            //Size of current view block.
            vblocksize = MIN(blocksize, nsteps[i] - ncoord[i]);
            //Viewable items left in the view-block.
            vvitems = vblocksize - (voffset % blocksize);
            //Compute nsteps.
            svb[s].nsteps[i] = MIN(blocksize, MIN(vvitems, vitems));
            //Debug check.
            assert(svb[s].nsteps[i] > 0);
        }
        //Find rank; onerank >= 0 means the whole array lives on that
        //single process.
        if(view->base->onerank < 0)
            svb[s].rank = cart2rank(view->base->ndims,pcoord);
        else
            svb[s].rank = view->base->onerank;

        assert(svb[s].rank >= 0);
        //Data has not been fetched.
        svb[s].data = NULL;
        //Communication has not been handled.
        svb[s].comm_received_by = -1;
        //Save rootnode.
        svb[s].rootnode = rootnode;

        //Compute the strides (we need the rank to do this).
        stride = 1;
        if(view->base->onerank < 0)
            for(i=view->base->ndims-1; i >= 0; i--)
            {
                svb[s].stride[i] = stride;
                stride *= dnumroc(view->base->dims[i], blocksize,
                                  pcoord[i], cdims[i], 0);
                assert(svb[s].stride[i] > 0);
            }
        else//All on one rank.
            for(i=view->base->ndims-1; i >= 0; i--)
            {
                svb[s].stride[i] = stride;
                stride = view->base->dims[i];
            }

        //Compute the byte offset and element count used later for
        //communication with this sub-view-block.
        comm_offset = 0;
        nelem = 1;
        for(i=view->base->ndims-1; i >= 0; i--)//Row-major.
        {
            //Compute offsets.
            comm_offset += svb[s].start[i] * svb[s].stride[i];
            //Computing total number of elements.
            nelem *= svb[s].nsteps[i];
        }
        //Save offsets.
        svb[s].comm_offset = comm_offset * view->base->elsize;

        //Save total number of elements.
        svb[s].nelem = nelem;

        //Save data pointer if local data.
        if(svb[s].rank == myrank)
        {
            delayed_array_allocation(view->base);
            vblock->sub[s].data = view->base->data + svb[s].comm_offset;
        }

        //Iterate Sub-vblocks coordinate (Row-major).
        for(j=view->base->ndims-1; j >= 0; j--)
        {
            //Count svbdims.
            vblock->svbdims[j]++;

            scoord[j] += svb[s].nsteps[j];
            if(scoord[j] >= MIN(blocksize, nsteps[j] - ncoord[j]))
            {
                //We are finished if wrapping around.
                if(j == 0)
                {
                    notfinished = 0;
                    break;
                }
                scoord[j] = 0;
            }
            else
                break;
        }
        //Reset svbdims because we need the last iteration.
        if(notfinished)
            for(i=view->base->ndims-1; i > j; i--)
                vblock->svbdims[i] = 0;

        s++;
    }
    //Save number of sub-vblocks.
    vblock->nsub = s;
    assert(vblock->nsub > 0);
    //And the next free work buffer slot.
    WORKBUF_INC(s * sizeof(dndsvb));
} /* calc_vblock */
/*===================================================================
 *
 * Convert visible vblock dimension index to base vblock
 * dimension index.
 * Walks the slice list: SingleIndex slices are invisible but still
 * occupy a base dimension; PseudoIndex slices are visible but occupy
 * no base dimension.
 */
npy_intp idx_v2b(const dndview *view, npy_intp vindex)
{
    assert(vindex < view->ndims);
    npy_intp i, bindex=0;
    for(i=0; i < view->nslice; i++)
    {
        if(view->slice[i].nsteps == SingleIndex)
        {
            //A collapsed dimension still advances the base index
            //(unless the base is 1-D and cannot advance).
            if(view->base->ndims > 1)
                bindex++;
            continue;
        }
        if(vindex == 0)
            break;
        if(view->slice[i].nsteps != PseudoIndex)
            bindex++;
        vindex--;
    }
    //We need the MIN since bindex is too high when PseudoIndex is
    //used at the end of the view.
    return MIN(bindex, view->base->ndims-1);
} /* idx_v2b */
+ */ + +#ifndef HELPERS_H +#define HELPERS_H +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/*=================================================================== + * + * Computes the number of elements in a dimension of a distributed + * array owned by the MPI-process indicated by proc_dim_rank. + * From Fortran source: http://www.cs.umu.se/~dacke/ngssc/numroc.f +*/ +npy_intp dnumroc(npy_intp nelem_in_dim, npy_intp block_size, + int proc_dim_rank, int nproc_in_dim, + int first_process); + +/*=================================================================== + * + * Process cartesian coords <-> MPI rank. + */ +int cart2rank(int ndims, const int coords[NPY_MAXDIMS]); +void rank2cart(int ndims, int rank, int coords[NPY_MAXDIMS]); + +/*=================================================================== + * + * Sends a message to all slaves. + * msgsize is in bytes. + */ +#ifndef DNPY_SPMD +void msg2slaves(npy_intp *msg, int msgsize); +#endif + +/*=================================================================== + * + * Returns a string describing the operation type. + */ +char *optype2str(int optype); + +/*=================================================================== + * Returns a MPI data type that match the specified sub-view-block. + */ +static MPI_Datatype +calc_svb_MPIdatatype(const dndview *view, dndsvb *svb); + +/*=================================================================== + * + * Calculate the view block at the specified block-coordinate. + * NB: vcoord is the visible coordinates and must therefore have + * length view->ndims. + */ +void calc_vblock(const dndview *view, const npy_intp vcoord[NPY_MAXDIMS], + dndvb *vblock); + +/*=================================================================== + * + * Convert visible vblock dimension index to base vblock + * dimension index. 
+ */ +npy_intp idx_v2b(const dndview *view, npy_intp vindex); + +/*=================================================================== + * + * Convert visible vblock dimension index to slice dimension index. + */ +npy_intp idx_v2s(const dndview *view, npy_intp vindex); + + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(HELPERS_H) */ diff --git a/distnumpy/src/memory.c b/distnumpy/src/memory.c new file mode 100644 index 000000000000..bf05e3f642f3 --- /dev/null +++ b/distnumpy/src/memory.c @@ -0,0 +1,140 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +//Memory pool. +static dndmem *mem_pool = NULL; + +/*=================================================================== + * + * Frees the memory pool from a given memory allocation. + * Private + */ +static void mem_pool_free(dndmem *mem) +{ + while(mem != NULL) + { + dndmem *next = mem->next; + MPI_Free_mem(mem); + mem = next; + } +} /* mem_pool_free */ + + +/*=================================================================== + * + * Put memory allocation into the memory pool. + */ +void mem_pool_put(dndmem *mem) +{ + //Put the allocated memory in front of the pool. 
+ mem->next = mem_pool; + mem_pool = mem; +} /* mem_pool_put */ + + +/*=================================================================== + * + * Makes sure that the array's memory has been allocated. + */ +void delayed_array_allocation(dndarray *ary) +{ + #ifdef DNDY_TIME + unsigned long long tdelta; + DNDTIME(tdelta); + #endif + npy_intp size = ary->localsize * ary->elsize; + npy_intp count = 0; + dndmem *free = NULL; + + if(ary->data != NULL)//Already allocated. + return; + + //Check if there is some free memory in the memory pool. + if(mem_pool != NULL) + { + dndmem *prev = mem_pool; + dndmem *next = mem_pool->next; + + //Handle first iteration as a special case. + if(mem_pool->size == size) + { + ary->data = ((char*)mem_pool) + sizeof(dndmem); + mem_pool = mem_pool->next;//Remove from pool. + #ifdef DNDY_TIME + ++dndt.mem_reused; + #endif + } + else//Handle the rest. + { + while(next != NULL) + { + assert(prev->next == next); + if(next->size == size) + { + ary->data = ((char*)next) + sizeof(dndmem); + prev->next = next->next;//Remove from pool. + #ifdef DNDY_TIME + ++dndt.mem_reused; + #endif + break; + } + if(++count == DNPY_MAX_MEM_POOL) + { + //Will remove all mem after this one. + prev->next = NULL; + free = next; + } + + //Go to next memory allocation. + prev = next; + next = next->next; + } + } + } + + if(ary->data == NULL)//Need to allocate new memory. + { + dndmem *mem; + if(MPI_Alloc_mem(size + sizeof(dndmem), MPI_INFO_NULL, + &mem) != MPI_SUCCESS) + { + fprintf(stderr, "Out of memory!\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + mem->size = size; + ary->data = (char*) (mem + 1); + } + + //Reduce the pool size to DNPY_MAX_MEM_POOL. + if(free != NULL) + mem_pool_free(free); + + #ifdef DNDY_TIME + DNDTIME_SUM(tdelta, dndt.arydata_malloc) + #endif +}/* delayed_array_allocation */ + +/*=================================================================== + * + * De-allocate the memory pool. 
+ */ +void mem_pool_finalize(void) +{ + mem_pool_free(mem_pool); +} /* finalize_mem_pool */ diff --git a/distnumpy/src/memory.h b/distnumpy/src/memory.h new file mode 100644 index 000000000000..b24ff1d91c71 --- /dev/null +++ b/distnumpy/src/memory.h @@ -0,0 +1,64 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + +/* + * We use a memory pool to reduce the memory allocation overhead. + */ + +#ifndef MEMORY_H +#define MEMORY_H +#ifdef __cplusplus +extern "C" { +#endif + + +//Type describing a memory allocation. +typedef struct dndmem_struct dndmem; +struct dndmem_struct +{ + //Size of allocated memory. + npy_intp size; + //Pointer to the next free memory allocation. + dndmem *next; +}; + +/*=================================================================== + * + * Put memory allocation into the memory pool. + */ +void mem_pool_put(dndmem *mem); + +/*=================================================================== + * + * Makes sure that the array's memory has been allocated. + */ +void delayed_array_allocation(dndarray *ary); + +/*=================================================================== + * + * De-allocate the memory pool. 
+ */ +void mem_pool_finalize(void); + + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(MEMORY_H) */ diff --git a/distnumpy/src/process_grid.c b/distnumpy/src/process_grid.c new file mode 100644 index 000000000000..91bae93e6694 --- /dev/null +++ b/distnumpy/src/process_grid.c @@ -0,0 +1,183 @@ +/* + * Copyright 2011 Mads R. B. Kristensen + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . + */ + + +/* + * =================================================================== + * Setting the process grid. + * Accepts NULL as a default value request. + * When called no distributed array must be allocated. + */ +static PyObject * +PyDistArray_ProcGridSet(PyArrayObject *self, PyObject *args) +{ + PyObject *pgrid = Py_None; + int i,j; + int tmpsizes[NPY_MAXDIMS*NPY_MAXDIMS]; + + if(args != NULL) + if (!PyArg_ParseTuple(args, "O", &pgrid)) + return NULL; + + if(!initmsg_not_handled && ndndarrays > 0) + { + PyErr_Format(PyExc_RuntimeError, "numpy.datalayout must be " + "called when no distributed array are allocated " + "(%ld arrays are currently allocated).", ndndarrays); + return NULL; + } + + //Check for user-defined process grid. + //The syntax used is: ndims:dim:size; + //E.g. DNPY_PROC_SIZE="2:2:4;3:3:2" which means that array + //with two dimensions should, at its second dimension, have + //a size of four etc. 
+ memset(tmpsizes, 0, NPY_MAXDIMS*NPY_MAXDIMS*sizeof(int)); + char *env = getenv("DNPY_PROC_GRID"); + if(env != NULL) + { + char *res = strtok(env, ";"); + while(res != NULL) + { + char *size_ptr; + int dsize = 0; + int dim = 0; + int ndims = strtol(res, &size_ptr, 10); + if(size_ptr != '\0') + dim = strtol(size_ptr+1, &size_ptr, 10); + if(size_ptr != '\0') + dsize = strtol(size_ptr+1, NULL, 10); + //Make sure the input is valid. + if(dsize <= 0 || dim <= 0 || dim > ndims || + ndims <= 0 || ndims > NPY_MAXDIMS) + { + PyErr_Format(PyExc_ValueError, "DNPY_PROC_GRID, invalid" + " syntax or value at \"%s\"\n", res); + return NULL; + } + tmpsizes[(ndims-1)*NPY_MAXDIMS+(dim-1)] = dsize; + //Go to next token. + res = strtok(NULL, ";"); + } + } + else if(pgrid != Py_None) + {//The environment variable supersedes the function call. + for(i=0; i ndims || + ndims <= 0 || ndims > NPY_MAXDIMS) + { + PyErr_Format(PyExc_ValueError, "invalid values"); + return NULL; + } + tmpsizes[(ndims-1)*NPY_MAXDIMS+(dim-1)] = dsize; + } + } + + //Find a balanced distributioin of processes per direction + //based on the restrictions specified by the user. + for(i=0; i=0; j--) + t[d++] = tmpsizes[i*NPY_MAXDIMS+j]; + + MPI_Dims_create(worldsize, ndims, t); + d = ndims; + for(j=0; j + * + * This file is part of DistNumPy . + * + * DistNumPy is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * DistNumPy is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with DistNumPy. If not, see . 
+ */ + +#ifndef PROCESS_GRID_H +#define PROCESS_GRID_H +#ifdef __cplusplus +extern "C" { +#endif + +/* + * =================================================================== + * Setting the process grid. + * Accepts NULL as a default value request. + * When called no distributed array must be allocated. + */ +static PyObject * +PyDistArray_ProcGridSet(PyArrayObject *self, PyObject *args); + +/*=================================================================== + * + * Handler for PyDistArray_ProcGridSet. + * Return -1 and set exception on error, 0 on success. + */ +int handle_ProcGridSet(int pgrid[NPY_MAXDIMS*NPY_MAXDIMS]); + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(PROCESS_GRID_H) */ diff --git a/doc/release/1.6.0-notes.rst b/doc/release/1.6.0-notes.rst index 38b9df80bbe9..c5f53a0eb387 100644 --- a/doc/release/1.6.0-notes.rst +++ b/doc/release/1.6.0-notes.rst @@ -1,6 +1,3 @@ -Note: NumPy 1.6.0 is not yet released. - - ========================= NumPy 1.6.0 Release Notes ========================= @@ -66,11 +63,8 @@ length arrays which caused unpredicted results. Thanks to Lorenz Hüdepohl for pointing out the correct way to interface routines with assumed shape arrays. -In addition, f2py interprets Fortran expression ``size(array, dim)`` -as ``shape(array, dim-1)`` which makes it possible to automatically -wrap Fortran routines that use two argument ``size`` function in -dimension specifications. Before users were forced to apply this -mapping manually. +In addition, f2py supports now automatic wrapping of Fortran routines +that use two argument ``size`` function in dimension specifications. Other new functions @@ -79,9 +73,6 @@ Other new functions ``numpy.ravel_multi_index`` : Converts a multi-index tuple into an array of flat indices, applying boundary modes to the indices. -``numpy.slogdet`` : Compute the sign and (natural) logarithm of the determinant -of an array. - ``numpy.einsum`` : Evaluate the Einstein summation convention. 
Using the Einstein summation convention, many common multi-dimensional array operations can be represented in a simple fashion. This function provides a way compute @@ -99,10 +90,11 @@ not match the ufunc implementation. Changes ======= -Changes and improvements in the numpy core ------------------------------------------- - +``default error handling`` +-------------------------- +The default error handling has been changed from ``print`` to ``warn`` for +all except for ``underflow``, which remains as ``ignore``. ``numpy.distutils`` @@ -135,6 +127,13 @@ conversion of arbitrary python objects into arrays is exposed by ``PyArray_GetArrayParamsFromObject``. +Deprecated features +=================== + +The "normed" keyword in ``numpy.histogram`` is deprecated. Its functionality +will be replaced by the new "density" keyword. + + Removed features ================ diff --git a/doc/source/reference/c-api.array.rst b/doc/source/reference/c-api.array.rst index 8c2b3a34e5ed..f34176a0014a 100644 --- a/doc/source/reference/c-api.array.rst +++ b/doc/source/reference/c-api.array.rst @@ -958,30 +958,60 @@ Converting data types returned when the value will not overflow or be truncated to an integer when converting to a smaller type. + This is almost the same as the result of + PyArray_CanCastTypeTo(PyArray_MinScalarType(arr), totype, casting), + but it also handles a special case arising because the set + of uint values is not a subset of the int values for types with the + same number of bits. + .. cfunction:: PyArray_Descr* PyArray_MinScalarType(PyArrayObject* arr) .. versionadded:: 1.6 If *arr* is an array, returns its data type descriptor, but if *arr* is an array scalar (has 0 dimensions), it finds the data type - of smallest kind and size to which the value may be converted + of smallest size to which the value may be converted without overflow or truncation to an integer.
+ + This function will not demote complex to float or anything to + boolean, but will demote a signed integer to an unsigned integer + when the scalar value is positive. .. cfunction:: PyArray_Descr* PyArray_PromoteTypes(PyArray_Descr* type1, PyArray_Descr* type2) .. versionadded:: 1.6 Finds the data type of smallest size and kind to which *type1* and - *type2* may be safely converted. + *type2* may be safely converted. This function is symmetric and + associative. .. cfunction:: PyArray_Descr* PyArray_ResultType(npy_intp narrs, PyArrayObject**arrs, npy_intp ndtypes, PyArray_Descr**dtypes) .. versionadded:: 1.6 - This applies PyArray_PromoteTypes to all the inputs, along with + This applies type promotion to all the inputs, using the NumPy rules for combining scalars and arrays, to determine the output type of a set of operands. This is the - same result type that ufuncs produce. + same result type that ufuncs produce. The specific algorithm + used is as follows. + + Categories are determined by first checking which of boolean, + integer (int/uint), or floating point (float/complex) the maximum + kind of all the arrays and the scalars are. + + If there are only scalars or the maximum category of the scalars + is higher than the maximum category of the arrays, + the data types are combined with :cfunc:`PyArray_PromoteTypes` + to produce the return value. + + Otherwise, PyArray_MinScalarType is called on each array, and + the resulting data types are all combined with + :cfunc:`PyArray_PromoteTypes` to produce the return value. + + The set of int values is not a subset of the uint values for types + with the same number of bits, something not reflected in + :cfunc:`PyArray_MinScalarType`, but handled as a special case in + PyArray_ResultType. .. cfunction:: int PyArray_ObjectType(PyObject* op, int mintype) @@ -2287,6 +2317,9 @@ Array Scalars .. 
cfunction:: NPY_SCALARKIND PyArray_ScalarKind(int typenum, PyArrayObject** arr) + See the function :cfunc:`PyArray_MinScalarType` for an alternative + mechanism introduced in NumPy 1.6.0. + Return the kind of scalar represented by *typenum* and the array in *\*arr* (if *arr* is not ``NULL`` ). The array is assumed to be rank-0 and only used if *typenum* represents a signed integer. If @@ -2300,6 +2333,9 @@ Array Scalars .. cfunction:: int PyArray_CanCoerceScalar(char thistype, char neededtype, NPY_SCALARKIND scalar) + See the function :cfunc:`PyArray_ResultType` for details of + NumPy type promotion, updated in NumPy 1.6.0. + Implements the rules for scalar coercion. Scalars are only silently coerced from thistype to neededtype if this function returns nonzero. If scalar is :cdata:`NPY_NOSCALAR`, then this diff --git a/doc/source/reference/c-api.iterator.rst b/doc/source/reference/c-api.iterator.rst index 421678648298..9e443f2cbbd9 100644 --- a/doc/source/reference/c-api.iterator.rst +++ b/doc/source/reference/c-api.iterator.rst @@ -323,9 +323,9 @@ Construction and Destruction dtype); Py_DECREF(dtype); -.. cfunction:: NpyIter* NpyIter_MultiNew(npy_intp niter, PyArrayObject** op, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32* op_flags, PyArray_Descr** op_dtypes) +.. cfunction:: NpyIter* NpyIter_MultiNew(npy_intp nop, PyArrayObject** op, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32* op_flags, PyArray_Descr** op_dtypes) - Creates an iterator for broadcasting the ``niter`` array objects provided + Creates an iterator for broadcasting the ``nop`` array objects provided in ``op``, using regular NumPy broadcasting rules. Any of the :ctype:`NPY_ORDER` enum values may be passed to ``order``. For @@ -494,7 +494,7 @@ Construction and Destruction Then, call :cfunc:`NpyIter_Reset` to allocate and fill the buffers with their initial values. 
- Flags that may be passed in ``op_flags[i]``, where ``0 <= i < niter``: + Flags that may be passed in ``op_flags[i]``, where ``0 <= i < nop``: .. cvar:: NPY_ITER_READWRITE .. cvar:: NPY_ITER_READONLY @@ -579,7 +579,7 @@ Construction and Destruction Ensures that the input or output matches the iteration dimensions exactly. -.. cfunction:: NpyIter* NpyIter_AdvancedNew(npy_intp niter, PyArrayObject** op, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32* op_flags, PyArray_Descr** op_dtypes, int oa_ndim, int** op_axes, npy_intp* itershape, npy_intp buffersize) +.. cfunction:: NpyIter* NpyIter_AdvancedNew(npy_intp nop, PyArrayObject** op, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32* op_flags, PyArray_Descr** op_dtypes, int oa_ndim, int** op_axes, npy_intp* itershape, npy_intp buffersize) Extends :cfunc:`NpyIter_MultiNew` with several advanced options providing more control over broadcasting and buffering. @@ -592,7 +592,7 @@ Construction and Destruction If it is provided, ``op_axes`` and/or ``itershape`` must also be provided. The ``op_axes`` parameter let you control in detail how the axes of the operand arrays get matched together and iterated. - In ``op_axes``, you must provide an array of ``niter`` pointers + In ``op_axes``, you must provide an array of ``nop`` pointers to ``oa_ndim``-sized arrays of type ``npy_intp``. If an entry in ``op_axes`` is NULL, normal broadcasting rules will apply. In ``op_axes[j][i]`` is stored either a valid axis of ``op[j]``, or @@ -906,7 +906,7 @@ Construction and Destruction may be smaller than the number of dimensions in the original objects. -.. cfunction:: int NpyIter_GetNIter(NpyIter* iter) +.. cfunction:: int NpyIter_GetNOp(NpyIter* iter) Returns the number of operands in the iterator. @@ -933,7 +933,7 @@ Construction and Destruction .. 
cfunction:: PyArray_Descr** NpyIter_GetDescrArray(NpyIter* iter) - This gives back a pointer to the ``niter`` data type Descrs for + This gives back a pointer to the ``nop`` data type Descrs for the objects being iterated. The result points into ``iter``, so the caller does not gain any references to the Descrs. @@ -942,7 +942,7 @@ Construction and Destruction .. cfunction:: PyObject** NpyIter_GetOperandArray(NpyIter* iter) - This gives back a pointer to the ``niter`` operand PyObjects + This gives back a pointer to the ``nop`` operand PyObjects that are being iterated. The result points into ``iter``, so the caller does not gain any references to the PyObjects. @@ -961,12 +961,12 @@ Construction and Destruction .. cfunction:: void NpyIter_GetReadFlags(NpyIter* iter, char* outreadflags) - Fills ``niter`` flags. Sets ``outreadflags[i]`` to 1 if + Fills ``nop`` flags. Sets ``outreadflags[i]`` to 1 if ``op[i]`` can be read from, and to 0 if not. .. cfunction:: void NpyIter_GetWriteFlags(NpyIter* iter, char* outwriteflags) - Fills ``niter`` flags. Sets ``outwriteflags[i]`` to 1 if + Fills ``nop`` flags. Sets ``outwriteflags[i]`` to 1 if ``op[i]`` can be written to, and to 0 if not. .. cfunction:: int NpyIter_CreateCompatibleStrides(NpyIter* iter, npy_intp itemsize, npy_intp* outstrides) @@ -1021,7 +1021,7 @@ Functions For Iteration char** dataptr = NpyIter_GetDataPtrArray(iter); do { - /* use the addresses dataptr[0], ... dataptr[niter-1] */ + /* use the addresses dataptr[0], ... dataptr[nop-1] */ } while(iternext(iter)); When :cdata:`NPY_ITER_EXTERNAL_LOOP` is specified, the typical @@ -1033,14 +1033,14 @@ Functions For Iteration char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp* stride = NpyIter_GetInnerStrideArray(iter); npy_intp* size_ptr = NpyIter_GetInnerLoopSizePtr(iter), size; - npy_intp iiter, niter = NpyIter_GetNIter(iter); + npy_intp iop, nop = NpyIter_GetNOp(iter); do { size = *size_ptr; while (size--) { - /* use the addresses dataptr[0], ... 
dataptr[niter-1] */ - for (iiter = 0; iiter < niter; ++iiter) { - dataptr[iiter] += stride[iiter]; + /* use the addresses dataptr[0], ... dataptr[nop-1] */ + for (iop = 0; iop < nop; ++iop) { + dataptr[iop] += stride[iop]; } } } while (iternext()); @@ -1067,7 +1067,7 @@ Functions For Iteration char **dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *stride = NpyIter_GetInnerStrideArray(iter); npy_intp *size_ptr = NpyIter_GetInnerLoopSizePtr(iter), size; - npy_intp i, iiter, niter = NpyIter_GetNIter(iter); + npy_intp i, iop, nop = NpyIter_GetNOp(iter); /* One loop with a fixed inner size */ size = *size_ptr; @@ -1077,9 +1077,9 @@ Functions For Iteration * which divides into FIXED_BUFFER_SIZE */ for (i = 0; i < FIXED_BUFFER_SIZE; ++i) { - /* use the addresses dataptr[0], ... dataptr[niter-1] */ - for (iiter = 0; iiter < niter; ++iiter) { - dataptr[iiter] += stride[iiter]; + /* use the addresses dataptr[0], ... dataptr[nop-1] */ + for (iop = 0; iop < nop; ++iop) { + dataptr[iop] += stride[iop]; } } iternext(); @@ -1090,9 +1090,9 @@ Functions For Iteration if (size > 0) do { size = *size_ptr; while (size--) { - /* use the addresses dataptr[0], ... dataptr[niter-1] */ - for (iiter = 0; iiter < niter; ++iiter) { - dataptr[iiter] += stride[iiter]; + /* use the addresses dataptr[0], ... dataptr[nop-1] */ + for (iop = 0; iop < nop; ++iop) { + dataptr[iop] += stride[iop]; } } } while (iternext()); @@ -1113,7 +1113,7 @@ Functions For Iteration .. cfunction:: char** NpyIter_GetDataPtrArray(NpyIter* iter) - This gives back a pointer to the ``niter`` data pointers. If + This gives back a pointer to the ``nop`` data pointers. If :cdata:`NPY_ITER_EXTERNAL_LOOP` was not specified, each data pointer points to the current data item of the iterator. If no inner iteration was specified, it points to the first data @@ -1147,7 +1147,7 @@ functions provide that information. .. 
cfunction:: npy_intp* NpyIter_GetInnerStrideArray(NpyIter* iter) - Returns a pointer to an array of the ``niter`` strides, + Returns a pointer to an array of the ``nop`` strides, one for each iterated object, to be used by the inner loop. This pointer may be cached before the iteration loop, calling diff --git a/doc/source/reference/routines.indexing.rst b/doc/source/reference/routines.indexing.rst index 4a0ef840e68d..853d24126cc3 100644 --- a/doc/source/reference/routines.indexing.rst +++ b/doc/source/reference/routines.indexing.rst @@ -57,6 +57,7 @@ Iterating over arrays .. autosummary:: :toctree: generated/ + nditer ndenumerate ndindex flatiter diff --git a/doc/source/reference/routines.poly.rst b/doc/source/reference/routines.polynomials.poly1d.rst similarity index 95% rename from doc/source/reference/routines.poly.rst rename to doc/source/reference/routines.polynomials.poly1d.rst index f30b2c8844b6..7eef53ce23e8 100644 --- a/doc/source/reference/routines.poly.rst +++ b/doc/source/reference/routines.polynomials.poly1d.rst @@ -1,5 +1,5 @@ -Polynomials -*********** +Poly1d +====== .. currentmodule:: numpy diff --git a/doc/source/reference/routines.polynomials.polynomial.rst b/doc/source/reference/routines.polynomials.polynomial.rst new file mode 100644 index 000000000000..aa92ce8fcabb --- /dev/null +++ b/doc/source/reference/routines.polynomials.polynomial.rst @@ -0,0 +1,16 @@ +Polynomial Package (:mod:`numpy.polynomial`) +============================================ + +.. currentmodule:: numpy.polynomial + +Polynomial Classes +------------------ +.. 
autosummary:: + :toctree: generated/ + + Polynomial + Chebyshev + Legendre + Hermite + HermiteE + Laguerre diff --git a/doc/source/reference/routines.polynomials.rst b/doc/source/reference/routines.polynomials.rst new file mode 100644 index 000000000000..59d6bc499d5c --- /dev/null +++ b/doc/source/reference/routines.polynomials.rst @@ -0,0 +1,14 @@ +Polynomials +*********** + +The poly1d functions are considered outdated but are retained for +backward compatibility. New software needing polynomials should +use the classes in the Polynomial Package. + +.. toctree:: + :maxdepth: 2 + + routines.polynomials.polynomial + routines.polynomials.poly1d + + diff --git a/doc/source/reference/routines.rst b/doc/source/reference/routines.rst index 4d3e99181aa5..c44af4427e3a 100644 --- a/doc/source/reference/routines.rst +++ b/doc/source/reference/routines.rst @@ -18,7 +18,6 @@ indentation. routines.array-creation routines.array-manipulation routines.indexing - routines.nditer routines.dtype routines.io routines.fft @@ -30,7 +29,7 @@ indentation. routines.statistics routines.math routines.functional - routines.poly + routines.polynomials routines.financial routines.set routines.window diff --git a/doc/source/reference/ufuncs.rst b/doc/source/reference/ufuncs.rst index dedbf292932b..0e7da347eaa9 100644 --- a/doc/source/reference/ufuncs.rst +++ b/doc/source/reference/ufuncs.rst @@ -174,6 +174,13 @@ Casting Rules .. index:: pair: ufunc; casting rules +.. note:: + + In NumPy 1.6.0, a type promotion API was created to encapsulate the + mechanism for determining output types. See the functions + :func:`result_type`, :func:`promote_types`, and + :func:`min_scalar_type` for more details. + At the core of every ufunc is a one-dimensional strided loop that implements the actual function for a specific type combination.
When a ufunc is created, it is given a static list of inner loops and a @@ -267,19 +274,56 @@ types, are interpreted accordingly in ufuncs) without worrying about whether the precision of the scalar constant will cause upcasting on your large (small precision) array. - :class:`ufunc` ============== Optional keyword arguments -------------------------- -All ufuncs take optional keyword arguments. These represent rather -advanced usage and will not typically be used by most Numpy users. +All ufuncs take optional keyword arguments. Most of these represent +advanced usage and will not typically be used. .. index:: pair: ufunc; keyword arguments +*out* + + .. versionadded:: 1.6 + + The first output can be provided as either a positional or a keyword parameter. + +*casting* + + .. versionadded:: 1.6 + + Provides a policy for what kind of casting is permitted. For compatibility + with previous versions of NumPy, this defaults to 'unsafe'. May be 'no', + 'equiv', 'safe', 'same_kind', or 'unsafe'. See :func:`can_cast` for + explanations of the parameter values. + +*order* + + .. versionadded:: 1.6 + + Specifies the calculation iteration order/memory layout of the output array. + Defaults to 'K'. 'C' means the output should be C-contiguous, 'F' means + F-contiguous, 'A' means F-contiguous if the inputs are F-contiguous, C-contiguous + otherwise, and 'K' means to match the element ordering of the inputs + as closely as possible. + +*dtype* + + .. versionadded:: 1.6 + + Overrides the dtype of the calculation and output arrays. Similar to *sig*. + +*subok* + + .. versionadded:: 1.6 + + Defaults to true. If set to false, the output will always be a strict + array, not a subtype. + *sig* Either a data-type, a tuple of data-types, or a special signature @@ -304,6 +348,7 @@ advanced usage and will not typically be used by most Numpy users. in a loop.
+ Attributes ---------- diff --git a/numpy/__init__.py b/numpy/__init__.py index f2ffdf92ce8e..4c4c074b39a3 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -169,3 +169,9 @@ def pkgload(*packages, **options): __all__.extend(_mat.__all__) __all__.extend(lib.__all__) __all__.extend(['linalg', 'fft', 'random', 'ctypeslib', 'ma']) + + #DISTNUMPY + import distnumpy + if not distnumpy.MasterSlaveSplit(): + import sys + sys.exit(0) diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py index 414f6f7d1b1b..92b260754e3f 100644 --- a/numpy/add_newdocs.py +++ b/numpy/add_newdocs.py @@ -159,18 +159,19 @@ * "buffered" enables buffering when required. * "c_index" causes a C-order index to be tracked. * "f_index" causes a Fortran-order index to be tracked. + * "multi_index" causes a multi-index, or a tuple of indices + with one per iteration dimension, to be tracked. * "common_dtype" causes all the operands to be converted to - a common data type. + a common data type, with copying or buffering as necessary. * "delay_bufalloc" delays allocation of the buffers until a reset() call is made. Allows "allocate" operands to be initialized before their values are copied into the buffers. - * "external_loop" causes the `values` given to be matched - one-dimensional arrays with multiple values. + * "external_loop" causes the `values` given to be + one-dimensional arrays with multiple values instead of + zero-dimensional arrays. * "grow_inner" allows the `value` array sizes to be made larger than the buffer size when both "buffered" and "external_loop" is used. - * "multi_index" causes a multi-index, or a tuple of indices - with one per iteration dimension. * "ranged" allows the iterator to be restricted to a sub-range of the iterindex values. * "refs_ok" enables iteration of reference types, such as @@ -182,18 +183,18 @@ This is a list of flags for each operand. At minimum, one of "readonly", "readwrite", or "writeonly" must be specified. 
- * "allocate" causes the array to be allocated if it is None - in the `op` parameter. + * "readonly" indicates the operand will only be read from. + * "readwrite" indicates the operand will be read from and written to. + * "writeonly" indicates the operand will only be written to. + * "no_broadcast" prevents the operand from being broadcasted. + * "contig" forces the operand data to be contiguous. * "aligned" forces the operand data to be aligned. + * "nbo" forces the operand data to be in native byte order. * "copy" allows a temporary read-only copy if required. * "updateifcopy" allows a temporary read-write copy if required. - * "contig" forces the operand data to be contiguous. - * "nbo" forces the operand data to be in native byte order. + * "allocate" causes the array to be allocated if it is None + in the `op` parameter. * "no_subtype" prevents an "allocate" operand from using a subtype. - * "no_broadcast" prevents the operand from being broadcasted. - * "readonly" indicates the operand will only be read from. - * "readwrite" indicates the operand will be read from and written to. - * "writeonly" indicates the operand will only be written to. op_dtypes : dtype or tuple of dtype(s), optional The required data type(s) of the operands. If copying or buffering is enabled, the data will be converted to/from their original types. @@ -201,7 +202,9 @@ Controls the iteration order. 'C' means C order, 'F' means Fortran order, 'A' means 'F' order if all the arrays are Fortran contiguous, 'C' order otherwise, and 'K' means as close to the - order the array elements appear in memory as possible. + order the array elements appear in memory as possible. This also + affects the element memory order of "allocate" operands, as they + are allocated to be compatible with iteration order. Default is 'K'. 
casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional Controls what kind of data casting may occur when making a copy @@ -211,9 +214,9 @@ * 'no' means the data types should not be cast at all. * 'equiv' means only byte-order changes are allowed. * 'safe' means only casts which can preserve values are allowed. - * 'unsafe' means any data conversions may be done. * 'same_kind' means only safe casts or casts within a kind, like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. op_axes : list of list of ints, optional If provided, is a list of ints or None for each operands. The list of axes for an operand is a mapping from the dimensions @@ -232,8 +235,8 @@ Attributes ---------- dtypes : tuple of dtype(s) - The data types provided in `value`. This may be different - from the operand data types if buffering is enabled. + The data types of the values provided in `value`. This may be + different from the operand data types if buffering is enabled. finished : bool Whether the iteration over the operands is finished or not. has_delayed_bufalloc : bool @@ -266,7 +269,7 @@ accessed and `has_multi_index` is False. ndim : int The iterator's dimension. - niter : int + nop : int The number of iterator operands. operands : tuple of operand(s) The array(s) to be iterated over. 
@@ -297,7 +300,7 @@ def iter_add_py(x, y, out=None): it = np.nditer([x, y, out], [], [['readonly'], ['readonly'], ['writeonly','allocate']]) for (a, b, c) in it: - addop(a, b, c) + addop(a, b, out=c) return it.operands[2] Here is the same function, but following the C-style pattern:: @@ -309,11 +312,55 @@ def iter_add(x, y, out=None): [['readonly'], ['readonly'], ['writeonly','allocate']]) while not it.finished: - addop(it[0], it[1], it[2]) + addop(it[0], it[1], out=it[2]) it.iternext() return it.operands[2] + Here is an example outer product function:: + + def outer_it(x, y, out=None): + mulop = np.multiply + + it = np.nditer([x, y, out], ['external_loop'], + [['readonly'], ['readonly'], ['writeonly', 'allocate']], + op_axes=[range(x.ndim)+[-1]*y.ndim, + [-1]*x.ndim+range(y.ndim), + None]) + + for (a, b, c) in it: + mulop(a, b, out=c) + + return it.operands[2] + + >>> a = np.arange(2)+1 + >>> b = np.arange(3)+1 + >>> outer_it(a,b) + array([[1, 2, 3], + [2, 4, 6]]) + + Here is an example function which operates like a "lambda" ufunc:: + + def luf(lamdaexpr, *args, **kwargs): + "luf(lambdaexpr, op1, ..., opn, out=None, order='K', casting='safe', buffersize=0)" + nargs = len(args) + op = (kwargs.get('out',None),) + args + it = np.nditer(op, ['buffered','external_loop'], + [['writeonly','allocate','no_broadcast']] + + [['readonly','nbo','aligned']]*nargs, + order=kwargs.get('order','K'), + casting=kwargs.get('casting','safe'), + buffersize=kwargs.get('buffersize',0)) + while not it.finished: + it[0] = lamdaexpr(*it[1:]) + it.iternext() + return it.operands[0] + + >>> a = np.arange(5) + >>> b = np.ones(5) + >>> luf(lambda i,j:i*i + j/2, a, b) + array([ 0.5, 1.5, 4.5, 9.5, 16.5]) + """) # nditer methods @@ -1505,14 +1552,25 @@ def iter_add(x, y, out=None): Data type, scalar, or array to cast from. totype : dtype or dtype specifier Data type to cast to. - casting : casting rule - May be any of 'no', 'equiv', 'safe', 'same_kind', or 'unsafe'. 
+ casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. Returns ------- out : bool True if cast can occur according to the casting rule. + See also + -------- + dtype, result_type + Examples -------- @@ -1588,6 +1646,8 @@ def iter_add(x, y, out=None): kind to which both ``type1`` and ``type2`` may be safely cast. The returned data type is always in native byte order. + This function is symmetric and associative. + Parameters ---------- type1 : dtype or dtype specifier @@ -1600,10 +1660,13 @@ def iter_add(x, y, out=None): out : dtype The promoted data type. + Notes + ----- + .. versionadded:: 1.6.0 + See Also -------- - issctype, issubsctype, issubdtype, obj2sctype, sctype2char, - maximum_sctype, min_scalar_type + result_type, dtype, can_cast Examples -------- @@ -1631,7 +1694,7 @@ def iter_add(x, y, out=None): and smallest scalar kind which can hold its value. For non-scalar array ``a``, returns the vector's dtype unmodified. - As a special case, floating point values are not demoted to integers, + Floating point values are not demoted to integers, and complex values are not demoted to floats. Parameters @@ -1644,11 +1707,13 @@ def iter_add(x, y, out=None): out : dtype The minimal data type. + Notes + ----- + .. versionadded:: 1.6.0 See Also -------- - issctype, issubsctype, issubdtype, obj2sctype, sctype2char, - maximum_sctype, promote_types + result_type, promote_types, dtype, can_cast Examples -------- @@ -1698,6 +1763,33 @@ def iter_add(x, y, out=None): out : dtype The result type. 
+ See also + -------- + dtype, promote_types, min_scalar_type, can_cast + + Notes + ----- + .. versionadded:: 1.6.0 + + The specific algorithm used is as follows. + + Categories are determined by first checking which of boolean, + integer (int/uint), or floating point (float/complex) the maximum + kind of all the arrays and the scalars are. + + If there are only scalars or the maximum category of the scalars + is higher than the maximum category of the arrays, + the data types are combined with :func:`promote_types` + to produce the return value. + + Otherwise, `min_scalar_type` is called on each array, and + the resulting data types are all combined with :func:`promote_types` + to produce the return value. + + The set of int values is not a subset of the uint values for types + with the same number of bits, something not reflected in + :func:`min_scalar_type`, but handled as a special case in `result_type`. + Examples -------- >>> np.result_type(3, np.arange(7, dtype='i1')) @@ -1864,9 +1956,9 @@ def iter_add(x, y, out=None): * 'no' means the data types should not be cast at all. * 'equiv' means only byte-order changes are allowed. * 'safe' means only casts which can preserve values are allowed. - * 'unsafe' means any data conversions may be done. * 'same_kind' means only safe casts or casts within a kind, like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. Returns ------- @@ -1879,6 +1971,8 @@ def iter_add(x, y, out=None): Notes ----- + .. versionadded:: 1.6.0 + The subscripts string is a comma-separated list of subscript labels, where each label refers to a dimension of the corresponding operand. Repeated subscripts labels in one operand take the diagonal. For example, @@ -5464,6 +5558,10 @@ def iter_add(x, y, out=None): Make a new copy of the data-type object. If ``False``, the result may just be a reference to a built-in data-type object. 
+ See also + -------- + result_type + Examples -------- Using array-scalar type: diff --git a/numpy/compat/py3k.py b/numpy/compat/py3k.py index 609f0997497c..001455de5c1e 100644 --- a/numpy/compat/py3k.py +++ b/numpy/compat/py3k.py @@ -23,7 +23,7 @@ def asstr(s): return s return s.decode('latin1') def isfileobj(f): - return isinstance(f, io.FileIO) + return isinstance(f, (io.FileIO, io.BufferedReader)) def open_latin1(filename, mode='r'): return open(filename, mode=mode, encoding='iso-8859-1') strchar = 'U' diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py index 1785f63a9096..5298f412b396 100644 --- a/numpy/core/_internal.py +++ b/numpy/core/_internal.py @@ -335,13 +335,7 @@ def _newnames(datatype, order): def _index_fields(ary, fields): from multiarray import empty, dtype dt = ary.dtype - new_dtype = [(name, dt[name]) for name in dt.names if name in fields] - future_dtype = [(name, dt[name]) for name in fields if name in dt.names] - if not new_dtype == future_dtype: - depdoc = "Out of order field selection on recarrays currently returns \ -fields in order. This behavior is deprecated in numpy 1.5 and will change in \ -2.0. See ticket #1431." 
- warnings.warn(depdoc, DeprecationWarning) + new_dtype = [(name, dt[name]) for name in fields if name in dt.names] if ary.flags.f_contiguous: order = 'F' else: @@ -369,6 +363,7 @@ def _index_fields(ary, fields): 'L': 'L', 'q': 'q', 'Q': 'Q', + 'e': 'e', 'f': 'f', 'd': 'd', 'g': 'g', @@ -394,6 +389,7 @@ def _index_fields(ary, fields): 'L': 'u4', 'q': 'i8', 'Q': 'u8', + 'e': 'f2', 'f': 'f', 'd': 'd', 'Zf': 'F', diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index 9b6d1c0367de..ff6d0ae87b5c 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -208,12 +208,12 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', format_function = lambda x: _formatInteger(x, format) elif issubclass(dtypeobj, _nt.floating): if issubclass(dtypeobj, _nt.longfloat): - format_function = _longfloatFormatter(precision) + format_function = LongFloatFormat(precision) else: format_function = FloatFormat(data, precision, suppress_small) elif issubclass(dtypeobj, _nt.complexfloating): if issubclass(dtypeobj, _nt.clongfloat): - format_function = _clongfloatFormatter(precision) + format_function = LongComplexFormat(precision) else: format_function = ComplexFormat(data, precision, suppress_small) elif issubclass(dtypeobj, _nt.unicode_) or \ @@ -442,6 +442,7 @@ def fillFormat(self, data): else: format = '%#' format = format + '%d.%df' % (self.max_str_len, precision) + self.special_fmt = '%%%ds' % (self.max_str_len,) self.format = format @@ -450,10 +451,16 @@ def __call__(self, x, strip_zeros=True): err = _nc.seterr(invalid='ignore') try: if isnan(x): - return self.special_fmt % (_nan_str,) + if self.sign: + return self.special_fmt % ('+' + _nan_str,) + else: + return self.special_fmt % (_nan_str,) elif isinf(x): if x > 0: - return self.special_fmt % (_inf_str,) + if self.sign: + return self.special_fmt % ('+' + _inf_str,) + else: + return self.special_fmt % (_inf_str,) else: return self.special_fmt % ('-' + _inf_str,) finally: @@ -489,26 
+496,46 @@ def _formatInteger(x, format): else: return "%s" % x -def _longfloatFormatter(precision): +class LongFloatFormat(object): # XXX Have to add something to determine the width to use a la FloatFormat # Right now, things won't line up properly - def formatter(x): + def __init__(self, precision, sign=False): + self.precision = precision + self.sign = sign + + def __call__(self, x): if isnan(x): - return _nan_str + if self.sign: + return '+' + _nan_str + else: + return ' ' + _nan_str elif isinf(x): if x > 0: - return _inf_str + if self.sign: + return '+' + _inf_str + else: + return ' ' + _inf_str else: return '-' + _inf_str - return format_longfloat(x, precision) - return formatter - -def _clongfloatFormatter(precision): - def formatter(x): - r = format_longfloat(x.real, precision) - i = format_longfloat(x.imag, precision) - return '%s+%sj' % (r, i) - return formatter + elif x >= 0: + if self.sign: + return '+' + format_longfloat(x, self.precision) + else: + return ' ' + format_longfloat(x, self.precision) + else: + return format_longfloat(x, self.precision) + + +class LongComplexFormat(object): + def __init__(self, precision): + self.real_format = LongFloatFormat(precision) + self.imag_format = LongFloatFormat(precision, sign=True) + + def __call__(self, x): + r = self.real_format(x.real) + i = self.imag_format(x.imag) + return r + i + 'j' + class ComplexFormat(object): def __init__(self, x, precision, suppress_small): diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt index 633c1be96832..64754fa87c84 100644 --- a/numpy/core/code_generators/cversions.txt +++ b/numpy/core/code_generators/cversions.txt @@ -8,4 +8,4 @@ 0x00000005 = 77e2e846db87f25d7cf99f9d812076f0 # Version 6 added new iterator, half float and casting functions, # PyArray_CountNonzero, PyArray_NewLikeArray and PyArray_MatrixProduct2. 
-0x00000006 = a413221a7ff73fcf251aeb5ee8fa73ff +0x00000006 = e61d5dc51fa1c6459328266e215d6987 diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index b314f9f843cb..db2c368dd127 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -271,7 +271,7 @@ 'NpyIter_ResetBasePointers': 235, 'NpyIter_ResetToIterIndexRange': 236, 'NpyIter_GetNDim': 237, - 'NpyIter_GetNIter': 238, + 'NpyIter_GetNOp': 238, 'NpyIter_GetIterNext': 239, 'NpyIter_GetIterSize': 240, 'NpyIter_GetIterIndexRange': 241, diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 4cd2de70ee0b..51e83dfdc725 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -33,9 +33,9 @@ * the places where static allocation is used would need to be changed * to dynamic (including inside of several structures) */ - -#define NPY_MAXDIMS 32 -#define NPY_MAXARGS 32 +/* DISTNUMPY - reduced default value to 8 (from 32) */ +#define NPY_MAXDIMS 8 +#define NPY_MAXARGS 8 /* Used for Converter Functions "O&" code in ParseTuple */ #define NPY_FAIL 0 @@ -74,7 +74,7 @@ enum NPY_TYPES { NPY_BOOL=0, * New 1.6 types appended, may be integrated * into the above in 2.0. */ - NPY_DATETIME, NPY_TIMEDELTA, NPY_HALF, + NPY_DATETIME, NPY_TIMEDELTA, NPY_HALF, NPY_NTYPES, NPY_NOTYPE, @@ -277,6 +277,17 @@ typedef Py_uintptr_t npy_uintp; #define constchar char #endif +/* NPY_INTP_FMT Note: + * Unlike the other NPY_*_FMT macros which are used with + * PyOS_snprintf, NPY_INTP_FMT is used with PyErr_Format and + * PyString_Format. These functions use different formatting + * codes which are portably specified according to the Python + * documentation. See ticket #1795. + * + * On Windows x64, the LONGLONG formatter should be used, but + * in Python 2.6 the %lld formatter is not supported. In this + * case we work around the problem by using the %zd formatter. 
+ */ #if NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_INT #define NPY_INTP NPY_INT #define NPY_UINTP NPY_UINT @@ -303,11 +314,11 @@ typedef Py_uintptr_t npy_uintp; #define NPY_MAX_INTP NPY_MAX_LONGLONG #define NPY_MIN_INTP NPY_MIN_LONGLONG #define NPY_MAX_UINTP NPY_MAX_ULONGLONG -#ifdef _MSC_VER + #if (PY_VERSION_HEX >= 0x02070000) #define NPY_INTP_FMT "lld" -#else - #define NPY_INTP_FMT "Ld" -#endif + #else + #define NPY_INTP_FMT "zd" + #endif #endif /* @@ -605,6 +616,9 @@ typedef struct _arr_descr { PyObject *shape; /* a tuple */ } PyArray_ArrayDescr; +/* DISTNUMPY */ +#include "distnumpy_prototypes.h" + /* * The main array object structure. It is recommended to use the macros * defined below (PyArray_DATA and friends) access fields here, instead @@ -638,8 +652,12 @@ typedef struct PyArrayObject { PyArray_Descr *descr; /* Pointer to type structure */ int flags; /* Flags describing array -- see below */ PyObject *weakreflist; /* For weakreferences */ + dndview *distary; /* DISTNUMPY Dist Array Struct */ } PyArrayObject; +/* DISTNUMPY */ +#include "distnumpy_types.h" + #define NPY_AO PyArrayObject #define fortran fortran_ /* For some compilers */ diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index b6f534425a78..34cd727076d5 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -110,9 +110,9 @@ typedef struct { /* Default user error mode */ #define UFUNC_ERR_DEFAULT2 \ - (UFUNC_ERR_PRINT << UFUNC_SHIFT_DIVIDEBYZERO) + \ - (UFUNC_ERR_PRINT << UFUNC_SHIFT_OVERFLOW) + \ - (UFUNC_ERR_PRINT << UFUNC_SHIFT_INVALID) + (UFUNC_ERR_WARN << UFUNC_SHIFT_DIVIDEBYZERO) + \ + (UFUNC_ERR_WARN << UFUNC_SHIFT_OVERFLOW) + \ + (UFUNC_ERR_WARN << UFUNC_SHIFT_INVALID) #if NPY_ALLOW_THREADS #define NPY_LOOP_BEGIN_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) _save = PyEval_SaveThread();} while (0) diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index dce6178464ec..c83a5106c681 100644 --- 
a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -1911,8 +1911,8 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8): False """ - x = array(a, copy=False) - y = array(b, copy=False) + x = array(a, copy=False, ndmin=1) + y = array(b, copy=False, ndmin=1) xinf = isinf(x) if not all(xinf == isinf(y)): return False @@ -2141,8 +2141,8 @@ def geterr(): Examples -------- - >>> np.geterr() # default is all set to 'ignore' - {'over': 'ignore', 'divide': 'ignore', 'invalid': 'ignore', + >>> np.geterr() + {'over': 'warn', 'divide': 'warn', 'invalid': 'warn', 'under': 'ignore'} >>> np.arange(3.) / np.arange(3.) array([ NaN, 1., 1.]) @@ -2390,7 +2390,7 @@ class errstate(object): Outside the context the error handling behavior has not changed: >>> np.geterr() - {'over': 'ignore', 'divide': 'ignore', 'invalid': 'ignore', + {'over': 'warn', 'divide': 'warn', 'invalid': 'warn', 'under': 'ignore'} """ diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 78bf14f502e1..f6ae60203797 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -796,6 +796,13 @@ def get_mathlib_info(*args): umath_src.append(generate_umath_templated_sources) umath_src.append(join('src', 'umath', 'funcs.inc.src')) + #DISTNUMPY + config.add_include_dirs(join('..','..','distnumpy','include')) + multiarray_deps.append(join('..','..','distnumpy','include', 'distnumpy.h')) + multiarray_deps.append(join('..','..','distnumpy','include', 'distnumpy_api.h')) + multiarray_deps.append(join('..','..','distnumpy','include', 'distnumpy_types.h')) + multiarray_deps.append(join('..','..','distnumpy','include', 'distnumpy_prototypes.h')) + config.add_extension('multiarray', sources = multiarray_src + [generate_config_h, diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 15f18879261b..b345014dccc4 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -44,7 +44,12 @@ def atleast_1d(*arys): """ res = [] for ary in arys: - res.append(array(ary,copy=False,subok=True,ndmin=1)) + ary 
= asanyarray(ary) + if len(ary.shape) == 0 : + result = ary.reshape(1) + else : + result = ary + res.append(result) if len(res) == 1: return res[0] else: @@ -89,7 +94,14 @@ def atleast_2d(*arys): """ res = [] for ary in arys: - res.append(array(ary,copy=False,subok=True,ndmin=2)) + ary = asanyarray(ary) + if len(ary.shape) == 0 : + result = ary.reshape(1, 1) + elif len(ary.shape) == 1 : + result = ary[newaxis, :] + else : + result = ary + res.append(result) if len(res) == 1: return res[0] else: diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index a3008c888b60..a613d3fa8a1e 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -50,6 +50,9 @@ maintainer email: oliphant.travis@ieee.org #include "sequence.h" #include "buffer.h" +/* DISTNUMPY */ +#include "distnumpy.h" + /*NUMPY_API Compute the size of an array (in number of items) */ @@ -144,10 +147,11 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object) } } else { - /* If the dims match exactly, can assign directly */ - if (ndim == PyArray_NDIM(dest) && - PyArray_CompareLists(dims, PyArray_DIMS(dest), - ndim)) { + /* + * If there are more than enough dims, use AssignFromSequence + * because it can handle this style of broadcasting. + */ + if (ndim >= PyArray_NDIM(dest)) { int res; Py_DECREF(dtype); res = PyArray_AssignFromSequence(dest, src_object); @@ -263,9 +267,19 @@ array_dealloc(PyArrayObject *self) { * self already... 
*/ } - PyDataMem_FREE(self->data); + /* DISTNUMPY */ + if(PyDistArray_ARRAY(self) == NULL) + PyDataMem_FREE(self->data); } + /* DISTNUMPY */ + if(PyDistArray_ARRAY(self) != NULL) + if(PyDistArray_DelViewArray(self) == -1) + { + PyErr_Print(); + PyErr_Clear(); + } + PyDimMem_FREE(self->dimensions); Py_DECREF(self->descr); Py_TYPE(self)->tp_free((PyObject *)self); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 3f2a9ac712fa..fde95c4cb8a6 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -612,7 +612,6 @@ VOID_getitem(char *ip, PyArrayObject *ap) return ret; } -finish: if (PyDataType_FLAGCHK(descr, NPY_ITEM_HASOBJECT) || PyDataType_FLAGCHK(descr, NPY_ITEM_IS_POINTER)) { PyErr_SetString(PyExc_ValueError, @@ -1797,9 +1796,18 @@ HALF_fromstr(char *str, npy_half *ip, char **endptr, PyArray_Descr *NPY_UNUSED(i return 0; } +static int +BOOL_fromstr(char *str, Bool *ip, char **endptr, PyArray_Descr *NPY_UNUSED(ignore)) +{ + double result; + + result = NumPyOS_ascii_strtod(str, endptr); + *ip = (Bool) (result != 0.0); + return 0; +} /**begin repeat - * #fname = BOOL, CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT, STRING, UNICODE, VOID# + * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT, STRING, UNICODE, VOID# */ #define @fname@_fromstr NULL /**end repeat**/ diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 16164011f44d..9bc45a76f388 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -220,6 +220,8 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, return ret; } else if (PyDataType_HASFIELDS(descr)) { + int base_offset = *offset; + _append_str(str, "T{"); for (k = 0; k < PyTuple_GET_SIZE(descr->names); ++k) { PyObject *name, *item, *offset_obj, *tmp; @@ -233,14 +235,20 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, child = 
(PyArray_Descr*)PyTuple_GetItem(item, 0); offset_obj = PyTuple_GetItem(item, 1); - new_offset = PyInt_AsLong(offset_obj); + new_offset = base_offset + PyInt_AsLong(offset_obj); /* Insert padding manually */ + if (*offset > new_offset) { + PyErr_SetString(PyExc_RuntimeError, + "This should never happen: Invalid offset in " + "buffer format string generation. Please " + "report a bug to the Numpy developers."); + return -1; + } while (*offset < new_offset) { _append_char(str, 'x'); ++*offset; } - *offset += child->elsize; /* Insert child item */ _buffer_format_string(child, str, arr, offset, @@ -287,6 +295,8 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, descr->type_num == NPY_ULONGLONG); #endif + *offset += descr->elsize; + if (descr->byteorder == '=' && _is_natively_aligned_at(descr, arr, *offset)) { /* Prefer native types, to cater for Cython */ @@ -348,6 +358,7 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, break; case NPY_LONGLONG: if (_append_char(str, 'q')) return -1; break; case NPY_ULONGLONG: if (_append_char(str, 'Q')) return -1; break; + case NPY_HALF: if (_append_char(str, 'e')) return -1; break; case NPY_FLOAT: if (_append_char(str, 'f')) return -1; break; case NPY_DOUBLE: if (_append_char(str, 'd')) return -1; break; case NPY_LONGDOUBLE: if (_append_char(str, 'g')) return -1; break; diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 5e25ec1a1a2c..ffeb96ba9353 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -17,39 +17,52 @@ #include "convert.h" -/*NUMPY_API - * To List +/* + * Converts a subarray of 'self' into lists, with starting data pointer + * 'dataptr' and from dimension 'startdim' to the last dimension of 'self'. + * + * Returns a new reference. 
*/ -NPY_NO_EXPORT PyObject * -PyArray_ToList(PyArrayObject *self) +static PyObject * +recursive_tolist(PyArrayObject *self, char *dataptr, int startdim) { - PyObject *lp; - PyArrayObject *v; - intp sz, i; + npy_intp i, n, stride; + PyObject *ret, *item; - if (!PyArray_Check(self)) { - return (PyObject *)self; + /* Base case */ + if (startdim >= PyArray_NDIM(self)) { + return PyArray_DESCR(self)->f->getitem(dataptr,self); } - if (self->nd == 0) { - return self->descr->f->getitem(self->data,self); + + n = PyArray_DIM(self, startdim); + stride = PyArray_STRIDE(self, startdim); + + ret = PyList_New(n); + if (ret == NULL) { + return NULL; } - sz = self->dimensions[0]; - lp = PyList_New(sz); - for (i = 0; i < sz; i++) { - v = (PyArrayObject *)array_big_item(self, i); - if (PyArray_Check(v) && (v->nd >= self->nd)) { - PyErr_SetString(PyExc_RuntimeError, - "array_item not returning smaller-" \ - "dimensional array"); - Py_DECREF(v); - Py_DECREF(lp); + for (i = 0; i < n; ++i) { + item = recursive_tolist(self, dataptr, startdim+1); + if (item == NULL) { + Py_DECREF(ret); return NULL; } - PyList_SetItem(lp, i, PyArray_ToList(v)); - Py_DECREF(v); + PyList_SET_ITEM(ret, i, item); + + dataptr += stride; } - return lp; + + return ret; +} + +/*NUMPY_API + * To List + */ +NPY_NO_EXPORT PyObject * +PyArray_ToList(PyArrayObject *self) +{ + return recursive_tolist(self, PyArray_DATA(self), 0); } /* XXX: FIXME --- add ordering argument to @@ -434,7 +447,7 @@ PyArray_FillWithZero(PyArrayObject *a) } do { - stransfer(NULL, 0, *dataptr, stride, + stransfer(*dataptr, stride, NULL, 0, *countptr, 0, transferdata); } while(iternext(iter)); diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 3359a5573c44..ee22ba646a7f 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -770,14 +770,14 @@ static int min_scalar_type_num(char *valueptr, int type_num, return NPY_BOOL; } case 
NPY_UBYTE: { - char value = *valueptr; + npy_ubyte value = *(npy_ubyte *)valueptr; if (value <= NPY_MAX_BYTE) { *is_small_unsigned = 1; } return NPY_UBYTE; } case NPY_BYTE: { - char value = *valueptr; + npy_byte value = *(npy_byte *)valueptr; if (value >= 0) { *is_small_unsigned = 1; return NPY_UBYTE; @@ -1059,26 +1059,55 @@ PyArray_MinScalarType(PyArrayObject *arr) int swap = !PyArray_ISNBO(dtype->byteorder); int is_small_unsigned = 0; /* An aligned memory buffer large enough to hold any type */ -#if NPY_SIZEOF_LONGLONG >= NPY_SIZEOF_CLONGDOUBLE - npy_longlong value; -#else - npy_clongdouble value; -#endif + npy_longlong value[4]; dtype->f->copyswap(&value, data, swap, NULL); return PyArray_DescrFromType( - min_scalar_type_num((char *)&value, dtype->type_num, &is_small_unsigned)); + min_scalar_type_num((char *)&value, + dtype->type_num, &is_small_unsigned)); } } +/* + * Provides an ordering for the dtype 'kind' character codes, to help + * determine when to use the min_scalar_type function. This groups + * 'kind' into boolean, integer, floating point, and everything else. + */ +static int +dtype_kind_to_simplified_ordering(char kind) +{ + switch (kind) { + /* Boolean kind */ + case 'b': + return 0; + /* Unsigned int kind */ + case 'u': + /* Signed int kind */ + case 'i': + return 1; + /* Float kind */ + case 'f': + /* Complex kind */ + case 'c': + return 2; + /* Anything else */ + default: + return 3; + } +} + /*NUMPY_API * Produces the result type of a bunch of inputs, using the UFunc - * type promotion rules. + * type promotion rules. Use this function when you have a set of + * input arrays, and need to determine an output array dtype. * - * If all the inputs are scalars (have 0 dimensions), does a regular - * type promotion. Otherwise, does a type promotion on the MinScalarType - * of all the inputs. 
Data types passed directly are treated as vector + * If all the inputs are scalars (have 0 dimensions) or the maximum "kind" + * of the scalars is greater than the maximum "kind" of the arrays, does + * a regular type promotion. + * + * Otherwise, does a type promotion on the MinScalarType + * of all the inputs. Data types passed directly are treated as array * types. * */ @@ -1087,7 +1116,7 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, npy_intp ndtypes, PyArray_Descr **dtypes) { npy_intp i; - int all_scalar; + int use_min_scalar = 0; PyArray_Descr *ret = NULL, *tmpret; int ret_is_small_unsigned = 0; @@ -1103,22 +1132,54 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, return ret; } - /* Determine if there are any scalars */ - if (ndtypes > 0) { - all_scalar = 0; - } - else { - all_scalar = 1; + /* + * Determine if there are any scalars, and if so, whether + * the maximum "kind" of the scalars surpasses the maximum + * "kind" of the arrays + */ + if (narrs > 0) { + int all_scalars, max_scalar_kind = -1, max_array_kind = -1; + int kind; + + all_scalars = (ndtypes > 0) ? 
0 : 1; + + /* Compute the maximum "kinds" and whether everything is scalar */ for (i = 0; i < narrs; ++i) { - if (PyArray_NDIM(arr[i]) != 0) { - all_scalar = 0; - break; + if (PyArray_NDIM(arr[i]) == 0) { + kind = dtype_kind_to_simplified_ordering( + PyArray_DESCR(arr[i])->kind); + if (kind > max_scalar_kind) { + max_scalar_kind = kind; + } } + else { + all_scalars = 0; + kind = dtype_kind_to_simplified_ordering( + PyArray_DESCR(arr[i])->kind); + if (kind > max_array_kind) { + max_array_kind = kind; + } + } + } + /* + * If the max scalar kind is bigger than the max array kind, + * finish computing the max array kind + */ + for (i = 0; i < ndtypes; ++i) { + kind = dtype_kind_to_simplified_ordering(dtypes[i]->kind); + if (kind > max_array_kind) { + max_array_kind = kind; + } + } + + /* Indicate whether to use the min_scalar_type function */ + if (!all_scalars && max_array_kind >= max_scalar_kind) { + use_min_scalar = 1; } } /* Loop through all the types, promoting them */ - if (all_scalar) { + if (!use_min_scalar) { for (i = 0; i < narrs; ++i) { PyArray_Descr *tmp = PyArray_DESCR(arr[i]); /* Combine it with the existing type */ @@ -1132,6 +1193,29 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, tmpret = PyArray_PromoteTypes(tmp, ret); Py_DECREF(ret); ret = tmpret; + if (ret == NULL) { + return NULL; + } + } + } + } + + for (i = 0; i < ndtypes; ++i) { + PyArray_Descr *tmp = dtypes[i]; + /* Combine it with the existing type */ + if (ret == NULL) { + ret = tmp; + Py_INCREF(ret); + } + else { + /* Only call promote if the types aren't the same dtype */ + if (tmp != ret || !PyArray_ISNBO(tmp->byteorder)) { + tmpret = PyArray_PromoteTypes(tmp, ret); + Py_DECREF(ret); + ret = tmpret; + if (ret == NULL) { + return NULL; + } } } } @@ -1153,11 +1237,7 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, int swap = !PyArray_ISNBO(tmp->byteorder); int type_num; /* An aligned memory buffer large enough to hold any type */ -#if NPY_SIZEOF_LONGLONG >= 
NPY_SIZEOF_CLONGDOUBLE - npy_longlong value; -#else - npy_clongdouble value; -#endif + npy_longlong value[4]; tmp->f->copyswap(&value, data, swap, NULL); type_num = min_scalar_type_num((char *)&value, tmp->type_num, &tmp_is_small_unsigned); @@ -1229,6 +1309,9 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, } Py_DECREF(ret); ret = tmpret; + if (ret == NULL) { + return NULL; + } } } } diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 8cfe5da514ea..7a0107d60ab5 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -25,6 +25,9 @@ #include "lowlevel_strided_loops.h" +/* DISTNUMPY */ +#include "distnumpy.h" + /* * Reading from a file or a string. * @@ -546,26 +549,60 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset) if (slen < 0) { goto fail; } - if (slen != a->dimensions[dim]) { + /* + * Either the dimensions match, or the sequence has length 1 and can + * be broadcast to the destination. 
+ */ + if (slen != a->dimensions[dim] && slen != 1) { PyErr_Format(PyExc_ValueError, "cannot copy sequence with size %d to array axis " "with dimension %d", (int)slen, (int)a->dimensions[dim]); goto fail; } - for (i = 0; i < slen; i++) { - PyObject *o = PySequence_GetItem(s, i); - if ((a->nd - dim) > 1) { - res = setArrayFromSequence(a, o, dim+1, offset); + /* Broadcast the one element from the sequence to all the outputs */ + if (slen == 1) { + PyObject *o; + npy_intp alen = a->dimensions[dim]; + + o = PySequence_GetItem(s, 0); + if (o == NULL) { + goto fail; } - else { - res = a->descr->f->setitem(o, (a->data + offset), a); + for (i = 0; i < alen; i++) { + if ((a->nd - dim) > 1) { + res = setArrayFromSequence(a, o, dim+1, offset); + } + else { + res = a->descr->f->setitem(o, (a->data + offset), a); + } + if (res < 0) { + Py_DECREF(o); + goto fail; + } + offset += a->strides[dim]; } Py_DECREF(o); - if (res < 0) { - goto fail; + } + /* Copy element by element */ + else { + for (i = 0; i < slen; i++) { + PyObject *o = PySequence_GetItem(s, i); + if (o == NULL) { + goto fail; + } + if ((a->nd - dim) > 1) { + res = setArrayFromSequence(a, o, dim+1, offset); + } + else { + res = a->descr->f->setitem(o, (a->data + offset), a); + } + Py_DECREF(o); + if (res < 0) { + goto fail; + } + offset += a->strides[dim]; } - offset += a->strides[dim]; } Py_DECREF(s); @@ -576,6 +613,56 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset) return res; } +/* DISTNUMPY */ +static int +setDistArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, + npy_intp coords[NPY_MAXDIMS]) +{ + Py_ssize_t i, slen; + int res = 0; + /* + * This code is to ensure that the sequence access below will + * return a lower-dimensional sequence. + */ + if (PyArray_Check(s) && !(PyArray_CheckExact(s))) { + /* + * FIXME: This could probably copy the entire subarray at once here using + * a faster algorithm. 
Right now, just make sure a base-class array is + * used so that the dimensionality reduction assumption is correct. + */ + s = PyArray_EnsureArray(s); + } + + if (dim > a->nd) { + PyErr_Format(PyExc_ValueError, + "setDistArrayFromSequence: sequence/array dimensions mismatch."); + return -1; + } + + slen = PySequence_Length(s); + if (slen != a->dimensions[dim]) { + PyErr_Format(PyExc_ValueError, + "setDistArrayFromSequence: sequence/array shape mismatch."); + return -1; + } + + for (i = 0; i < slen; i++) { + PyObject *o = PySequence_GetItem(s, i); + coords[dim] = i; + if ((a->nd - dim) > 1) { + res = setDistArrayFromSequence(a, o, dim+1, coords); + } + else { + res = PyDistArray_PutItem(a, coords, o); + } + Py_DECREF(o); + if (res < 0) { + return res; + } + } + return 0; +} + NPY_NO_EXPORT int PyArray_AssignFromSequence(PyArrayObject *self, PyObject *v) { @@ -589,7 +676,14 @@ PyArray_AssignFromSequence(PyArrayObject *self, PyObject *v) "assignment to 0-d array"); return -1; } - return setArrayFromSequence(self, v, 0, 0); + /* DISTNUMPY */ + if(PyDistArray_IsDist(self)) + { + npy_intp coords[NPY_MAXDIMS]; + return setDistArrayFromSequence(self, v, 0, coords); + } + else + return setArrayFromSequence(self, v, 0, 0); } /* @@ -978,14 +1072,14 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, return NULL; } - size *= dim; - - if (size > largest) { + if (dim > largest) { PyErr_SetString(PyExc_ValueError, "array is too big."); Py_DECREF(descr); return NULL; } + size *= dim; + largest /= dim; } self = (PyArrayObject *) subtype->tp_alloc(subtype, 0); @@ -998,13 +1092,18 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, self->data = NULL; if (data == NULL) { self->flags = DEFAULT; - if (flags) { + /* DISTNUMPY */ + if (flags & NPY_FORTRAN) { self->flags |= NPY_F_CONTIGUOUS; if (nd > 1) { self->flags &= ~NPY_C_CONTIGUOUS; } - flags = NPY_F_CONTIGUOUS; + flags = NPY_FORTRAN | (flags & DNPY_DIST) | + (flags & 
DNPY_DIST_ONENODE); } + /* DISTNUMPY */ + self->flags |= flags & DNPY_DIST; + self->flags |= flags & DNPY_DIST_ONENODE; } else { self->flags = (flags & ~NPY_UPDATEIFCOPY); @@ -1039,30 +1138,46 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, self->flags |= NPY_F_CONTIGUOUS; } - if (data == NULL) { - /* - * Allocate something even for zero-space arrays - * e.g. shape=(0,) -- otherwise buffer exposure - * (a.data) doesn't work as it should. - */ - - if (sd == 0) { - sd = descr->elsize; - } - data = PyDataMem_NEW(sd); - if (data == NULL) { - PyErr_NoMemory(); - goto fail; + /* DISTNUMPY */ + PyDistArray_ARRAY(self) = NULL; + self->data = data; + if (self->data == NULL) { + if(PyDistArray_WANT_DIST(self)) + { + npy_intp onedist = -1; + if(PyDistArray_WANT_ONENODE(self)) + { + onedist = PyInt_AsLong(obj); + obj = NULL; + } + if(PyDistArray_NewBaseArray(self, onedist) < 0) + goto fail; } - self->flags |= OWNDATA; + else + { + /* + * Allocate something even for zero-space arrays + * e.g. shape=(0,) -- otherwise buffer exposure + * (a.data) doesn't work as it should. 
+ */ + if (sd == 0) { + sd = descr->elsize; + } + self->data = PyDataMem_NEW(sd); + if (self->data == NULL) { + PyErr_NoMemory(); + goto fail; + } - /* - * It is bad to have unitialized OBJECT pointers - * which could also be sub-fields of a VOID array - */ - if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { - memset(data, 0, sd); + /* + * It is bad to have unitialized OBJECT pointers + * which could also be sub-fields of a VOID array + */ + if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { + memset(self->data, 0, sd); + } } + self->flags |= OWNDATA; } else { /* @@ -1070,8 +1185,29 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, * Caller must arrange for this to be reset if truly desired */ self->flags &= ~OWNDATA; + + /* DISTNUMPY */ + if(PyDistArray_WANT_DIST(self)) + { + PyErr_SetString(PyExc_RuntimeError, + "PyArray_NewFromDescr() does not support " + "creating a view based on a distributed " + "array. Only the creations of new arrays " + "are supported\n"); + goto fail; + } + } + //The array does not WANT to be distributed anymore. Now it is + //either distributed or not. 
+ self->flags &= ~DNPY_DIST; + + /* + * If the strides were provided to the function, need to + * update the flags to get the right CONTIGUOUS, ALIGN properties + */ + if (strides != NULL) { + PyArray_UpdateFlags(self, UPDATE_ALL); } - self->data = data; /* * call the __array_finalize__ @@ -1083,13 +1219,6 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, func = PyObject_GetAttrString((PyObject *)self, "__array_finalize__"); if (func && func != Py_None) { - if (strides != NULL) { - /* - * did not allocate own data or funny strides - * update flags before finalize function - */ - PyArray_UpdateFlags(self, UPDATE_ALL); - } if (NpyCapsule_Check(func)) { /* A C-function is stored here */ PyArray_FinalizeFunc *cfunc; @@ -1744,7 +1873,8 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, ret = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, newtype, ndim, dims, NULL, NULL, - flags&NPY_F_CONTIGUOUS, NULL); + /* DISTNUMPY */ + flags & (NPY_F_CONTIGUOUS | DNPY_DIST), NULL); if (ret != NULL) { if (ndim > 0) { if (PyArray_AssignFromSequence(ret, op) < 0) { @@ -1754,7 +1884,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, } else { if (PyArray_DESCR(ret)->f->setitem(op, - PyArray_DATA(ret), ret) < 0) { + PyArray_DATA(ret), ret) < 0) { Py_DECREF(ret); ret = NULL; } @@ -3548,7 +3678,8 @@ PyArray_FromString(char *data, npy_intp slen, PyArray_Descr *dtype, if (dtype == NULL) { dtype=PyArray_DescrFromType(PyArray_DEFAULT); } - if (PyDataType_FLAGCHK(dtype, NPY_ITEM_IS_POINTER)) { + if (PyDataType_FLAGCHK(dtype, NPY_ITEM_IS_POINTER) || + PyDataType_REFCHK(dtype)) { PyErr_SetString(PyExc_ValueError, "Cannot create an object array from" \ " a string"); diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index b64e1684a5ae..f04dbe8ebfdd 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -2962,8 +2962,8 @@ 
PyArray_GetDTypeTransferFunction(int aligned, } /* Handle fields */ - if (PyDataType_HASFIELDS(src_dtype) || - PyDataType_HASFIELDS(dst_dtype)) { + if ((PyDataType_HASFIELDS(src_dtype) || PyDataType_HASFIELDS(dst_dtype)) && + src_type_num != NPY_OBJECT && dst_type_num != NPY_OBJECT) { return get_fields_transfer_function(aligned, src_stride, dst_stride, src_dtype, dst_dtype, diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 98a26c3220e7..fe4922854370 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -1917,6 +1917,10 @@ parse_operand_subscripts(char *subscripts, int length, /* * Find any labels duplicated for this operand, and turn them * into negative offets to the axis to merge with. + * + * In C, the char type may be signed or unsigned, but with + * twos complement arithmetic the char is ok either way here, and + * later where it matters the char is cast to a signed char. */ for (idim = 0; idim < ndim-1; ++idim) { char *next; @@ -1928,7 +1932,7 @@ parse_operand_subscripts(char *subscripts, int length, ndim-idim-1); while (next != NULL) { /* The offset from next to out_labels[idim] (negative) */ - *next = (out_labels+idim)-next; + *next = (char)((out_labels+idim)-next); /* Search for the next matching label */ next = (char *)memchr(next+1, label, out_labels+ndim-1-next); @@ -2128,7 +2132,11 @@ get_single_op_view(PyArrayObject *op, int iop, char *labels, /* Match the labels in the operand with the output labels */ for (idim = 0; idim < ndim; ++idim) { - label = labels[idim]; + /* + * The char type may be either signed or unsigned, we + * need it to be signed here. 
+ */ + label = (signed char)labels[idim]; /* If this label says to merge axes, get the actual label */ if (label < 0) { label = labels[idim+label]; @@ -2226,7 +2234,11 @@ get_combined_dims_view(PyArrayObject *op, int iop, char *labels) /* Copy the dimensions and strides, except when collapsing */ icombine = 0; for (idim = 0; idim < ndim; ++idim) { - label = labels[idim]; + /* + * The char type may be either signed or unsigned, we + * need it to be signed here. + */ + label = (signed char)labels[idim]; /* If this label says to merge axes, get the actual label */ if (label < 0) { combineoffset = label; @@ -2877,10 +2889,15 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, } } - /* Check whether any dimensions need to be combined */ + /* + * Check whether any dimensions need to be combined + * + * The char type may be either signed or unsigned, we + * need it to be signed here. + */ combine = 0; for (idim = 0; idim < ndim; ++idim) { - if (labels[idim] < 0) { + if ((signed char)labels[idim] < 0) { combine = 1; } } @@ -2989,7 +3006,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, /***************************/ /* - * Accceleration for some specific loop structures. Note + * Acceleration for some specific loop structures. Note * that with axis coalescing, inputs with more dimensions can * be reduced to fit into these patterns. 
*/ diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 3b3a0a5ae7f9..3b57b4c3dd96 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -1786,7 +1786,6 @@ PyArray_Nonzero(PyArrayObject *self) NpyIter_IterNextFunc *iternext; NpyIter_GetMultiIndexFunc *get_multi_index; char **dataptr; - npy_intp *innersizeptr; /* Allocate the result as a 2D array */ ret_dims[0] = nonzero_count; @@ -1845,7 +1844,6 @@ PyArray_Nonzero(PyArrayObject *self) return NULL; } dataptr = NpyIter_GetDataPtrArray(iter); - innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); multi_index = (npy_intp *)PyArray_DATA(ret); diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c index cc2a0b6cdb56..9c108beb1685 100644 --- a/numpy/core/src/multiarray/iterators.c +++ b/numpy/core/src/multiarray/iterators.c @@ -179,6 +179,127 @@ parse_index(PyArrayObject *self, PyObject *op, return nd_new; } +/* DISTNUMPY (We need two extra parameters) */ +NPY_NO_EXPORT int +parse_dist_index(PyArrayObject *self, PyObject *op, npy_intp *dimensions, + npy_intp *strides, npy_intp *offset_ptr, + int *nslice, dndslice *slice) +{ + int i, j, n; + int nd_old, nd_new, n_add, n_pseudo; + npy_intp n_steps, start, offset, step_size; + PyObject *op1 = NULL; + int is_slice; + int ret_nslice = 0; + + if (PySlice_Check(op) || op == Py_Ellipsis || op == Py_None) { + n = 1; + op1 = op; + Py_INCREF(op); + /* this relies on the fact that n==1 for loop below */ + is_slice = 1; + } + else { + if (!PySequence_Check(op)) { + PyErr_SetString(PyExc_IndexError, + "index must be either an int "\ + "or a sequence"); + return -1; + } + n = PySequence_Length(op); + is_slice = 0; + } + + nd_old = nd_new = 0; + + offset = 0; + for (i = 0; i < n; i++) { + if (!is_slice) { + if (!(op1=PySequence_GetItem(op, i))) { + PyErr_SetString(PyExc_IndexError, + "invalid index"); + return -1; + } + } + start = 
parse_subindex(op1, &step_size, &n_steps, + nd_old < self->nd ? + self->dimensions[nd_old] : 0); + Py_DECREF(op1); + if (start == -1) { + break; + } + if (n_steps == PseudoIndex) { + dimensions[nd_new] = 1; strides[nd_new] = 0; + slice[ret_nslice].start = 0; + slice[ret_nslice].step = 0; + slice[ret_nslice].nsteps = PseudoIndex; + nd_new++; ret_nslice++; + } + else { + if (n_steps == RubberIndex) { + for (j = i + 1, n_pseudo = 0; j < n; j++) { + op1 = PySequence_GetItem(op, j); + if (op1 == Py_None) { + n_pseudo++; + } + Py_DECREF(op1); + } + n_add = self->nd-(n-i-n_pseudo-1+nd_old); + if (n_add < 0) { + PyErr_SetString(PyExc_IndexError, + "too many indices"); + return -1; + } + for (j = 0; j < n_add; j++) { + dimensions[nd_new] = \ + self->dimensions[nd_old]; + strides[nd_new] = \ + self->strides[nd_old]; + + slice[ret_nslice].start = 0; + slice[ret_nslice].step = 1; + slice[ret_nslice].nsteps = self->dimensions[nd_old]; + nd_new++; nd_old++; ret_nslice++; + } + } + else { + if (nd_old >= self->nd) { + PyErr_SetString(PyExc_IndexError, + "too many indices"); + return -1; + } + offset += self->strides[nd_old]*start; + nd_old++; + if (n_steps != SingleIndex) { + dimensions[nd_new] = n_steps; + strides[nd_new] = step_size * \ + self->strides[nd_old-1]; + nd_new++; + } + slice[ret_nslice].start = start; + slice[ret_nslice].step = step_size; + slice[ret_nslice].nsteps = n_steps; + ret_nslice++; + } + } + } + if (i < n) { + return -1; + } + n_add = self->nd-nd_old; + for (j = 0; j < n_add; j++) { + dimensions[nd_new] = self->dimensions[nd_old]; + strides[nd_new] = self->strides[nd_old]; + slice[ret_nslice].start = 0; + slice[ret_nslice].step = 1; + slice[ret_nslice].nsteps = self->dimensions[nd_old]; + nd_new++; nd_old++; ret_nslice++; + } + *offset_ptr = offset; + *nslice = ret_nslice; + return nd_new; +} + static int slice_coerce_index(PyObject *o, npy_intp *v) { @@ -889,13 +1010,11 @@ static int iter_ass_sub_int(PyArrayIterObject *self, PyArrayObject *ind, 
PyArrayIterObject *val, int swap) { - PyArray_Descr *typecode; npy_intp num; PyArrayIterObject *ind_it; npy_intp index; PyArray_CopySwapFunc *copyswap; - typecode = self->ao->descr; copyswap = self->ao->descr->f->copyswap; if (ind->nd == 0) { num = *((npy_intp *)ind->data); diff --git a/numpy/core/src/multiarray/iterators.h b/numpy/core/src/multiarray/iterators.h index 3099425c5538..e8cf028263b5 100644 --- a/numpy/core/src/multiarray/iterators.h +++ b/numpy/core/src/multiarray/iterators.h @@ -19,4 +19,10 @@ slice_GetIndices(PySliceObject *r, intp length, intp *start, intp *stop, intp *step, intp *slicelength); +/* DISTNUMPY */ +NPY_NO_EXPORT int +parse_dist_index(PyArrayObject *self, PyObject *op, npy_intp *dimensions, + npy_intp *strides, npy_intp *offset_ptr, + int *nslice, dndslice *slice); + #endif diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 31472333de45..42281e4ae274 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -64,6 +64,16 @@ array_big_item(PyArrayObject *self, intp i) if (r == NULL) { return NULL; } + /* DISTNUMPY */ + if(PyDistArray_IsDist(self)) + { + //Lets make a slice covering the whole 'self' array beside + //the Single Index 'i'. + dndslice slice = {i, 0, SingleIndex}; + //And then create the new view based on 'self'. 
+ if(PyDistArray_NewViewArray(self, r, 1, &slice) == -1) + return NULL; + } Py_INCREF(self); r->base = (PyObject *)self; PyArray_UpdateFlags(r, CONTIGUOUS | FORTRAN); @@ -503,6 +513,9 @@ NPY_NO_EXPORT PyObject * array_subscript_simple(PyArrayObject *self, PyObject *op) { intp dimensions[MAX_DIMS], strides[MAX_DIMS]; + /* DISTNUMPY */ + dndslice slice[MAX_DIMS]; + int nslice=0; intp offset; int nd; PyArrayObject *other; @@ -515,9 +528,18 @@ array_subscript_simple(PyArrayObject *self, PyObject *op) PyErr_Clear(); /* Standard (view-based) Indexing */ - if ((nd = parse_index(self, op, dimensions, strides, &offset)) == -1) { + if(PyDistArray_IsDist(self))/* DISTNUMPY */ + { + if((nd = parse_dist_index(self, op, dimensions, strides, + &offset, &nslice, slice)) == -1) return NULL; } + else + { + if((nd = parse_index(self, op, dimensions, strides, + &offset)) == -1) + return NULL; + } /* This will only work if new array will be a view */ Py_INCREF(self->descr); if ((other = (PyArrayObject *) @@ -528,6 +550,12 @@ array_subscript_simple(PyArrayObject *self, PyObject *op) (PyObject *)self)) == NULL) { return NULL; } + /* DISTNUMPY */ + if(PyDistArray_IsDist(self)) + { + if(PyDistArray_NewViewArray(self, other, nslice, slice) == -1) + return NULL; + } other->base = (PyObject *)self; Py_INCREF(self); PyArray_UpdateFlags(other, UPDATE_ALL); diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index ed86e208c908..ed2ee84a60b0 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -18,6 +18,8 @@ #include "methods.h" +/* DISTNUMPY */ +#include "distnumpy.h" /* NpyArg_ParseKeywords * @@ -2114,6 +2116,20 @@ array_setflags(PyArrayObject *self, PyObject *args, PyObject *kwds) return Py_None; } +/* DISTNUMPY */ +static PyObject * +array_undist(PyArrayObject *self, PyObject *args) +{ + //No arguments. 
+ if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + if(PyDistArray_UnDist(PyDistArray_ARRAY(self)->base) == -1) + return NULL; + + return Py_None; +} static PyObject * array_newbyteorder(PyArrayObject *self, PyObject *args) @@ -2325,6 +2341,10 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = { {"view", (PyCFunction)array_view, METH_VARARGS | METH_KEYWORDS, NULL}, + /* DISTNUMPY */ + {"undist", + (PyCFunction)array_undist, + METH_VARARGS, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/numpy/core/src/multiarray/multiarray_tests.c.src b/numpy/core/src/multiarray/multiarray_tests.c.src index f99cb98ad023..d6340025c2fb 100644 --- a/numpy/core/src/multiarray/multiarray_tests.c.src +++ b/numpy/core/src/multiarray/multiarray_tests.c.src @@ -44,7 +44,6 @@ static int copy_@type@(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *ni ptr += 1; } - Py_INCREF(aout); PyList_Append(*out, (PyObject*)aout); Py_DECREF(aout); PyArray_ITER_NEXT(itx); @@ -84,7 +83,6 @@ static int copy_object(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *ni PyArrayNeighborhoodIter_Next(niterx); } - Py_INCREF(aout); PyList_Append(*out, (PyObject*)aout); Py_DECREF(aout); PyArray_ITER_NEXT(itx); @@ -238,7 +236,6 @@ copy_double_double(PyArrayNeighborhoodIterObject *itx, ptr += 1; PyArrayNeighborhoodIter_Next(niterx); } - Py_INCREF(aout); PyList_Append(*out, (PyObject*)aout); Py_DECREF(aout); PyArrayNeighborhoodIter_Next(itx); diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 28c596d9c259..f39635afe131 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -45,6 +45,9 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "convert_datatype.h" #include "nditer_pywrap.h" +/* DISTNUMPY */ +#include "distnumpy.h" + /* Only here for API compatibility */ NPY_NO_EXPORT PyTypeObject PyBigArray_Type; @@ -883,68 +886,51 @@ PyArray_MatrixProduct(PyObject *op1, PyObject *op2) 
} /*NUMPY_API - * Fast Copy and Transpose + * Copy and Transpose + * + * Could deprecate this function, as there isn't a speed benefit over + * calling Transpose and then Copy. */ NPY_NO_EXPORT PyObject * PyArray_CopyAndTranspose(PyObject *op) { - PyObject *ret, *arr; - int nd; - npy_intp dims[2]; - npy_intp i,j; - int elsize, str2; - char *iptr; - char *optr; + PyArrayObject *arr, *tmp, *ret; + int i; + npy_intp new_axes_values[NPY_MAXDIMS]; + PyArray_Dims new_axes; - /* make sure it is well-behaved */ - arr = PyArray_FromAny(op, NULL, 0, 0, CARRAY, NULL); + /* Make sure we have an array */ + arr = (PyArrayObject *)PyArray_FromAny(op, NULL, 0, 0, 0, NULL); if (arr == NULL) { return NULL; } - nd = PyArray_NDIM(arr); - if (nd == 1) { - /* we will give in to old behavior */ - ret = PyArray_Copy((PyArrayObject *)arr); - Py_DECREF(arr); - return ret; - } - else if (nd != 2) { - Py_DECREF(arr); - PyErr_SetString(PyExc_ValueError, - "only 2-d arrays are allowed"); - return NULL; - } - /* Now construct output array */ - dims[0] = PyArray_DIM(arr,1); - dims[1] = PyArray_DIM(arr,0); - elsize = PyArray_ITEMSIZE(arr); - Py_INCREF(PyArray_DESCR(arr)); - ret = PyArray_NewFromDescr(Py_TYPE(arr), - PyArray_DESCR(arr), - 2, dims, - NULL, NULL, 0, arr); - if (ret == NULL) { - Py_DECREF(arr); - return NULL; - } + if (PyArray_NDIM(arr) > 1) { + /* Set up the transpose operation */ + new_axes.len = PyArray_NDIM(arr); + for (i = 0; i < new_axes.len; ++i) { + new_axes_values[i] = new_axes.len - i - 1; + } + new_axes.ptr = new_axes_values; - /* do 2-d loop */ - NPY_BEGIN_ALLOW_THREADS; - optr = PyArray_DATA(ret); - str2 = elsize*dims[0]; - for (i = 0; i < dims[0]; i++) { - iptr = PyArray_BYTES(arr) + i*elsize; - for (j = 0; j < dims[1]; j++) { - /* optr[i,j] = iptr[j,i] */ - memcpy(optr, iptr, elsize); - optr += elsize; - iptr += str2; + /* Do the transpose (always returns a view) */ + tmp = (PyArrayObject *)PyArray_Transpose(arr, &new_axes); + if (tmp == NULL) { + Py_DECREF(arr); + 
return NULL; } } - NPY_END_ALLOW_THREADS; - Py_DECREF(arr); - return ret; + else { + tmp = arr; + arr = NULL; + } + + /* TODO: Change this to NPY_KEEPORDER for NumPy 2.0 */ + ret = (PyArrayObject *)PyArray_NewCopy(tmp, NPY_CORDER); + + Py_XDECREF(arr); + Py_DECREF(tmp); + return (PyObject *)ret; } /* @@ -1582,9 +1568,11 @@ static PyObject * _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) { PyObject *op, *ret = NULL; + /* DISTNUMPY */ static char *kwd[]= {"object", "dtype", "copy", "order", "subok", - "ndmin", NULL}; + "ndmin", "dist", NULL}; Bool subok = FALSE; + Bool dist = FALSE; Bool copy = TRUE; int ndmin = 0, nd; PyArray_Descr *type = NULL; @@ -1597,12 +1585,15 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) "only 2 non-keyword arguments accepted"); return NULL; } - if(!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i", kwd, &op, - PyArray_DescrConverter2, &type, - PyArray_BoolConverter, ©, - PyArray_OrderConverter, &order, - PyArray_BoolConverter, &subok, - &ndmin)) { + if(!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&iO&", kwd, &op, + PyArray_DescrConverter2, + &type, + PyArray_BoolConverter, ©, + PyArray_OrderConverter, &order, + PyArray_BoolConverter, &subok, + &ndmin, + /* DISTNUMPY */ + PyArray_BoolConverter, &dist)) { goto clean_type; } @@ -1613,8 +1604,9 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) goto clean_type; } /* fast exit if simple call */ - if ((subok && PyArray_Check(op)) - || (!subok && PyArray_CheckExact(op))) { + if(!dist && ((subok && PyArray_Check(op)) || + (!subok && PyArray_CheckExact(op)))) + { if (type == NULL) { if (!copy && STRIDING_OK(op, order)) { Py_INCREF(op); @@ -1664,6 +1656,11 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) flags |= NPY_FORCECAST; Py_XINCREF(type); + + /* DISTNUMPY */ + if(dist) + flags |= DNPY_DIST; + ret = PyArray_CheckFromAny(op, type, 0, 0, flags, NULL); 
finish: @@ -1688,18 +1685,26 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) static PyObject * array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) { - - static char *kwlist[] = {"shape","dtype","order",NULL}; + /* DISTNUMPY */ + static char *kwlist[] = {"shape","dtype","order","dist","onerank",NULL}; PyArray_Descr *typecode = NULL; PyArray_Dims shape = {NULL, 0}; NPY_ORDER order = NPY_CORDER; Bool fortran; PyObject *ret = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&", kwlist, - PyArray_IntpConverter, &shape, - PyArray_DescrConverter, &typecode, - PyArray_OrderConverter, &order)) { + Bool dist = FALSE; + int flags = 0; + PyObject *onedist = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&O&O", + kwlist, PyArray_IntpConverter, + &shape, + PyArray_DescrConverter, + &typecode, + PyArray_OrderConverter, &order, + /* DISTNUMPY */ + PyArray_BoolConverter, &dist, + &onedist)) { goto fail; } @@ -1716,7 +1721,18 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) goto fail; } - ret = PyArray_Empty(shape.len, shape.ptr, typecode, fortran); + /* DISTNUMPY */ + if(fortran) + flags |= NPY_FORTRAN; + + if(dist) + { + flags |= DNPY_DIST; + if(onedist != NULL) + flags |= DNPY_DIST_ONENODE; + } + + ret = PyArray_Empty(shape.len, shape.ptr, typecode, flags); PyDimMem_FREE(shape.ptr); return ret; @@ -3874,6 +3890,13 @@ PyMODINIT_FUNC initmultiarray(void) { if (set_typeinfo(d) != 0) { goto err; } + + //DISTNUMPY + if(import_distnumpy()) + goto err; + PyDistArray_Init(); + Py_AtExit(PyDistArray_Exit); + return RETVAL; err: diff --git a/numpy/core/src/multiarray/nditer.c.src b/numpy/core/src/multiarray/nditer.c.src index e12767fbbd8f..6078952c4ac6 100644 --- a/numpy/core/src/multiarray/nditer.c.src +++ b/numpy/core/src/multiarray/nditer.c.src @@ -115,7 +115,7 @@ /* * The data layout of the iterator is fully specified by - * a triple (itflags, ndim, niter). 
These three variables + * a triple (itflags, ndim, nop). These three variables * are expected to exist in all functions calling these macros, * either as true variables initialized to the correct values * from the iterator, or as constants in the case of specialized @@ -125,7 +125,7 @@ struct NpyIter_InternalOnly { /* Initial fixed position data */ npy_uint32 itflags; - npy_uint16 ndim, niter; + npy_uint16 ndim, nop; npy_intp itersize, iterstart, iterend; /* iterindex is only used if RANGED or BUFFERED is set */ npy_intp iterindex; @@ -137,53 +137,53 @@ typedef struct NpyIter_AD NpyIter_AxisData; typedef struct NpyIter_BD NpyIter_BufferData; /* Byte sizes of the iterator members */ -#define NIT_PERM_SIZEOF(itflags, ndim, niter) \ +#define NIT_PERM_SIZEOF(itflags, ndim, nop) \ NPY_INTP_ALIGNED(NPY_MAXDIMS) -#define NIT_DTYPES_SIZEOF(itflags, ndim, niter) \ - ((NPY_SIZEOF_INTP)*(niter)) -#define NIT_RESETDATAPTR_SIZEOF(itflags, ndim, niter) \ - ((NPY_SIZEOF_INTP)*(niter+1)) -#define NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter) \ - ((NPY_SIZEOF_INTP)*(niter+1)) -#define NIT_OPERANDS_SIZEOF(itflags, ndim, niter) \ - ((NPY_SIZEOF_INTP)*(niter)) -#define NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter) \ - (NPY_INTP_ALIGNED(niter)) -#define NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter) \ - ((itflags&NPY_ITFLAG_BUFFER) ? ((NPY_SIZEOF_INTP)*(6 + 9*niter)) : 0) +#define NIT_DTYPES_SIZEOF(itflags, ndim, nop) \ + ((NPY_SIZEOF_INTP)*(nop)) +#define NIT_RESETDATAPTR_SIZEOF(itflags, ndim, nop) \ + ((NPY_SIZEOF_INTP)*(nop+1)) +#define NIT_BASEOFFSETS_SIZEOF(itflags, ndim, nop) \ + ((NPY_SIZEOF_INTP)*(nop+1)) +#define NIT_OPERANDS_SIZEOF(itflags, ndim, nop) \ + ((NPY_SIZEOF_INTP)*(nop)) +#define NIT_OPITFLAGS_SIZEOF(itflags, ndim, nop) \ + (NPY_INTP_ALIGNED(nop)) +#define NIT_BUFFERDATA_SIZEOF(itflags, ndim, nop) \ + ((itflags&NPY_ITFLAG_BUFFER) ? 
((NPY_SIZEOF_INTP)*(6 + 9*nop)) : 0) /* Byte offsets of the iterator members starting from iter->iter_flexdata */ #define NIT_PERM_OFFSET() \ (0) -#define NIT_DTYPES_OFFSET(itflags, ndim, niter) \ +#define NIT_DTYPES_OFFSET(itflags, ndim, nop) \ (NIT_PERM_OFFSET() + \ - NIT_PERM_SIZEOF(itflags, ndim, niter)) -#define NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter) \ - (NIT_DTYPES_OFFSET(itflags, ndim, niter) + \ - NIT_DTYPES_SIZEOF(itflags, ndim, niter)) -#define NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) \ - (NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter) + \ - NIT_RESETDATAPTR_SIZEOF(itflags, ndim, niter)) -#define NIT_OPERANDS_OFFSET(itflags, ndim, niter) \ - (NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) + \ - NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter)) -#define NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) \ - (NIT_OPERANDS_OFFSET(itflags, ndim, niter) + \ - NIT_OPERANDS_SIZEOF(itflags, ndim, niter)) -#define NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) \ - (NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) + \ - NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter)) -#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \ - (NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) + \ - NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter)) + NIT_PERM_SIZEOF(itflags, ndim, nop)) +#define NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop) \ + (NIT_DTYPES_OFFSET(itflags, ndim, nop) + \ + NIT_DTYPES_SIZEOF(itflags, ndim, nop)) +#define NIT_BASEOFFSETS_OFFSET(itflags, ndim, nop) \ + (NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop) + \ + NIT_RESETDATAPTR_SIZEOF(itflags, ndim, nop)) +#define NIT_OPERANDS_OFFSET(itflags, ndim, nop) \ + (NIT_BASEOFFSETS_OFFSET(itflags, ndim, nop) + \ + NIT_BASEOFFSETS_SIZEOF(itflags, ndim, nop)) +#define NIT_OPITFLAGS_OFFSET(itflags, ndim, nop) \ + (NIT_OPERANDS_OFFSET(itflags, ndim, nop) + \ + NIT_OPERANDS_SIZEOF(itflags, ndim, nop)) +#define NIT_BUFFERDATA_OFFSET(itflags, ndim, nop) \ + (NIT_OPITFLAGS_OFFSET(itflags, ndim, nop) + \ + NIT_OPITFLAGS_SIZEOF(itflags, ndim, nop)) +#define 
NIT_AXISDATA_OFFSET(itflags, ndim, nop) \ + (NIT_BUFFERDATA_OFFSET(itflags, ndim, nop) + \ + NIT_BUFFERDATA_SIZEOF(itflags, ndim, nop)) /* Internal-only ITERATOR DATA MEMBER ACCESS */ #define NIT_ITFLAGS(iter) \ ((iter)->itflags) #define NIT_NDIM(iter) \ ((iter)->ndim) -#define NIT_NITER(iter) \ - ((iter)->niter) +#define NIT_NOP(iter) \ + ((iter)->nop) #define NIT_ITERSIZE(iter) \ (iter->itersize) #define NIT_ITERSTART(iter) \ @@ -192,22 +192,22 @@ typedef struct NpyIter_BD NpyIter_BufferData; (iter->iterend) #define NIT_ITERINDEX(iter) \ (iter->iterindex) -#define NIT_PERM(iter) ((char*)( \ +#define NIT_PERM(iter) ((npy_int8 *)( \ &(iter)->iter_flexdata + NIT_PERM_OFFSET())) #define NIT_DTYPES(iter) ((PyArray_Descr **)( \ - &(iter)->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, nop))) #define NIT_RESETDATAPTR(iter) ((char **)( \ - &(iter)->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop))) #define NIT_BASEOFFSETS(iter) ((npy_intp *)( \ - &(iter)->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, nop))) #define NIT_OPERANDS(iter) ((PyArrayObject **)( \ - &(iter)->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, nop))) #define NIT_OPITFLAGS(iter) ( \ - &(iter)->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, niter)) + &(iter)->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, nop)) #define NIT_BUFFERDATA(iter) ((NpyIter_BufferData *)( \ - &(iter)->iter_flexdata + NIT_BUFFERDATA_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_BUFFERDATA_OFFSET(itflags, ndim, nop))) #define NIT_AXISDATA(iter) ((NpyIter_AxisData *)( \ - &(iter)->iter_flexdata + NIT_AXISDATA_OFFSET(itflags, ndim, niter))) + &(iter)->iter_flexdata + NIT_AXISDATA_OFFSET(itflags, 
ndim, nop))) /* Internal-only BUFFERDATA MEMBER ACCESS */ struct NpyIter_BD { @@ -224,21 +224,21 @@ struct NpyIter_BD { #define NBF_STRIDES(bufferdata) ( \ &(bufferdata)->bd_flexdata + 0) #define NBF_PTRS(bufferdata) ((char **) \ - (&(bufferdata)->bd_flexdata + 1*(niter))) + (&(bufferdata)->bd_flexdata + 1*(nop))) #define NBF_REDUCE_OUTERSTRIDES(bufferdata) ( \ - (&(bufferdata)->bd_flexdata + 2*(niter))) + (&(bufferdata)->bd_flexdata + 2*(nop))) #define NBF_REDUCE_OUTERPTRS(bufferdata) ((char **) \ - (&(bufferdata)->bd_flexdata + 3*(niter))) + (&(bufferdata)->bd_flexdata + 3*(nop))) #define NBF_READTRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \ - (&(bufferdata)->bd_flexdata + 4*(niter))) + (&(bufferdata)->bd_flexdata + 4*(nop))) #define NBF_READTRANSFERDATA(bufferdata) ((void **) \ - (&(bufferdata)->bd_flexdata + 5*(niter))) + (&(bufferdata)->bd_flexdata + 5*(nop))) #define NBF_WRITETRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \ - (&(bufferdata)->bd_flexdata + 6*(niter))) + (&(bufferdata)->bd_flexdata + 6*(nop))) #define NBF_WRITETRANSFERDATA(bufferdata) ((void **) \ - (&(bufferdata)->bd_flexdata + 7*(niter))) + (&(bufferdata)->bd_flexdata + 7*(nop))) #define NBF_BUFFERS(bufferdata) ((char **) \ - (&(bufferdata)->bd_flexdata + 8*(niter))) + (&(bufferdata)->bd_flexdata + 8*(nop))) /* Internal-only AXISDATA MEMBER ACCESS. */ struct NpyIter_AD { @@ -250,25 +250,25 @@ struct NpyIter_AD { #define NAD_STRIDES(axisdata) ( \ &(axisdata)->ad_flexdata + 0) #define NAD_PTRS(axisdata) ((char **) \ - &(axisdata)->ad_flexdata + 1*(niter+1)) + &(axisdata)->ad_flexdata + 1*(nop+1)) #define NAD_NSTRIDES() \ - ((niter) + ((itflags&NPY_ITFLAG_HASINDEX) ? 1 : 0)) + ((nop) + ((itflags&NPY_ITFLAG_HASINDEX) ? 
1 : 0)) /* Size of one AXISDATA struct within the iterator */ -#define NIT_AXISDATA_SIZEOF(itflags, ndim, niter) (( \ +#define NIT_AXISDATA_SIZEOF(itflags, ndim, nop) (( \ /* intp shape */ \ 1 + \ /* intp index */ \ 1 + \ - /* intp stride[niter+1] AND char* ptr[niter+1] */ \ - 2*((niter)+1) \ + /* intp stride[nop+1] AND char* ptr[nop+1] */ \ + 2*((nop)+1) \ )*NPY_SIZEOF_INTP ) /* * Macro to advance an AXISDATA pointer by a specified count. * Requires that sizeof_axisdata be previously initialized - * to NIT_AXISDATA_SIZEOF(itflags, ndim, niter). + * to NIT_AXISDATA_SIZEOF(itflags, ndim, nop). */ #define NIT_INDEX_AXISDATA(axisdata, index) ((NpyIter_AxisData *) \ (((char *)(axisdata)) + (index)*sizeof_axisdata)) @@ -276,19 +276,19 @@ struct NpyIter_AD { axisdata = NIT_INDEX_AXISDATA(axisdata, count) /* Size of the whole iterator */ -#define NIT_SIZEOF_ITERATOR(itflags, ndim, niter) ( \ +#define NIT_SIZEOF_ITERATOR(itflags, ndim, nop) ( \ sizeof(struct NpyIter_InternalOnly) + \ - NIT_AXISDATA_OFFSET(itflags, ndim, niter) + \ - NIT_AXISDATA_SIZEOF(itflags, ndim, niter)*(ndim)) + NIT_AXISDATA_OFFSET(itflags, ndim, nop) + \ + NIT_AXISDATA_SIZEOF(itflags, ndim, nop)*(ndim)) /* Internal helper functions */ static int npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags); static int -npyiter_check_op_axes(int niter, int oa_ndim, int **op_axes, +npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes, npy_intp *itershape); static int -npyiter_calculate_ndim(int niter, PyArrayObject **op_in, +npyiter_calculate_ndim(int nop, PyArrayObject **op_in, int oa_ndim); static int npyiter_check_per_op_flags(npy_uint32 flags, char *op_itflags); @@ -300,7 +300,7 @@ npyiter_prepare_one_operand(PyArrayObject **op, npy_uint32 flags, npy_uint32 op_flags, char *op_itflags); static int -npyiter_prepare_operands(int niter, PyArrayObject **op_in, +npyiter_prepare_operands(int nop, PyArrayObject **op_in, PyArrayObject **op, char **op_dataptr, PyArray_Descr **op_request_dtypes, @@ 
-308,7 +308,7 @@ npyiter_prepare_operands(int niter, PyArrayObject **op_in, npy_uint32 flags, npy_uint32 *op_flags, char *op_itflags); static int -npyiter_check_casting(int niter, PyArrayObject **op, +npyiter_check_casting(int nop, PyArrayObject **op, PyArray_Descr **op_dtype, NPY_CASTING casting, char *op_itflags); @@ -319,7 +319,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, npy_intp *itershape, int output_scalars); static void -npyiter_replace_axisdata(NpyIter *iter, int iiter, +npyiter_replace_axisdata(NpyIter *iter, int iop, PyArrayObject *op, int op_ndim, char *op_dataptr, int *op_axes); @@ -338,7 +338,7 @@ static void npyiter_coalesce_axes(NpyIter *iter); static PyArray_Descr * -npyiter_get_common_dtype(int niter, PyArrayObject **op, +npyiter_get_common_dtype(int nop, PyArrayObject **op, char *op_itflags, PyArray_Descr **op_dtype, PyArray_Descr **op_request_dtypes, int only_inputs, int output_scalars); @@ -355,7 +355,7 @@ npyiter_allocate_arrays(NpyIter *iter, npy_uint32 *op_flags, char *op_itflags, int **op_axes, int output_scalars); static void -npyiter_get_priority_subtype(int niter, PyArrayObject **op, +npyiter_get_priority_subtype(int nop, PyArrayObject **op, char *op_itflags, double *subtype_priority, PyTypeObject **subtype); @@ -378,7 +378,7 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, * options for controlling the broadcasting, shape, and buffer size. 
*/ NPY_NO_EXPORT NpyIter * -NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, +NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes, @@ -387,7 +387,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, { npy_uint32 itflags = NPY_ITFLAG_IDENTPERM; int idim, ndim; - int iiter; + int iop; /* The iterator being constructed */ NpyIter *iter; @@ -398,7 +398,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, char *op_itflags; char **op_dataptr; - char *perm; + npy_int8 *perm; NpyIter_BufferData *bufferdata = NULL; int any_allocate = 0, any_missing_dtypes = 0, output_scalars = 0, need_subtype = 0; @@ -429,15 +429,15 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_start); - if (niter > NPY_MAXARGS) { + if (nop > NPY_MAXARGS) { PyErr_Format(PyExc_ValueError, "Cannot construct an iterator with more than %d operands " - "(%d were requested)", (int)NPY_MAXARGS, (int)niter); + "(%d were requested)", (int)NPY_MAXARGS, (int)nop); return NULL; } /* Error check 'oa_ndim' and 'op_axes', which must be used together */ - if (!npyiter_check_op_axes(niter, oa_ndim, op_axes, itershape)) { + if (!npyiter_check_op_axes(nop, oa_ndim, op_axes, itershape)) { return NULL; } @@ -451,7 +451,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_check_global_flags); /* Calculate how many dimensions the iterator should have */ - ndim = npyiter_calculate_ndim(niter, op_in, oa_ndim); + ndim = npyiter_calculate_ndim(nop, op_in, oa_ndim); /* If 'ndim' is zero, any outputs should be scalars */ if (ndim == 0) { @@ -463,16 +463,16 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, /* Allocate memory for the iterator */ iter = (NpyIter*) - PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter)); + 
PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, nop)); NPY_IT_TIME_POINT(c_malloc); /* Fill in the basic data */ NIT_ITFLAGS(iter) = itflags; NIT_NDIM(iter) = ndim; - NIT_NITER(iter) = niter; + NIT_NOP(iter) = nop; NIT_ITERINDEX(iter) = 0; - memset(NIT_BASEOFFSETS(iter), 0, (niter+1)*NPY_SIZEOF_INTP); + memset(NIT_BASEOFFSETS(iter), 0, (nop+1)*NPY_SIZEOF_INTP); op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); @@ -480,7 +480,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, op_dataptr = NIT_RESETDATAPTR(iter); /* Prepare all the operands */ - if (!npyiter_prepare_operands(niter, op_in, op, op_dataptr, + if (!npyiter_prepare_operands(nop, op_in, op, op_dataptr, op_request_dtypes, op_dtype, flags, op_flags, op_itflags)) { @@ -488,7 +488,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } /* Set resetindex to zero as well (it's just after the resetdataptr) */ - op_dataptr[niter] = 0; + op_dataptr[nop] = 0; NPY_IT_TIME_POINT(c_prepare_operands); @@ -499,9 +499,9 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, if (itflags&NPY_ITFLAG_BUFFER) { bufferdata = NIT_BUFFERDATA(iter); NBF_SIZE(bufferdata) = 0; - memset(NBF_BUFFERS(bufferdata), 0, niter*NPY_SIZEOF_INTP); - memset(NBF_READTRANSFERDATA(bufferdata), 0, niter*NPY_SIZEOF_INTP); - memset(NBF_WRITETRANSFERDATA(bufferdata), 0, niter*NPY_SIZEOF_INTP); + memset(NBF_BUFFERS(bufferdata), 0, nop*NPY_SIZEOF_INTP); + memset(NBF_READTRANSFERDATA(bufferdata), 0, nop*NPY_SIZEOF_INTP); + memset(NBF_WRITETRANSFERDATA(bufferdata), 0, nop*NPY_SIZEOF_INTP); } /* Fill in the AXISDATA arrays and set the ITERSIZE field */ @@ -542,7 +542,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, /* Initialize the perm to the identity */ perm = NIT_PERM(iter); for(idim = 0; idim < ndim; ++idim) { - perm[idim] = (char)idim; + perm[idim] = (npy_int8)idim; } /* @@ -554,19 +554,19 @@ NpyIter_AdvancedNew(int niter, PyArrayObject 
**op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_apply_forced_iteration_order); /* Set some flags for allocated outputs */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] == NULL) { + for (iop = 0; iop < nop; ++iop) { + if (op[iop] == NULL) { /* Flag this so later we can avoid flipping axes */ any_allocate = 1; /* If a subtype may be used, indicate so */ - if (!(op_flags[iiter]&NPY_ITER_NO_SUBTYPE)) { + if (!(op_flags[iop]&NPY_ITER_NO_SUBTYPE)) { need_subtype = 1; } /* * If the data type wasn't provided, will need to * calculate it. */ - if (op_dtype[iiter] == NULL) { + if (op_dtype[iop] == NULL) { any_missing_dtypes = 1; } } @@ -593,7 +593,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_find_best_axis_ordering); if (need_subtype) { - npyiter_get_priority_subtype(niter, op, op_itflags, + npyiter_get_priority_subtype(nop, op, op_itflags, &subtype_priority, &subtype); } @@ -610,7 +610,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); - dtype = npyiter_get_common_dtype(niter, op, + dtype = npyiter_get_common_dtype(nop, op, op_itflags, op_dtype, op_request_dtypes, only_inputs, @@ -622,21 +622,21 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, if (flags&NPY_ITER_COMMON_DTYPE) { NPY_IT_DBG_PRINT("Iterator: Replacing all data types\n"); /* Replace all the data types */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op_dtype[iiter] != dtype) { - Py_XDECREF(op_dtype[iiter]); + for (iop = 0; iop < nop; ++iop) { + if (op_dtype[iop] != dtype) { + Py_XDECREF(op_dtype[iop]); Py_INCREF(dtype); - op_dtype[iiter] = dtype; + op_dtype[iop] = dtype; } } } else { NPY_IT_DBG_PRINT("Iterator: Setting unset output data types\n"); /* Replace the NULL data types */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op_dtype[iiter] == NULL) { + for (iop = 0; iop < nop; ++iop) { + if (op_dtype[iop] == NULL) { Py_INCREF(dtype); 
- op_dtype[iiter] = dtype; + op_dtype[iop] = dtype; } } } @@ -650,7 +650,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, * to check that data type conversions are following the * casting rules. */ - if (!npyiter_check_casting(niter, op, op_dtype, casting, op_itflags)) { + if (!npyiter_check_casting(nop, op, op_dtype, casting, op_itflags)) { NpyIter_Deallocate(iter); return NULL; } @@ -712,8 +712,8 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, * reference arrays and flag it if so. */ if (flags&NPY_ITER_REFS_OK) { - for (iiter = 0; iiter < niter; ++iiter) { - PyArray_Descr *rdt = op_dtype[iiter]; + for (iop = 0; iop < nop; ++iop) { + PyArray_Descr *rdt = op_dtype[iop]; if ((rdt->flags&(NPY_ITEM_REFCOUNT| NPY_ITEM_IS_POINTER| NPY_NEEDS_PYAPI)) != 0) { @@ -732,7 +732,7 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, if (itflags&NPY_ITFLAG_DELAYBUF) { bufferdata = NIT_BUFFERDATA(iter); /* Make the data pointers NULL */ - memset(NBF_PTRS(bufferdata), 0, niter*NPY_SIZEOF_INTP); + memset(NBF_PTRS(bufferdata), 0, nop*NPY_SIZEOF_INTP); } else { /* Allocate the buffers */ @@ -777,12 +777,12 @@ NpyIter_AdvancedNew(int niter, PyArrayObject **op_in, npy_uint32 flags, * standard NumPy broadcasting rules and the default buffer size. 
*/ NPY_NO_EXPORT NpyIter * -NpyIter_MultiNew(int niter, PyArrayObject **op_in, npy_uint32 flags, +NpyIter_MultiNew(int nop, PyArrayObject **op_in, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes) { - return NpyIter_AdvancedNew(niter, op_in, flags, order, casting, + return NpyIter_AdvancedNew(nop, op_in, flags, order, casting, op_flags, op_request_dtypes, 0, NULL, NULL, 0); } @@ -812,7 +812,7 @@ NpyIter_Copy(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); int out_of_memory = 0; npy_intp size; @@ -821,7 +821,7 @@ NpyIter_Copy(NpyIter *iter) PyArray_Descr **dtypes; /* Allocate memory for the new iterator */ - size = NIT_SIZEOF_ITERATOR(itflags, ndim, niter); + size = NIT_SIZEOF_ITERATOR(itflags, ndim, nop); newiter = (NpyIter*)PyArray_malloc(size); /* Copy the raw values to the new iterator */ @@ -830,9 +830,9 @@ NpyIter_Copy(NpyIter *iter) /* Take ownership of references to the operands and dtypes */ objects = NIT_OPERANDS(newiter); dtypes = NIT_DTYPES(newiter); - for (iiter = 0; iiter < niter; ++iiter) { - Py_INCREF(objects[iiter]); - Py_INCREF(dtypes[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_INCREF(objects[iop]); + Py_INCREF(dtypes[iop]); } /* Allocate buffers and make copies of the transfer data if necessary */ @@ -848,41 +848,41 @@ NpyIter_Copy(NpyIter *iter) writetransferdata = NBF_WRITETRANSFERDATA(bufferdata); buffersize = NBF_BUFFERSIZE(bufferdata); - for (iiter = 0; iiter < niter; ++iiter) { - if (buffers[iiter] != NULL) { + for (iop = 0; iop < nop; ++iop) { + if (buffers[iop] != NULL) { if (out_of_memory) { - buffers[iiter] = NULL; + buffers[iop] = NULL; } else { - itemsize = dtypes[iiter]->elsize; - buffers[iiter] = PyArray_malloc(itemsize*buffersize); - if (buffers[iiter] == NULL) { + itemsize = dtypes[iop]->elsize; + buffers[iop] = PyArray_malloc(itemsize*buffersize); + if 
(buffers[iop] == NULL) { out_of_memory = 1; } } } - if (readtransferdata[iiter] != NULL) { + if (readtransferdata[iop] != NULL) { if (out_of_memory) { - readtransferdata[iiter] = NULL; + readtransferdata[iop] = NULL; } else { - readtransferdata[iiter] = - PyArray_CopyStridedTransferData(readtransferdata[iiter]); - if (readtransferdata[iiter] == NULL) { + readtransferdata[iop] = + PyArray_CopyStridedTransferData(readtransferdata[iop]); + if (readtransferdata[iop] == NULL) { out_of_memory = 1; } } } - if (writetransferdata[iiter] != NULL) { + if (writetransferdata[iop] != NULL) { if (out_of_memory) { - writetransferdata[iiter] = NULL; + writetransferdata[iop] = NULL; } else { - writetransferdata[iiter] = - PyArray_CopyStridedTransferData(writetransferdata[iiter]); - if (writetransferdata[iiter] == NULL) { + writetransferdata[iop] = + PyArray_CopyStridedTransferData(writetransferdata[iop]); + if (writetransferdata[iop] == NULL) { out_of_memory = 1; } } @@ -915,7 +915,7 @@ NpyIter_Deallocate(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); PyArray_Descr **dtype = NIT_DTYPES(iter); PyArrayObject **object = NIT_OPERANDS(iter); @@ -928,21 +928,21 @@ NpyIter_Deallocate(NpyIter *iter) /* buffers */ buffers = NBF_BUFFERS(bufferdata); - for(iiter = 0; iiter < niter; ++iiter, ++buffers) { + for(iop = 0; iop < nop; ++iop, ++buffers) { if (*buffers) { PyArray_free(*buffers); } } /* read bufferdata */ transferdata = NBF_READTRANSFERDATA(bufferdata); - for(iiter = 0; iiter < niter; ++iiter, ++transferdata) { + for(iop = 0; iop < nop; ++iop, ++transferdata) { if (*transferdata) { PyArray_FreeStridedTransferData(*transferdata); } } /* write bufferdata */ transferdata = NBF_WRITETRANSFERDATA(bufferdata); - for(iiter = 0; iiter < niter; ++iiter, ++transferdata) { + for(iop = 0; iop < nop; ++iop, ++transferdata) { if (*transferdata) { 
PyArray_FreeStridedTransferData(*transferdata); } @@ -950,7 +950,7 @@ NpyIter_Deallocate(NpyIter *iter) } /* Deallocate all the dtypes and objects that were iterated */ - for(iiter = 0; iiter < niter; ++iiter, ++dtype, ++object) { + for(iop = 0; iop < nop; ++iop, ++dtype, ++object) { Py_XDECREF(*dtype); Py_XDECREF(*object); } @@ -973,12 +973,12 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); int xdim = 0; - char *perm = NIT_PERM(iter); + npy_int8 *perm = NIT_PERM(iter); NpyIter_AxisData *axisdata_del = NIT_AXISDATA(iter), *axisdata; - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); npy_intp *baseoffsets = NIT_BASEOFFSETS(iter); char **resetdataptr = NIT_RESETDATAPTR(iter); @@ -1008,7 +1008,7 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) } /* Reverse axis, since the iterator treats them that way */ - axis = ndim-1-axis; + axis = ndim - 1 - axis; /* First find the axis in question */ for (idim = 0; idim < ndim; ++idim) { @@ -1018,7 +1018,7 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) break; } /* If this is it, but it's iterated backward, must reverse the axis */ - else if (-1-perm[idim] == axis) { + else if (-1 - perm[idim] == axis) { npy_intp *strides = NAD_STRIDES(axisdata_del); npy_intp shape = NAD_SHAPE(axisdata_del), offset; @@ -1028,10 +1028,10 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) * Adjust baseoffsets and resetbaseptr back to the start of * this axis. 
*/ - for (iiter = 0; iiter < niter; ++iiter) { - offset = (shape-1)*strides[iiter]; - baseoffsets[iiter] += offset; - resetdataptr[iiter] += offset; + for (iop = 0; iop < nop; ++iop) { + offset = (shape-1)*strides[iop]; + baseoffsets[iop] += offset; + resetdataptr[iop] += offset; } break; } @@ -1053,7 +1053,7 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) /* Adjust the permutation */ for (idim = 0; idim < ndim-1; ++idim) { - char p = (idim < xdim) ? perm[idim] : perm[idim+1]; + npy_int8 p = (idim < xdim) ? perm[idim] : perm[idim+1]; if (p >= 0) { if (p > axis) { --p; @@ -1082,8 +1082,8 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) else { npy_intp *strides = NAD_STRIDES(axisdata_del); NAD_SHAPE(axisdata_del) = 1; - for (iiter = 0; iiter < niter; ++iiter) { - strides[iiter] = 0; + for (iop = 0; iop < nop; ++iop) { + strides[iop] = 0; } NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION; } @@ -1123,7 +1123,7 @@ NpyIter_EnableExternalLoop(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); /* Check conditions under which this can be done */ if (itflags&(NPY_ITFLAG_HASINDEX|NPY_ITFLAG_HASMULTIINDEX)) { @@ -1173,7 +1173,7 @@ NpyIter_Reset(NpyIter *iter, char **errmsg) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *bufferdata; @@ -1225,7 +1225,7 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char **resetdataptr = NIT_RESETDATAPTR(iter); npy_intp *baseoffsets = NIT_BASEOFFSETS(iter); @@ -1245,8 +1245,8 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg) } /* The new data pointers for resetting */ - for (iiter = 0; iiter < niter; ++iiter) { - 
resetdataptr[iiter] = baseptrs[iiter] + baseoffsets[iiter]; + for (iop = 0; iop < nop; ++iop) { + resetdataptr[iop] = baseptrs[iop] + baseoffsets[iop]; } npyiter_goto_iterindex(iter, NIT_ITERSTART(iter)); @@ -1273,7 +1273,7 @@ NpyIter_ResetToIterIndexRange(NpyIter *iter, { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - /*int niter = NIT_NITER(iter);*/ + /*int nop = NIT_NOP(iter);*/ if (!(itflags&NPY_ITFLAG_RANGE)) { if (errmsg == NULL) { @@ -1331,12 +1331,12 @@ NpyIter_GotoMultiIndex(NpyIter *iter, npy_intp *multi_index) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp iterindex, factor; NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; - char *perm; + npy_int8 *perm; if (!(itflags&NPY_ITFLAG_HASMULTIINDEX)) { PyErr_SetString(PyExc_ValueError, @@ -1361,13 +1361,13 @@ NpyIter_GotoMultiIndex(NpyIter *iter, npy_intp *multi_index) perm = NIT_PERM(iter); axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Compute the iterindex corresponding to the multi-index */ iterindex = 0; factor = 1; for (idim = 0; idim < ndim; ++idim) { - char p = perm[idim]; + npy_int8 p = perm[idim]; npy_intp i, shape; shape = NAD_SHAPE(axisdata); @@ -1417,7 +1417,7 @@ NpyIter_GotoIndex(NpyIter *iter, npy_intp flat_index) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp iterindex, factor; NpyIter_AxisData *axisdata; @@ -1452,7 +1452,7 @@ NpyIter_GotoIndex(NpyIter *iter, npy_intp flat_index) } axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Compute the iterindex corresponding to the flat_index */ iterindex = 0; @@ -1460,7 +1460,7 @@ NpyIter_GotoIndex(NpyIter *iter, 
npy_intp flat_index) for (idim = 0; idim < ndim; ++idim) { npy_intp i, shape, iterstride; - iterstride = NAD_STRIDES(axisdata)[niter]; + iterstride = NAD_STRIDES(axisdata)[nop]; shape = NAD_SHAPE(axisdata); /* Extract the index from the flat_index */ @@ -1505,7 +1505,7 @@ NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_EXLOOP) { PyErr_SetString(PyExc_ValueError, @@ -1537,8 +1537,8 @@ NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex) ptrs = NBF_PTRS(bufferdata); delta = iterindex - NIT_ITERINDEX(iter); - for (iiter = 0; iiter < niter; ++iiter) { - ptrs[iiter] += delta * strides[iiter]; + for (iop = 0; iop < nop; ++iop) { + ptrs[iop] += delta * strides[iop]; } NIT_ITERINDEX(iter) = iterindex; @@ -1569,7 +1569,7 @@ NpyIter_GetIterIndex(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); /* iterindex is only used if NPY_ITER_RANGED or NPY_ITER_BUFFERED was set */ if (itflags&(NPY_ITFLAG_RANGE|NPY_ITFLAG_BUFFER)) { @@ -1581,7 +1581,7 @@ NpyIter_GetIterIndex(NpyIter *iter) npy_intp sizeof_axisdata; iterindex = 0; - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_INDEX_AXISDATA(NIT_AXISDATA(iter), ndim-1); for (idim = ndim-2; idim >= 0; --idim) { @@ -1610,31 +1610,32 @@ NpyIter_GetIterIndex(NpyIter *iter) * #tag_ndim = 1, 2, ANY# */ /**begin repeat2 - * #const_niter = 1, 2, NPY_MAXDIMS# - * #tag_niter = 1, 2, ANY# + * #const_nop = 1, 2, NPY_MAXDIMS# + * #tag_nop = 1, 2, ANY# */ -/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_niter@) */ +/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_nop@) */ static int -npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( 
+npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_nop@( NpyIter *iter) { +#if !(@const_itflags@&NPY_ITFLAG_EXLOOP) || (@const_ndim@ > 1) const npy_uint32 itflags = @const_itflags@; -#if @const_ndim@ >= NPY_MAXDIMS +# if @const_ndim@ >= NPY_MAXDIMS int idim, ndim = NIT_NDIM(iter); -#endif -#if @const_niter@ < NPY_MAXDIMS - const int niter = @const_niter@; -#else - int niter = NIT_NITER(iter); -#endif +# endif +# if @const_nop@ < NPY_MAXDIMS + const int nop = @const_nop@; +# else + int nop = NIT_NOP(iter); +# endif - npy_intp istrides, nstrides, sizeof_axisdata; -#if @const_ndim@ > 0 NpyIter_AxisData *axisdata0; + npy_intp istrides, nstrides = NAD_NSTRIDES(); #endif #if @const_ndim@ > 1 NpyIter_AxisData *axisdata1; + npy_intp sizeof_axisdata; #endif #if @const_ndim@ > 2 NpyIter_AxisData *axisdata2; @@ -1647,10 +1648,13 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( } #endif - nstrides = NAD_NSTRIDES(); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); +#if @const_ndim@ > 1 + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); +#endif +# if !(@const_itflags@&NPY_ITFLAG_EXLOOP) || (@const_ndim@ > 1) axisdata0 = NIT_AXISDATA(iter); +# endif # if !(@const_itflags@&NPY_ITFLAG_EXLOOP) /* Increment index 0 */ NAD_INDEX(axisdata0)++; @@ -1666,9 +1670,6 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( /* Finished when the index equals the shape */ return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0); # else - /* Get rid of unused variable warning */ - istrides = 0; - return 0; # endif @@ -1763,8 +1764,8 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( /**begin repeat - * #const_niter = 1, 2, 3, 4, NPY_MAXDIMS# - * #tag_niter = 1, 2, 3, 4, ANY# + * #const_nop = 1, 2, 3, 4, NPY_MAXDIMS# + * #tag_nop = 1, 2, 3, 4, ANY# */ /* @@ -1772,17 +1773,17 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( * is done with a double loop to avoid frequent 
re-buffering. */ static int -npyiter_buffered_reduce_iternext_iters@tag_niter@(NpyIter *iter) +npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ -#if @const_niter@ >= NPY_MAXDIMS - int niter = NIT_NITER(iter); +#if @const_nop@ >= NPY_MAXDIMS + int nop = NIT_NOP(iter); #else - const int niter = @const_niter@; + const int nop = @const_nop@; #endif - int iiter; + int iop; NpyIter_AxisData *axisdata; NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -1801,8 +1802,8 @@ npyiter_buffered_reduce_iternext_iters@tag_niter@(NpyIter *iter) npy_intp *strides; strides = NBF_STRIDES(bufferdata); - for (iiter = 0; iiter < niter; ++iiter) { - ptrs[iiter] += strides[iiter]; + for (iop = 0; iop < nop; ++iop) { + ptrs[iop] += strides[iop]; } return 1; } @@ -1817,10 +1818,10 @@ npyiter_buffered_reduce_iternext_iters@tag_niter@(NpyIter *iter) if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) { npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); - for (iiter = 0; iiter < niter; ++iiter) { - char *ptr = reduce_outerptrs[iiter] + reduce_outerstrides[iiter]; - ptrs[iiter] = ptr; - reduce_outerptrs[iiter] = ptr; + for (iop = 0; iop < nop; ++iop) { + char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop]; + ptrs[iop] = ptr; + reduce_outerptrs[iop] = ptr; } NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata); return 1; @@ -1828,7 +1829,7 @@ npyiter_buffered_reduce_iternext_iters@tag_niter@(NpyIter *iter) /* Save the previously used data pointers */ axisdata = NIT_AXISDATA(iter); - memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*niter); + memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop); /* Write back to the arrays */ npyiter_copy_from_buffers(iter); @@ -1857,7 +1858,7 @@ npyiter_buffered_iternext(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int 
ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -1868,14 +1869,14 @@ npyiter_buffered_iternext(NpyIter *iter) if (!(itflags&NPY_ITFLAG_EXLOOP)) { /* Increment within the buffer */ if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) { - int iiter; + int iop; npy_intp *strides; char **ptrs; strides = NBF_STRIDES(bufferdata); ptrs = NBF_PTRS(bufferdata); - for (iiter = 0; iiter < niter; ++iiter) { - ptrs[iiter] += strides[iiter]; + for (iop = 0; iop < nop; ++iop) { + ptrs[iop] += strides[iop]; } return 1; } @@ -1927,7 +1928,7 @@ NpyIter_GetIterNext(NpyIter *iter, char **errmsg) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); /* * When there is just one iteration and buffering is disabled @@ -1942,7 +1943,7 @@ NpyIter_GetIterNext(NpyIter *iter, char **errmsg) */ if (itflags&NPY_ITFLAG_BUFFER) { if (itflags&NPY_ITFLAG_REDUCE) { - switch (niter) { + switch (nop) { case 1: return &npyiter_buffered_reduce_iternext_iters1; case 2: @@ -1988,30 +1989,30 @@ NpyIter_GetIterNext(NpyIter *iter, char **errmsg) * #tag_ndim = 1, 2# */ case @const_ndim@: - switch (niter) { + switch (nop) { /**begin repeat2 - * #const_niter = 1, 2# - * #tag_niter = 1, 2# + * #const_nop = 1, 2# + * #tag_nop = 1, 2# */ - case @const_niter@: - return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@; + case @const_nop@: + return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_nop@; /**end repeat2**/ - /* Not specialized on niter */ + /* Not specialized on nop */ default: return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_itersANY; } /**end repeat1**/ /* Not specialized on ndim */ default: - switch (niter) { + switch (nop) { /**begin repeat1 - * #const_niter = 1, 2# - * #tag_niter = 1, 2# + * #const_nop = 1, 2# + * #tag_nop = 1, 2# */ - case @const_niter@: - return 
&npyiter_iternext_itflags@tag_itflags@_dimsANY_iters@tag_niter@; + case @const_nop@: + return &npyiter_iternext_itflags@tag_itflags@_dimsANY_iters@tag_nop@; /**end repeat1**/ - /* Not specialized on niter */ + /* Not specialized on nop */ default: return &npyiter_iternext_itflags@tag_itflags@_dimsANY_itersANY; } @@ -2022,12 +2023,12 @@ NpyIter_GetIterNext(NpyIter *iter, char **errmsg) if (errmsg == NULL) { PyErr_Format(PyExc_ValueError, "GetIterNext internal iterator error - unexpected " - "itflags/ndim/niter combination (%04x/%d/%d)", - (int)itflags, (int)ndim, (int)niter); + "itflags/ndim/nop combination (%04x/%d/%d)", + (int)itflags, (int)ndim, (int)nop); } else { *errmsg = "GetIterNext internal iterator error - unexpected " - "itflags/ndim/niter combination"; + "itflags/ndim/nop combination"; } return NULL; } @@ -2057,16 +2058,16 @@ npyiter_get_multi_index_itflags@tag_itflags@( { const npy_uint32 itflags = @const_itflags@; int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp sizeof_axisdata; NpyIter_AxisData *axisdata; #if !((@const_itflags@)&NPY_ITFLAG_IDENTPERM) - char* perm = NIT_PERM(iter); + npy_int8 *perm = NIT_PERM(iter); #endif axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); #if ((@const_itflags@)&NPY_ITFLAG_IDENTPERM) out_multi_index += ndim-1; for(idim = 0; idim < ndim; ++idim, --out_multi_index, @@ -2075,12 +2076,12 @@ npyiter_get_multi_index_itflags@tag_itflags@( } #elif !((@const_itflags@)&NPY_ITFLAG_NEGPERM) for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p = perm[idim]; + npy_int8 p = perm[idim]; out_multi_index[ndim-p-1] = NAD_INDEX(axisdata); } #else for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p = perm[idim]; + npy_int8 p = perm[idim]; if (p < 0) { /* If the perm entry is negative, reverse the index */ out_multi_index[ndim+p] = 
NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1; @@ -2106,7 +2107,7 @@ NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); /* These flags must be correct */ if ((itflags&(NPY_ITFLAG_HASMULTIINDEX|NPY_ITFLAG_DELAYBUF)) != @@ -2173,12 +2174,12 @@ NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg) if (errmsg == NULL) { PyErr_Format(PyExc_ValueError, "GetGetMultiIndex internal iterator error - unexpected " - "itflags/ndim/niter combination (%04x/%d/%d)", - (int)itflags, (int)ndim, (int)niter); + "itflags/ndim/nop combination (%04x/%d/%d)", + (int)itflags, (int)ndim, (int)nop); } else { *errmsg = "GetGetMultiIndex internal iterator error - unexpected " - "itflags/ndim/niter combination"; + "itflags/ndim/nop combination"; } return NULL; @@ -2228,7 +2229,7 @@ NpyIter_RequiresBuffering(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char *op_itflags; @@ -2239,8 +2240,8 @@ NpyIter_RequiresBuffering(NpyIter *iter) op_itflags = NIT_OPITFLAGS(iter); /* If any operand requires a cast, buffering is mandatory */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op_itflags[iiter]&NPY_OP_ITFLAG_CAST) { + for (iop = 0; iop < nop; ++iop) { + if (op_itflags[iop]&NPY_OP_ITFLAG_CAST) { return 1; } } @@ -2272,9 +2273,9 @@ NpyIter_GetNDim(NpyIter *iter) * Gets the number of operands being iterated */ NPY_NO_EXPORT int -NpyIter_GetNIter(NpyIter *iter) +NpyIter_GetNOp(NpyIter *iter) { - return NIT_NITER(iter); + return NIT_NOP(iter); } /*NUMPY_API @@ -2312,7 +2313,7 @@ NpyIter_GetBufferSize(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -2354,19 +2355,19 @@ 
NpyIter_GetShape(NpyIter *iter, npy_intp *outshape) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); int idim, sizeof_axisdata; NpyIter_AxisData *axisdata; - char *perm; + npy_int8 *perm; axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (itflags&NPY_ITFLAG_HASMULTIINDEX) { perm = NIT_PERM(iter); for(idim = 0; idim < ndim; ++idim) { - char p = perm[idim]; + npy_int8 p = perm[idim]; if (p < 0) { outshape[ndim+p] = NAD_SHAPE(axisdata); } @@ -2418,11 +2419,11 @@ NpyIter_CreateCompatibleStrides(NpyIter *iter, { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp sizeof_axisdata; NpyIter_AxisData *axisdata; - char *perm; + npy_int8 *perm; if (!(itflags&NPY_ITFLAG_HASMULTIINDEX)) { PyErr_SetString(PyExc_RuntimeError, @@ -2432,11 +2433,11 @@ NpyIter_CreateCompatibleStrides(NpyIter *iter, } axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); perm = NIT_PERM(iter); for(idim = 0; idim < ndim; ++idim) { - char p = perm[idim]; + npy_int8 p = perm[idim]; if (p < 0) { PyErr_SetString(PyExc_RuntimeError, "Iterator CreateCompatibleStrides may only be called " @@ -2465,7 +2466,7 @@ NpyIter_GetDataPtrArray(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -2494,7 +2495,7 @@ NpyIter_GetInitialDataPtrArray(NpyIter *iter) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); return NIT_RESETDATAPTR(iter); } @@ -2507,7 +2508,7 @@ 
NpyIter_GetDescrArray(NpyIter *iter) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int ndim = NIT_NDIM(iter);*/ - /*int niter = NIT_NITER(iter);*/ + /*int nop = NIT_NOP(iter);*/ return NIT_DTYPES(iter); } @@ -2520,7 +2521,7 @@ NpyIter_GetOperandArray(NpyIter *iter) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); return NIT_OPERANDS(iter); } @@ -2533,7 +2534,7 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; PyArrayObject *obj, *view; @@ -2543,7 +2544,7 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i) npy_intp sizeof_axisdata; int writeable; - if (i < 0 || i >= niter) { + if (i < 0 || i >= nop) { PyErr_SetString(PyExc_IndexError, "index provided for an iterator view was out of bounds"); return NULL; @@ -2561,7 +2562,7 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i) writeable = NIT_OPITFLAGS(iter)[i]&NPY_OP_ITFLAG_WRITE; dataptr = NIT_RESETDATAPTR(iter)[i]; axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Retrieve the shape and strides from the axisdata */ for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { @@ -2594,13 +2595,13 @@ NpyIter_GetIndexPtr(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); if (itflags&NPY_ITFLAG_HASINDEX) { /* The index is just after the data pointers */ - return (npy_intp*)NAD_PTRS(axisdata) + niter; + return (npy_intp*)NAD_PTRS(axisdata) + nop; } else { return NULL; @@ -2615,12 +2616,12 @@ NpyIter_GetReadFlags(NpyIter *iter, char *outreadflags) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int 
ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char *op_itflags = NIT_OPITFLAGS(iter); - for (iiter = 0; iiter < niter; ++iiter) { - outreadflags[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_READ) != 0; + for (iop = 0; iop < nop; ++iop) { + outreadflags[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_READ) != 0; } } @@ -2632,12 +2633,12 @@ NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int ndim = NIT_NDIM(iter);*/ - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char *op_itflags = NIT_OPITFLAGS(iter); - for (iiter = 0; iiter < niter; ++iiter) { - outwriteflags[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) != 0; + for (iop = 0; iop < nop; ++iop) { + outwriteflags[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_WRITE) != 0; } } @@ -2652,7 +2653,7 @@ NpyIter_GetInnerStrideArray(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *data = NIT_BUFFERDATA(iter); @@ -2677,11 +2678,11 @@ NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); - char *perm = NIT_PERM(iter); + npy_int8 *perm = NIT_PERM(iter); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (axis < 0 || axis >= ndim) { PyErr_SetString(PyExc_ValueError, @@ -2695,7 +2696,7 @@ NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) /* First find the axis in question */ for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - if (perm[idim] == axis || -1-perm[idim] == axis) { + if (perm[idim] == axis || -1 - perm[idim] == axis) { return NAD_STRIDES(axisdata); } } @@ -2723,10 
+2724,10 @@ NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); NpyIter_AxisData *axisdata0 = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *data = NIT_BUFFERDATA(iter); @@ -2735,21 +2736,21 @@ NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) *ad_strides = NAD_STRIDES(axisdata0); PyArray_Descr **dtypes = NIT_DTYPES(iter); - for (iiter = 0; iiter < niter; ++iiter) { - stride = strides[iiter]; + for (iop = 0; iop < nop; ++iop) { + stride = strides[iop]; /* * Operands which are always/never buffered have fixed strides, * and everything has fixed strides when ndim is 0 or 1 */ - if (ndim <= 1 || (op_itflags[iiter]& + if (ndim <= 1 || (op_itflags[iop]& (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_BUFNEVER))) { - out_strides[iiter] = stride; + out_strides[iop] = stride; } /* If it's a reduction, 0-stride inner loop may have fixed stride */ else if (stride == 0 && (itflags&NPY_ITFLAG_REDUCE)) { /* If it's a reduction operand, definitely fixed stride */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) { - out_strides[iiter] = stride; + if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { + out_strides[iop] = stride; } /* * Otherwise it's a fixed stride if the stride is 0 @@ -2760,17 +2761,17 @@ NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) int idim, reduce_outerdim = NBF_REDUCE_OUTERDIM(data); for (idim = 0; idim < reduce_outerdim; ++idim) { - if (NAD_STRIDES(axisdata)[iiter] != 0) { + if (NAD_STRIDES(axisdata)[iop] != 0) { break; } NIT_ADVANCE_AXISDATA(axisdata, 1); } /* If all the strides were 0, the stride won't change */ if (idim == reduce_outerdim) { - out_strides[iiter] = stride; + out_strides[iop] = stride; } else { - 
out_strides[iiter] = NPY_MAX_INTP; + out_strides[iop] = NPY_MAX_INTP; } } } @@ -2778,21 +2779,21 @@ NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) * Inner loop contiguous array means its stride won't change when * switching between buffering and not buffering */ - else if (ad_strides[iiter] == dtypes[iiter]->elsize) { - out_strides[iiter] = ad_strides[iiter]; + else if (ad_strides[iop] == dtypes[iop]->elsize) { + out_strides[iop] = ad_strides[iop]; } /* * Otherwise the strides can change if the operand is sometimes * buffered, sometimes not. */ else { - out_strides[iiter] = NPY_MAX_INTP; + out_strides[iop] = NPY_MAX_INTP; } } } else { /* If there's no buffering, the strides are always fixed */ - memcpy(out_strides, NAD_STRIDES(axisdata0), niter*NPY_SIZEOF_INTP); + memcpy(out_strides, NAD_STRIDES(axisdata0), nop*NPY_SIZEOF_INTP); } } @@ -2807,7 +2808,7 @@ NpyIter_GetInnerLoopSizePtr(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *data = NIT_BUFFERDATA(iter); @@ -2890,7 +2891,7 @@ npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags) } static int -npyiter_calculate_ndim(int niter, PyArrayObject **op_in, +npyiter_calculate_ndim(int nop, PyArrayObject **op_in, int oa_ndim) { /* If 'op_axes' is being used, force 'ndim' */ @@ -2899,11 +2900,11 @@ npyiter_calculate_ndim(int niter, PyArrayObject **op_in, } /* Otherwise it's the maximum 'ndim' from the operands */ else { - int ndim = 0, iiter; + int ndim = 0, iop; - for (iiter = 0; iiter < niter; ++iiter) { - if (op_in[iiter] != NULL) { - int ondim = PyArray_NDIM(op_in[iiter]); + for (iop = 0; iop < nop; ++iop) { + if (op_in[iop] != NULL) { + int ondim = PyArray_NDIM(op_in[iop]); if (ondim > ndim) { ndim = ondim; } @@ -2916,11 +2917,11 @@ npyiter_calculate_ndim(int niter, PyArrayObject **op_in, } static int -npyiter_check_op_axes(int niter, 
int oa_ndim, int **op_axes, +npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes, npy_intp *itershape) { char axes_dupcheck[NPY_MAXDIMS]; - int iiter, idim; + int iop, idim; if (oa_ndim == 0 && (op_axes != NULL || itershape != NULL)) { PyErr_Format(PyExc_ValueError, @@ -2944,8 +2945,8 @@ npyiter_check_op_axes(int niter, int oa_ndim, int **op_axes, } /* Check that there are no duplicates in op_axes */ - for (iiter = 0; iiter < niter; ++iiter) { - int *axes = op_axes[iiter]; + for (iop = 0; iop < nop; ++iop) { + int *axes = op_axes[iop]; if (axes != NULL) { memset(axes_dupcheck, 0, NPY_MAXDIMS); for (idim = 0; idim < oa_ndim; ++idim) { @@ -2956,14 +2957,14 @@ npyiter_check_op_axes(int niter, int oa_ndim, int **op_axes, "The 'op_axes' provided to the iterator " "constructor for operand %d " "contained invalid " - "values %d", (int)iiter, (int)i); + "values %d", (int)iop, (int)i); return 0; } else if(axes_dupcheck[i] == 1) { PyErr_Format(PyExc_ValueError, "The 'op_axes' provided to the iterator " "constructor for operand %d " "contained duplicate " - "value %d", (int)iiter, (int)i); + "value %d", (int)iop, (int)i); return 0; } else { @@ -3225,7 +3226,7 @@ npyiter_prepare_one_operand(PyArrayObject **op, * can replace the arrays if copying is necessary. 
*/ static int -npyiter_prepare_operands(int niter, PyArrayObject **op_in, +npyiter_prepare_operands(int nop, PyArrayObject **op_in, PyArrayObject **op, char **op_dataptr, PyArray_Descr **op_request_dtypes, @@ -3233,16 +3234,16 @@ npyiter_prepare_operands(int niter, PyArrayObject **op_in, npy_uint32 flags, npy_uint32 *op_flags, char *op_itflags) { - int iiter, i; + int iop, i; - for (iiter = 0; iiter < niter; ++iiter) { - op[iiter] = op_in[iiter]; - Py_XINCREF(op[iiter]); - op_dtype[iiter] = NULL; + for (iop = 0; iop < nop; ++iop) { + op[iop] = op_in[iop]; + Py_XINCREF(op[iop]); + op_dtype[iop] = NULL; /* Check the readonly/writeonly flags, and fill in op_itflags */ - if (!npyiter_check_per_op_flags(op_flags[iiter], &op_itflags[iiter])) { - for (i = 0; i <= iiter; ++i) { + if (!npyiter_check_per_op_flags(op_flags[iop], &op_itflags[iop])) { + for (i = 0; i <= iop; ++i) { Py_XDECREF(op[i]); Py_XDECREF(op_dtype[i]); } @@ -3250,16 +3251,16 @@ npyiter_prepare_operands(int niter, PyArrayObject **op_in, } /* - * Prepare the operand. This produces an op_dtype[iiter] reference + * Prepare the operand. This produces an op_dtype[iop] reference * on success. */ - if (!npyiter_prepare_one_operand(&op[iiter], - &op_dataptr[iiter], - op_request_dtypes ? op_request_dtypes[iiter] : NULL, - &op_dtype[iiter], + if (!npyiter_prepare_one_operand(&op[iop], + &op_dataptr[iop], + op_request_dtypes ? 
op_request_dtypes[iop] : NULL, + &op_dtype[iop], flags, - op_flags[iiter], &op_itflags[iiter])) { - for (i = 0; i <= iiter; ++i) { + op_flags[iop], &op_itflags[iop])) { + for (i = 0; i <= iop; ++i) { Py_XDECREF(op[i]); Py_XDECREF(op_dtype[i]); } @@ -3271,14 +3272,14 @@ npyiter_prepare_operands(int niter, PyArrayObject **op_in, /* If all the operands were NULL, it's an error */ if (op[0] == NULL) { int all_null = 1; - for (iiter = 1; iiter < niter; ++iiter) { - if (op[iiter] != NULL) { + for (iop = 1; iop < nop; ++iop) { + if (op[iop] != NULL) { all_null = 0; break; } } if (all_null) { - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { Py_XDECREF(op[i]); Py_XDECREF(op_dtype[i]); } @@ -3311,52 +3312,52 @@ npyiter_casting_to_string(NPY_CASTING casting) } static int -npyiter_check_casting(int niter, PyArrayObject **op, +npyiter_check_casting(int nop, PyArrayObject **op, PyArray_Descr **op_dtype, NPY_CASTING casting, char *op_itflags) { - int iiter; + int iop; - for(iiter = 0; iiter < niter; ++iiter) { + for(iop = 0; iop < nop; ++iop) { NPY_IT_DBG_PRINT1("Iterator: Checking casting for operand %d\n", - (int)iiter); + (int)iop); #if NPY_IT_DBG_TRACING printf("op: "); - if (op[iiter] != NULL) { - PyObject_Print((PyObject *)PyArray_DESCR(op[iiter]), stdout, 0); + if (op[iop] != NULL) { + PyObject_Print((PyObject *)PyArray_DESCR(op[iop]), stdout, 0); } else { printf(""); } printf(", iter: "); - PyObject_Print((PyObject *)op_dtype[iiter], stdout, 0); + PyObject_Print((PyObject *)op_dtype[iop], stdout, 0); printf("\n"); #endif /* If the types aren't equivalent, a cast is necessary */ - if (op[iiter] != NULL && !PyArray_EquivTypes(PyArray_DESCR(op[iiter]), - op_dtype[iiter])) { + if (op[iop] != NULL && !PyArray_EquivTypes(PyArray_DESCR(op[iop]), + op_dtype[iop])) { /* Check read (op -> temp) casting */ - if ((op_itflags[iiter]&NPY_OP_ITFLAG_READ) && - !PyArray_CanCastArrayTo(op[iiter], - op_dtype[iiter], + if ((op_itflags[iop]&NPY_OP_ITFLAG_READ) && + 
!PyArray_CanCastArrayTo(op[iop], + op_dtype[iop], casting)) { PyErr_Format(PyExc_TypeError, "Iterator operand %d dtype could not be cast " "to the requested dtype, according to " - "the casting rule given, %s", (int)iiter, + "the casting rule given, %s", (int)iop, npyiter_casting_to_string(casting)); return 0; } /* Check write (temp -> op) casting */ - if ((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) && - !PyArray_CanCastTypeTo(op_dtype[iiter], - PyArray_DESCR(op[iiter]), + if ((op_itflags[iop]&NPY_OP_ITFLAG_WRITE) && + !PyArray_CanCastTypeTo(op_dtype[iop], + PyArray_DESCR(op[iop]), casting)) { PyErr_Format(PyExc_TypeError, "Iterator requested dtype could not be cast " "to the operand %d dtype, according to " - "the casting rule given, %s", (int)iiter, + "the casting rule given, %s", (int)iop, npyiter_casting_to_string(casting)); return 0; } @@ -3364,7 +3365,7 @@ npyiter_check_casting(int niter, PyArrayObject **op, NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because the types aren't equivalent\n"); /* Indicate that this operand needs casting */ - op_itflags[iiter] |= NPY_OP_ITFLAG_CAST; + op_itflags[iop] |= NPY_OP_ITFLAG_CAST; } } @@ -3391,7 +3392,7 @@ npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending) return PyUString_FromFormat("()%s", ending); } else { - ret = PyUString_FromFormat("(%zd", vals[i++]); + ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]); if (ret == NULL) { return NULL; } @@ -3402,7 +3403,7 @@ npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending) tmp = PyUString_FromString(",newaxis"); } else { - tmp = PyUString_FromFormat(",%zd", vals[i]); + tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]); } if (tmp == NULL) { Py_DECREF(ret); @@ -3421,7 +3422,7 @@ npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending) } /* - * Fills in the AXISDATA for the 'niter' operands, broadcasting + * Fills in the AXISDATA for the 'nop' operands, broadcasting * the dimensionas as necessary. 
Also fills * in the ITERSIZE data member. * @@ -3439,7 +3440,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); int ondim; NpyIter_AxisData *axisdata; @@ -3462,16 +3463,16 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, } } } - for (iiter = 0; iiter < niter; ++iiter) { - op_cur = op[iiter]; + for (iop = 0; iop < nop; ++iop) { + op_cur = op[iop]; if (op_cur != NULL) { npy_intp *shape = PyArray_DIMS(op_cur); ondim = PyArray_NDIM(op_cur); - if (op_axes == NULL || op_axes[iiter] == NULL) { + if (op_axes == NULL || op_axes[iop] == NULL) { /* * Possible if op_axes are being used, but - * op_axes[iiter] is NULL + * op_axes[iop] is NULL */ if (ondim > ndim) { PyErr_SetString(PyExc_ValueError, @@ -3491,7 +3492,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, } } else { - int *axes = op_axes[iiter]; + int *axes = op_axes[iop]; for (idim = 0; idim < ndim; ++idim) { int i = axes[idim]; if (i >= 0) { @@ -3510,8 +3511,8 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, "Iterator input op_axes[%d][%d] (==%d) " "is not a valid axis of op[%d], which " "has %d dimensions ", - (int)iiter, (int)(ndim-idim-1), (int)i, - (int)iiter, (int)ondim); + (int)iop, (int)(ndim-idim-1), (int)i, + (int)iop, (int)ondim); return 0; } } @@ -3532,7 +3533,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, } axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Now process the operands, filling in the axisdata */ for (idim = 0; idim < ndim; ++idim) { @@ -3541,39 +3542,39 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, NAD_SHAPE(axisdata) = bshape; NAD_INDEX(axisdata) = 0; - 
memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*niter); + memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*nop); - for (iiter = 0; iiter < niter; ++iiter) { - op_cur = op[iiter]; + for (iop = 0; iop < nop; ++iop) { + op_cur = op[iop]; - if (op_axes == NULL || op_axes[iiter] == NULL) { + if (op_axes == NULL || op_axes[iop] == NULL) { if (op_cur == NULL) { - strides[iiter] = 0; + strides[iop] = 0; } else { ondim = PyArray_NDIM(op_cur); if (bshape == 1) { - strides[iiter] = 0; + strides[iop] = 0; if (idim >= ondim && !output_scalars && - (op_flags[iiter]&NPY_ITER_NO_BROADCAST)) { + (op_flags[iop]&NPY_ITER_NO_BROADCAST)) { goto operand_different_than_broadcast; } } else if (idim >= ondim || PyArray_DIM(op_cur, ondim-idim-1) == 1) { - strides[iiter] = 0; - if (op_flags[iiter]&NPY_ITER_NO_BROADCAST) { + strides[iop] = 0; + if (op_flags[iop]&NPY_ITER_NO_BROADCAST) { goto operand_different_than_broadcast; } /* If it's writeable, this means a reduction */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (op_itflags[iop]&NPY_OP_ITFLAG_WRITE) { if (!(flags&NPY_ITER_REDUCE_OK)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "reduction is not enabled"); return 0; } - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { + if (!(op_itflags[iop]&NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "is flagged as write-only, not " @@ -3581,35 +3582,35 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, return 0; } NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; - op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; + op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE; } } else { - strides[iiter] = PyArray_STRIDE(op_cur, ondim-idim-1); + strides[iop] = PyArray_STRIDE(op_cur, ondim-idim-1); } } } else { - int *axes = op_axes[iiter]; + int *axes = op_axes[iop]; int i = axes[ndim-idim-1]; if (i >= 0) { if (bshape == 1 || op_cur == NULL) { - strides[iiter] = 0; + strides[iop] = 0; } else if 
(PyArray_DIM(op_cur, i) == 1) { - strides[iiter] = 0; - if (op_flags[iiter]&NPY_ITER_NO_BROADCAST) { + strides[iop] = 0; + if (op_flags[iop]&NPY_ITER_NO_BROADCAST) { goto operand_different_than_broadcast; } /* If it's writeable, this means a reduction */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (op_itflags[iop]&NPY_OP_ITFLAG_WRITE) { if (!(flags&NPY_ITER_REDUCE_OK)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "reduction is not enabled"); return 0; } - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { + if (!(op_itflags[iop]&NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "is flagged as write-only, not " @@ -3617,27 +3618,27 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, return 0; } NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; - op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; + op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE; } } else { - strides[iiter] = PyArray_STRIDE(op_cur, i); + strides[iop] = PyArray_STRIDE(op_cur, i); } } else if (bshape == 1) { - strides[iiter] = 0; + strides[iop] = 0; } else { - strides[iiter] = 0; + strides[iop] = 0; /* If it's writeable, this means a reduction */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (op_itflags[iop]&NPY_OP_ITFLAG_WRITE) { if (!(flags&NPY_ITER_REDUCE_OK)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "reduction is not enabled"); return 0; } - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { + if (!(op_itflags[iop]&NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but " "is flagged as write-only, not " @@ -3645,7 +3646,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, return 0; } NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; - op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; + op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE; } } } @@ -3676,10 +3677,10 @@ broadcast_error: { if (errmsg == NULL) { return 0; } - for (iiter 
= 0; iiter < niter; ++iiter) { - if (op[iiter] != NULL) { - tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]), - PyArray_DIMS(op[iiter]), + for (iop = 0; iop < nop; ++iop) { + if (op[iop] != NULL) { + tmp = npyiter_shape_string(PyArray_NDIM(op[iop]), + PyArray_DIMS(op[iop]), " "); if (tmp == NULL) { Py_DECREF(errmsg); @@ -3719,13 +3720,13 @@ broadcast_error: { errmsg = PyUString_FromString("operands could not be broadcast " "together with remapped shapes " "[original->remapped]: "); - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] != NULL) { - int *axes = op_axes[iiter]; + for (iop = 0; iop < nop; ++iop) { + if (op[iop] != NULL) { + int *axes = op_axes[iop]; tmpstr = (axes == NULL) ? " " : "->"; - tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]), - PyArray_DIMS(op[iiter]), + tmp = npyiter_shape_string(PyArray_NDIM(op[iop]), + PyArray_DIMS(op[iop]), tmpstr); if (tmp == NULL) { return 0; @@ -3739,8 +3740,8 @@ broadcast_error: { for (idim = 0; idim < ndim; ++idim) { npy_intp i = axes[idim]; - if (i >= 0 && i < PyArray_NDIM(op[iiter])) { - remdims[idim] = PyArray_DIM(op[iiter], i); + if (i >= 0 && i < PyArray_NDIM(op[iop])) { + remdims[idim] = PyArray_DIM(op[iop], i); } else { remdims[idim] = -1; @@ -3790,7 +3791,7 @@ operand_different_than_broadcast: { PyObject *errmsg, *tmp; /* Start of error message */ - if (op_flags[iiter]&NPY_ITER_READONLY) { + if (op_flags[iop]&NPY_ITER_READONLY) { errmsg = PyUString_FromString("non-broadcastable operand " "with shape "); } @@ -3803,8 +3804,8 @@ operand_different_than_broadcast: { } /* Operand shape */ - tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]), - PyArray_DIMS(op[iiter]), ""); + tmp = npyiter_shape_string(PyArray_NDIM(op[iop]), + PyArray_DIMS(op[iop]), ""); if (tmp == NULL) { return 0; } @@ -3813,14 +3814,14 @@ operand_different_than_broadcast: { return 0; } /* Remapped operand shape */ - if (op_axes != NULL && op_axes[iiter] != NULL) { - int *axes = op_axes[iiter]; + if (op_axes != NULL && op_axes[iop] != 
NULL) { + int *axes = op_axes[iop]; for (idim = 0; idim < ndim; ++idim) { npy_intp i = axes[ndim-idim-1]; - if (i >= 0 && i < PyArray_NDIM(op[iiter])) { - remdims[idim] = PyArray_DIM(op[iiter], i); + if (i >= 0 && i < PyArray_NDIM(op[iop])) { + remdims[idim] = PyArray_DIM(op[iop], i); } else { remdims[idim] = -1; @@ -3880,7 +3881,7 @@ operand_different_than_broadcast: { } /* - * Replaces the AXISDATA for the iiter'th operand, broadcasting + * Replaces the AXISDATA for the iop'th operand, broadcasting * the dimensions as necessary. Assumes the replacement array is * exactly the same shape as the original array used when * npy_fill_axisdata was called. @@ -3889,23 +3890,23 @@ operand_different_than_broadcast: { * array. */ static void -npyiter_replace_axisdata(NpyIter *iter, int iiter, +npyiter_replace_axisdata(NpyIter *iter, int iop, PyArrayObject *op, int op_ndim, char *op_dataptr, int *op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); NpyIter_AxisData *axisdata0, *axisdata; npy_intp sizeof_axisdata; - char *perm; + npy_int8 *perm; npy_intp baseoffset = 0; perm = NIT_PERM(iter); axisdata0 = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* * Replace just the strides which were non-zero, and compute @@ -3915,7 +3916,7 @@ npyiter_replace_axisdata(NpyIter *iter, int iiter, if (op_axes != NULL) { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; + npy_int8 p; int i; npy_intp shape; @@ -3934,11 +3935,11 @@ npyiter_replace_axisdata(NpyIter *iter, int iiter, npy_intp stride = PyArray_STRIDE(op, i); if (p < 0) { /* If the perm entry is negative, flip the axis */ - NAD_STRIDES(axisdata)[iiter] = -stride; + NAD_STRIDES(axisdata)[iop] = -stride; baseoffset += stride*(shape-1); } else { - NAD_STRIDES(axisdata)[iiter] = stride; + NAD_STRIDES(axisdata)[iop] = 
stride; } } } @@ -3946,7 +3947,7 @@ npyiter_replace_axisdata(NpyIter *iter, int iiter, } else { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; + npy_int8 p; int i; npy_intp shape; @@ -3965,11 +3966,11 @@ npyiter_replace_axisdata(NpyIter *iter, int iiter, npy_intp stride = PyArray_STRIDE(op, i); if (p < 0) { /* If the perm entry is negative, flip the axis */ - NAD_STRIDES(axisdata)[iiter] = -stride; + NAD_STRIDES(axisdata)[iop] = -stride; baseoffset += stride*(shape-1); } else { - NAD_STRIDES(axisdata)[iiter] = stride; + NAD_STRIDES(axisdata)[iop] = stride; } } } @@ -3979,11 +3980,11 @@ npyiter_replace_axisdata(NpyIter *iter, int iiter, op_dataptr += baseoffset; /* Now the base data pointer is calculated, set it everywhere it's needed */ - NIT_RESETDATAPTR(iter)[iiter] = op_dataptr; - NIT_BASEOFFSETS(iter)[iiter] = baseoffset; + NIT_RESETDATAPTR(iter)[iop] = op_dataptr; + NIT_BASEOFFSETS(iter)[iop] = baseoffset; axisdata = axisdata0; for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - NAD_PTRS(axisdata)[iiter] = op_dataptr; + NAD_PTRS(axisdata)[iop] = op_dataptr; } } @@ -3999,7 +4000,7 @@ npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp indexstride; NpyIter_AxisData *axisdata; @@ -4013,42 +4014,42 @@ npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags) if (NIT_ITERSIZE(iter) == 1) { if (itflags&NPY_ITFLAG_HASINDEX) { axisdata = NIT_AXISDATA(iter); - NAD_PTRS(axisdata)[niter] = 0; + NAD_PTRS(axisdata)[nop] = 0; } return; } if (flags&NPY_ITER_C_INDEX) { - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_AXISDATA(iter); indexstride = 1; for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { npy_intp shape = NAD_SHAPE(axisdata); if (shape == 1) { 
- NAD_STRIDES(axisdata)[niter] = 0; + NAD_STRIDES(axisdata)[nop] = 0; } else { - NAD_STRIDES(axisdata)[niter] = indexstride; + NAD_STRIDES(axisdata)[nop] = indexstride; } - NAD_PTRS(axisdata)[niter] = 0; + NAD_PTRS(axisdata)[nop] = 0; indexstride *= shape; } } else if (flags&NPY_ITER_F_INDEX) { - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_INDEX_AXISDATA(NIT_AXISDATA(iter), ndim-1); indexstride = 1; for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, -1)) { npy_intp shape = NAD_SHAPE(axisdata); if (shape == 1) { - NAD_STRIDES(axisdata)[niter] = 0; + NAD_STRIDES(axisdata)[nop] = 0; } else { - NAD_STRIDES(axisdata)[niter] = indexstride; + NAD_STRIDES(axisdata)[nop] = indexstride; } - NAD_PTRS(axisdata)[niter] = 0; + NAD_PTRS(axisdata)[nop] = 0; indexstride *= shape; } } @@ -4064,7 +4065,7 @@ npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ int ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); switch (order) { case NPY_CORDER: @@ -4085,7 +4086,7 @@ npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order) int forder = 1; /* Check that all the array inputs are fortran order */ - for (iiter = 0; iiter < niter; ++iiter, ++op) { + for (iop = 0; iop < nop; ++iop, ++op) { if (*op && !PyArray_CHKFLAGS(*op, NPY_F_CONTIGUOUS)) { forder = 0; break; @@ -4115,12 +4116,12 @@ npyiter_flip_negative_strides(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); npy_intp istrides, nstrides = NAD_NSTRIDES(); NpyIter_AxisData *axisdata, *axisdata0; npy_intp *baseoffsets; - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); int any_flipped = 0; axisdata0 = axisdata = 
NIT_AXISDATA(iter); @@ -4133,11 +4134,11 @@ npyiter_flip_negative_strides(NpyIter *iter) * Check the signs of all the strides, excluding * the index stride at the end. */ - for (iiter = 0; iiter < niter; ++iiter) { - if (strides[iiter] < 0) { + for (iop = 0; iop < nop; ++iop) { + if (strides[iop] < 0) { any_negative = 1; } - else if (strides[iiter] != 0) { + else if (strides[iop] != 0) { break; } } @@ -4145,7 +4146,7 @@ npyiter_flip_negative_strides(NpyIter *iter) * If at least on stride is negative and none are positive, * flip all the strides for this dimension. */ - if (any_negative && iiter == niter) { + if (any_negative && iop == nop) { npy_intp shapem1 = NAD_SHAPE(axisdata) - 1; for (istrides = 0; istrides < nstrides; ++istrides) { @@ -4197,13 +4198,13 @@ npyiter_reverse_axis_ordering(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp i, temp, size; npy_intp *first, *last; - char *perm; + npy_int8 *perm; - size = NIT_AXISDATA_SIZEOF(itflags, ndim, niter)/NPY_SIZEOF_INTP; + size = NIT_AXISDATA_SIZEOF(itflags, ndim, nop)/NPY_SIZEOF_INTP; first = (npy_intp*)NIT_AXISDATA(iter); last = first + (ndim-1)*size; @@ -4221,7 +4222,7 @@ npyiter_reverse_axis_ordering(NpyIter *iter) /* Store the perm we applied */ perm = NIT_PERM(iter); for(i = ndim-1; i >= 0; --i, ++perm) { - *perm = (char)i; + *perm = (npy_int8)i; } NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM; @@ -4237,13 +4238,13 @@ npyiter_find_best_axis_ordering(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); npy_intp ax_i0, ax_i1, ax_ipos; - char ax_j0, ax_j1; - char *perm; + npy_int8 ax_j0, ax_j1; + npy_int8 *perm; NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); int 
permuted = 0; perm = NIT_PERM(iter); @@ -4269,10 +4270,10 @@ npyiter_find_best_axis_ordering(NpyIter *iter) strides1 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, ax_j1)); - for (iiter = 0; iiter < niter; ++iiter) { - if (strides0[iiter] != 0 && strides1[iiter] != 0) { - if (intp_abs(strides1[iiter]) <= - intp_abs(strides0[iiter])) { + for (iop = 0; iop < nop; ++iop) { + if (strides0[iop] != 0 && strides1[iop] != 0) { + if (intp_abs(strides1[iop]) <= + intp_abs(strides0[iop])) { /* * Set swap even if it's not ambiguous already, * because in the case of conflicts between @@ -4335,14 +4336,13 @@ npyiter_find_best_axis_ordering(NpyIter *iter) /* If this axis hasn't been touched yet, process it */ if (NAD_INDEX(ad_i) == 1) { - char pidim = perm[idim], qidim; + npy_int8 pidim = perm[idim]; npy_intp tmp; NpyIter_AxisData *ad_p, *ad_q; if (pidim != idim) { /* Follow the cycle, copying the data */ for (i = 0; i < size; ++i) { - qidim = (char)idim; pidim = perm[idim]; ad_q = ad_i; tmp = *((npy_intp*)ad_q + i); @@ -4350,7 +4350,6 @@ npyiter_find_best_axis_ordering(NpyIter *iter) ad_p = NIT_INDEX_AXISDATA(axisdata, pidim); *((npy_intp*)ad_q + i) = *((npy_intp*)ad_p + i); - qidim = pidim; ad_q = ad_p; pidim = perm[(int)pidim]; } @@ -4377,11 +4376,11 @@ npyiter_coalesce_axes(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); npy_intp istrides, nstrides = NAD_NSTRIDES(); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); NpyIter_AxisData *ad_compress; npy_intp new_ndim = 1; @@ -4434,11 +4433,11 @@ npyiter_coalesce_axes(NpyIter *iter) * compress the data into the new layout. 
*/ if (new_ndim < ndim) { - char *perm = NIT_PERM(iter); + npy_int8 *perm = NIT_PERM(iter); /* Reset to an identity perm */ for (idim = 0; idim < new_ndim; ++idim) { - perm[idim] = (char)idim; + perm[idim] = (npy_int8)idim; } NIT_NDIM(iter) = new_ndim; } @@ -4462,9 +4461,9 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); - char *perm = NIT_PERM(iter); + npy_int8 *perm = NIT_PERM(iter); npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS], stride = op_dtype->elsize; char reversestride[NPY_MAXDIMS], anyreverse = 0; @@ -4493,7 +4492,7 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, } axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); memset(reversestride, 0, NPY_MAXDIMS); /* Initialize the strides to invalid values */ @@ -4503,7 +4502,7 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, if (op_axes != NULL) { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; + npy_int8 p; /* Apply the perm to get the original axis */ p = perm[idim]; @@ -4560,8 +4559,8 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, return NULL; } - NPY_IT_DBG_PRINT("Iterator: Indicating that a reduction " - "is occurring\n"); + NPY_IT_DBG_PRINT("Iterator: Indicating that a " + "reduction is occurring\n"); /* Indicate that a reduction is occurring */ NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; (*op_itflags) |= NPY_OP_ITFLAG_REDUCE; @@ -4572,15 +4571,15 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, } else { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; + npy_int8 p; /* Apply the perm to get the original axis */ p = perm[idim]; if (p < 0) { - i = op_ndim+p; + i = op_ndim + p; } else { - i = op_ndim-p-1; + i = op_ndim - p - 1; } 
if (i >= 0) { @@ -4734,7 +4733,7 @@ npyiter_allocate_arrays(NpyIter *iter, { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); NpyIter_BufferData *bufferdata = NULL; PyArrayObject **op = NIT_OPERANDS(iter); @@ -4744,136 +4743,136 @@ npyiter_allocate_arrays(NpyIter *iter, } - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { /* NULL means an output the iterator should allocate */ - if (op[iiter] == NULL) { + if (op[iop] == NULL) { PyArrayObject *out; PyTypeObject *op_subtype; int ondim = output_scalars ? 0 : ndim; /* Check whether the subtype was disabled */ - op_subtype = (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) ? + op_subtype = (op_flags[iop]&NPY_ITER_NO_SUBTYPE) ? &PyArray_Type : subtype; /* Allocate the output array */ out = npyiter_new_temp_array(iter, op_subtype, - flags, &op_itflags[iiter], + flags, &op_itflags[iop], ondim, NULL, - op_dtype[iiter], - op_axes ? op_axes[iiter] : NULL); + op_dtype[iop], + op_axes ? op_axes[iop] : NULL); if (out == NULL) { return 0; } - op[iiter] = out; + op[iop] = out; /* * Now we need to replace the pointers and strides with values * from the new array. */ - npyiter_replace_axisdata(iter, iiter, op[iiter], ondim, - PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL); + npyiter_replace_axisdata(iter, iop, op[iop], ondim, + PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL); /* New arrays are aligned and need no cast */ - op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED; - op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST; + op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; + op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; } /* * If casting is required, the operand is read-only, and * it's an array scalar, make a copy whether or not the * copy flag is enabled. 
*/ - else if ((op_itflags[iiter]&(NPY_OP_ITFLAG_CAST| + else if ((op_itflags[iop]&(NPY_OP_ITFLAG_CAST| NPY_OP_ITFLAG_READ| NPY_OP_ITFLAG_WRITE)) == (NPY_OP_ITFLAG_CAST| NPY_OP_ITFLAG_READ) && - PyArray_NDIM(op[iiter]) == 0) { + PyArray_NDIM(op[iop]) == 0) { PyArrayObject *temp; - Py_INCREF(op_dtype[iiter]); + Py_INCREF(op_dtype[iop]); temp = (PyArrayObject *)PyArray_NewFromDescr( - &PyArray_Type, op_dtype[iiter], + &PyArray_Type, op_dtype[iop], 0, NULL, NULL, NULL, 0, NULL); if (temp == NULL) { return 0; } - if (PyArray_CopyInto(temp, op[iiter]) != 0) { + if (PyArray_CopyInto(temp, op[iop]) != 0) { Py_DECREF(temp); return 0; } - Py_DECREF(op[iiter]); - op[iiter] = temp; + Py_DECREF(op[iop]); + op[iop] = temp; /* * Now we need to replace the pointers and strides with values * from the temporary array. */ - npyiter_replace_axisdata(iter, iiter, op[iiter], 0, - PyArray_DATA(op[iiter]), NULL); + npyiter_replace_axisdata(iter, iop, op[iop], 0, + PyArray_DATA(op[iop]), NULL); /* * New arrays are aligned need no cast, and in the case * of scalars, always have stride 0 so never need buffering */ - op_itflags[iiter] |= (NPY_OP_ITFLAG_ALIGNED| + op_itflags[iop] |= (NPY_OP_ITFLAG_ALIGNED| NPY_OP_ITFLAG_BUFNEVER); - op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST; + op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; if (itflags&NPY_ITFLAG_BUFFER) { - NBF_STRIDES(bufferdata)[iiter] = 0; + NBF_STRIDES(bufferdata)[iop] = 0; } } /* If casting is required and permitted */ - else if ((op_itflags[iiter]&NPY_OP_ITFLAG_CAST) && - (op_flags[iiter]&(NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) { + else if ((op_itflags[iop]&NPY_OP_ITFLAG_CAST) && + (op_flags[iop]&(NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) { PyArrayObject *temp; - int ondim = PyArray_NDIM(op[iiter]); + int ondim = PyArray_NDIM(op[iop]); /* Allocate the temporary array, if possible */ temp = npyiter_new_temp_array(iter, &PyArray_Type, - flags, &op_itflags[iiter], + flags, &op_itflags[iop], ondim, - PyArray_DIMS(op[iiter]), - op_dtype[iiter], - 
op_axes ? op_axes[iiter] : NULL); + PyArray_DIMS(op[iop]), + op_dtype[iop], + op_axes ? op_axes[iop] : NULL); if (temp == NULL) { return 0; } /* If the data will be read, copy it into temp */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) { - if (PyArray_CopyInto(temp, op[iiter]) != 0) { + if (op_itflags[iop]&NPY_OP_ITFLAG_READ) { + if (PyArray_CopyInto(temp, op[iop]) != 0) { Py_DECREF(temp); return 0; } } /* If the data will be written to, set UPDATEIFCOPY */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (op_itflags[iop]&NPY_OP_ITFLAG_WRITE) { PyArray_FLAGS(temp) |= NPY_UPDATEIFCOPY; - PyArray_FLAGS(op[iiter]) &= ~NPY_WRITEABLE; - Py_INCREF(op[iiter]); - temp->base = (PyObject *)op[iiter]; + PyArray_FLAGS(op[iop]) &= ~NPY_WRITEABLE; + Py_INCREF(op[iop]); + temp->base = (PyObject *)op[iop]; } - Py_DECREF(op[iiter]); - op[iiter] = temp; + Py_DECREF(op[iop]); + op[iop] = temp; /* * Now we need to replace the pointers and strides with values * from the temporary array. */ - npyiter_replace_axisdata(iter, iiter, op[iiter], ondim, - PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL); + npyiter_replace_axisdata(iter, iop, op[iop], ondim, + PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL); /* The temporary copy is aligned and needs no cast */ - op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED; - op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST; + op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; + op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; } else { /* * Buffering must be enabled for casting/conversion if copy * wasn't specified. */ - if ((op_itflags[iiter]&NPY_OP_ITFLAG_CAST) && + if ((op_itflags[iop]&NPY_OP_ITFLAG_CAST) && !(itflags&NPY_ITFLAG_BUFFER)) { PyErr_SetString(PyExc_TypeError, "Iterator operand required copying or buffering, " @@ -4885,20 +4884,20 @@ npyiter_allocate_arrays(NpyIter *iter, * If the operand is aligned, any buffering can use aligned * optimizations. 
*/ - if (PyArray_ISALIGNED(op[iiter])) { - op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED; + if (PyArray_ISALIGNED(op[iop])) { + op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; } } /* Here we can finally check for contiguous iteration */ - if (op_flags[iiter]&NPY_ITER_CONTIG) { + if (op_flags[iop]&NPY_ITER_CONTIG) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp stride = NAD_STRIDES(axisdata)[iiter]; + npy_intp stride = NAD_STRIDES(axisdata)[iop]; - if (stride != op_dtype[iiter]->elsize) { + if (stride != op_dtype[iop]->elsize) { NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_CONTIG\n"); - op_itflags[iiter] |= NPY_OP_ITFLAG_CAST; + op_itflags[iop] |= NPY_OP_ITFLAG_CAST; if (!(itflags&NPY_ITFLAG_BUFFER)) { PyErr_SetString(PyExc_TypeError, "Iterator operand required buffering, " @@ -4914,21 +4913,21 @@ npyiter_allocate_arrays(NpyIter *iter, * the inner stride of this operand works for the whole * array, we can set NPY_OP_ITFLAG_BUFNEVER. */ - if ((itflags&NPY_ITFLAG_BUFFER) && !(op_itflags[iiter]&NPY_OP_ITFLAG_CAST)) { + if ((itflags&NPY_ITFLAG_BUFFER) && !(op_itflags[iop]&NPY_OP_ITFLAG_CAST)) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); if (ndim == 1) { - op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iiter] = NAD_STRIDES(axisdata)[iiter]; + op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; + NBF_STRIDES(bufferdata)[iop] = NAD_STRIDES(axisdata)[iop]; } - else if (PyArray_NDIM(op[iiter]) > 0) { + else if (PyArray_NDIM(op[iop]) > 0) { npy_intp stride, shape, innerstride = 0, innershape; npy_intp sizeof_axisdata = - NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Find stride of the first non-empty shape */ for (idim = 0; idim < ndim; ++idim) { innershape = NAD_SHAPE(axisdata); if (innershape != 1) { - innerstride = NAD_STRIDES(axisdata)[iiter]; + innerstride = NAD_STRIDES(axisdata)[iop]; break; } NIT_ADVANCE_AXISDATA(axisdata, 1); @@ -4937,7 +4936,7 @@ 
npyiter_allocate_arrays(NpyIter *iter, NIT_ADVANCE_AXISDATA(axisdata, 1); /* Check that everything could have coalesced together */ for (; idim < ndim; ++idim) { - stride = NAD_STRIDES(axisdata)[iiter]; + stride = NAD_STRIDES(axisdata)[iop]; shape = NAD_SHAPE(axisdata); if (shape != 1) { /* @@ -4959,8 +4958,8 @@ npyiter_allocate_arrays(NpyIter *iter, * dimension. */ if (idim == ndim) { - op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iiter] = innerstride; + op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; + NBF_STRIDES(bufferdata)[iop] = innerstride; } } } @@ -4975,19 +4974,19 @@ npyiter_allocate_arrays(NpyIter *iter, * subtype of the input array with highest priority. */ static void -npyiter_get_priority_subtype(int niter, PyArrayObject **op, +npyiter_get_priority_subtype(int nop, PyArrayObject **op, char *op_itflags, double *subtype_priority, PyTypeObject **subtype) { - int iiter; + int iop; - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] != NULL && op_itflags[iiter]&NPY_OP_ITFLAG_READ) { - double priority = PyArray_GetPriority((PyObject *)op[iiter], 0.0); + for (iop = 0; iop < nop; ++iop) { + if (op[iop] != NULL && op_itflags[iop]&NPY_OP_ITFLAG_READ) { + double priority = PyArray_GetPriority((PyObject *)op[iop], 0.0); if (priority > *subtype_priority) { *subtype_priority = priority; - *subtype = Py_TYPE(op[iiter]); + *subtype = Py_TYPE(op[iop]); } } } @@ -4999,12 +4998,12 @@ npyiter_get_priority_subtype(int niter, PyArrayObject **op, * are not read from out of the calculation. 
*/ static PyArray_Descr * -npyiter_get_common_dtype(int niter, PyArrayObject **op, +npyiter_get_common_dtype(int nop, PyArrayObject **op, char *op_itflags, PyArray_Descr **op_dtype, PyArray_Descr **op_request_dtypes, int only_inputs, int output_scalars) { - int iiter; + int iop; npy_intp narrs = 0, ndtypes = 0; PyArrayObject *arrs[NPY_MAXARGS]; PyArray_Descr *dtypes[NPY_MAXARGS]; @@ -5012,18 +5011,18 @@ npyiter_get_common_dtype(int niter, PyArrayObject **op, NPY_IT_DBG_PRINT("Iterator: Getting a common data type from operands\n"); - for (iiter = 0; iiter < niter; ++iiter) { - if (op_dtype[iiter] != NULL && - (!only_inputs || (op_itflags[iiter]&NPY_OP_ITFLAG_READ))) { + for (iop = 0; iop < nop; ++iop) { + if (op_dtype[iop] != NULL && + (!only_inputs || (op_itflags[iop]&NPY_OP_ITFLAG_READ))) { /* If no dtype was requested and the op is a scalar, pass the op */ if ((op_request_dtypes == NULL || - op_request_dtypes[iiter] == NULL) && - PyArray_NDIM(op[iiter]) == 0) { - arrs[narrs++] = op[iiter]; + op_request_dtypes[iop] == NULL) && + PyArray_NDIM(op[iop]) == 0) { + arrs[narrs++] = op[iop]; } /* Otherwise just pass in the dtype */ else { - dtypes[ndtypes++] = op_dtype[iiter]; + dtypes[ndtypes++] = op_dtype[iop]; } } } @@ -5059,7 +5058,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ - int iiter = 0, niter = NIT_NITER(iter); + int iop = 0, nop = NIT_NOP(iter); npy_intp i; char *op_itflags = NIT_OPITFLAGS(iter); @@ -5077,14 +5076,14 @@ npyiter_allocate_transfer_functions(NpyIter *iter) void *transferdata = NULL; int needs_api = 0; - for (iiter = 0; iiter < niter; ++iiter) { - char flags = op_itflags[iiter]; + for (iop = 0; iop < nop; ++iop) { + char flags = op_itflags[iop]; /* * Reduction operands may be buffered with a different stride, * so we must pass NPY_MAX_INTP to the transfer function factory. */ op_stride = (flags&NPY_OP_ITFLAG_REDUCE) ? 
NPY_MAX_INTP : - strides[iiter]; + strides[iop]; /* * If we have determined that a buffer may be needed, @@ -5096,40 +5095,40 @@ npyiter_allocate_transfer_functions(NpyIter *iter) if (PyArray_GetDTypeTransferFunction( (flags&NPY_OP_ITFLAG_ALIGNED) != 0, op_stride, - op_dtype[iiter]->elsize, - PyArray_DESCR(op[iiter]), - op_dtype[iiter], + op_dtype[iop]->elsize, + PyArray_DESCR(op[iop]), + op_dtype[iop], move_references, &stransfer, &transferdata, &needs_api) != NPY_SUCCEED) { goto fail; } - readtransferfn[iiter] = stransfer; - readtransferdata[iiter] = transferdata; + readtransferfn[iop] = stransfer; + readtransferdata[iop] = transferdata; } else { - readtransferfn[iiter] = NULL; + readtransferfn[iop] = NULL; } if (flags&NPY_OP_ITFLAG_WRITE) { int move_references = 1; if (PyArray_GetDTypeTransferFunction( (flags&NPY_OP_ITFLAG_ALIGNED) != 0, - op_dtype[iiter]->elsize, + op_dtype[iop]->elsize, op_stride, - op_dtype[iiter], - PyArray_DESCR(op[iiter]), + op_dtype[iop], + PyArray_DESCR(op[iop]), move_references, &stransfer, &transferdata, &needs_api) != NPY_SUCCEED) { goto fail; } - writetransferfn[iiter] = stransfer; - writetransferdata[iiter] = transferdata; + writetransferfn[iop] = stransfer; + writetransferdata[iop] = transferdata; } /* If no write back but there are references make a decref fn */ - else if (PyDataType_REFCHK(op_dtype[iiter])) { + else if (PyDataType_REFCHK(op_dtype[iop])) { /* * By passing NULL to dst_type and setting move_references * to 1, we get back a function that just decrements the @@ -5137,24 +5136,24 @@ npyiter_allocate_transfer_functions(NpyIter *iter) */ if (PyArray_GetDTypeTransferFunction( (flags&NPY_OP_ITFLAG_ALIGNED) != 0, - op_dtype[iiter]->elsize, 0, - op_dtype[iiter], NULL, + op_dtype[iop]->elsize, 0, + op_dtype[iop], NULL, 1, &stransfer, &transferdata, &needs_api) != NPY_SUCCEED) { goto fail; } - writetransferfn[iiter] = stransfer; - writetransferdata[iiter] = transferdata; + writetransferfn[iop] = stransfer; + 
writetransferdata[iop] = transferdata; } else { - writetransferfn[iiter] = NULL; + writetransferfn[iop] = NULL; } } else { - readtransferfn[iiter] = NULL; - writetransferfn[iiter] = NULL; + readtransferfn[iop] = NULL; + writetransferfn[iop] = NULL; } } @@ -5166,14 +5165,14 @@ npyiter_allocate_transfer_functions(NpyIter *iter) return 1; fail: - for (i = 0; i < iiter; ++i) { - if (readtransferdata[iiter] != NULL) { - PyArray_FreeStridedTransferData(readtransferdata[iiter]); - readtransferdata[iiter] = NULL; + for (i = 0; i < iop; ++i) { + if (readtransferdata[iop] != NULL) { + PyArray_FreeStridedTransferData(readtransferdata[iop]); + readtransferdata[iop] = NULL; } - if (writetransferdata[iiter] != NULL) { - PyArray_FreeStridedTransferData(writetransferdata[iiter]); - writetransferdata[iiter] = NULL; + if (writetransferdata[iop] != NULL) { + PyArray_FreeStridedTransferData(writetransferdata[iop]); + writetransferdata[iop] = NULL; } } return 0; @@ -5191,7 +5190,7 @@ npyiter_allocate_buffers(NpyIter *iter, char **errmsg) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ /*int ndim = NIT_NDIM(iter);*/ - int iiter = 0, niter = NIT_NITER(iter); + int iop = 0, nop = NIT_NOP(iter); npy_intp i; char *op_itflags = NIT_OPITFLAGS(iter); @@ -5200,15 +5199,15 @@ npyiter_allocate_buffers(NpyIter *iter, char **errmsg) npy_intp buffersize = NBF_BUFFERSIZE(bufferdata); char *buffer, **buffers = NBF_BUFFERS(bufferdata); - for (iiter = 0; iiter < niter; ++iiter) { - char flags = op_itflags[iiter]; + for (iop = 0; iop < nop; ++iop) { + char flags = op_itflags[iop]; /* * If we have determined that a buffer may be needed, * allocate one. 
*/ if (!(flags&NPY_OP_ITFLAG_BUFNEVER)) { - npy_intp itemsize = op_dtype[iiter]->elsize; + npy_intp itemsize = op_dtype[iop]->elsize; buffer = PyArray_malloc(itemsize*buffersize); if (buffer == NULL) { if (errmsg == NULL) { @@ -5219,14 +5218,14 @@ npyiter_allocate_buffers(NpyIter *iter, char **errmsg) } goto fail; } - buffers[iiter] = buffer; + buffers[iop] = buffer; } } return 1; fail: - for (i = 0; i < iiter; ++i) { + for (i = 0; i < iop; ++i) { if (buffers[i] != NULL) { PyArray_free(buffers[i]); buffers[i] = NULL; @@ -5245,7 +5244,7 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int niter = NIT_NITER(iter); + int nop = NIT_NOP(iter); char **dataptr; NpyIter_AxisData *axisdata; @@ -5253,7 +5252,7 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex) npy_intp istrides, nstrides, i, shape; axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); nstrides = NAD_NSTRIDES(); NIT_ITERINDEX(iter) = iterindex; @@ -5328,7 +5327,7 @@ npyiter_copy_from_buffers(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -5340,19 +5339,18 @@ npyiter_copy_from_buffers(NpyIter *iter) buffersize = NBF_BUFFERSIZE(bufferdata); npy_intp *strides = NBF_STRIDES(bufferdata), *ad_strides = NAD_STRIDES(axisdata); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); char *buffer; npy_intp reduce_outerdim = 0; npy_intp *reduce_outerstrides = NULL; - char **reduce_outerptrs = NULL; PyArray_StridedTransferFn 
*stransfer = NULL; void *transferdata = NULL; - npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, niter) / + npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; /* If we're past the end, nothing to copy */ @@ -5365,39 +5363,38 @@ npyiter_copy_from_buffers(NpyIter *iter) if (itflags&NPY_ITFLAG_REDUCE) { reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); transfersize *= NBF_REDUCE_OUTERSIZE(bufferdata); } - for (iiter = 0; iiter < niter; ++iiter) { - stransfer = NBF_WRITETRANSFERFN(bufferdata)[iiter]; - transferdata = NBF_WRITETRANSFERDATA(bufferdata)[iiter]; - buffer = buffers[iiter]; + for (iop = 0; iop < nop; ++iop) { + stransfer = NBF_WRITETRANSFERFN(bufferdata)[iop]; + transferdata = NBF_WRITETRANSFERDATA(bufferdata)[iop]; + buffer = buffers[iop]; /* * Copy the data back to the arrays. If the type has refs, * this function moves them so the buffer's refs are released. */ - if ((stransfer != NULL) && (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) { + if ((stransfer != NULL) && (op_itflags[iop]&NPY_OP_ITFLAG_WRITE)) { /* Copy back only if the pointer was pointing to the buffer */ - npy_intp delta = (ptrs[iiter] - buffer); - if (0 <= delta && delta <= buffersize*dtypes[iiter]->elsize) { + npy_intp delta = (ptrs[iop] - buffer); + if (0 <= delta && delta <= buffersize*dtypes[iop]->elsize) { npy_intp op_transfersize; npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape; int ndim_transfer; NPY_IT_DBG_PRINT1("Iterator: Operand %d was buffered\n", - (int)iiter); + (int)iop); /* * If this operand is being reduced in the inner loop, * its buffering stride was set to zero, and just * one element was copied. 
*/ - if (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) { - if (strides[iiter] == 0) { - if (reduce_outerstrides[iiter] == 0) { + if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { + if (strides[iop] == 0) { + if (reduce_outerstrides[iop] == 0) { op_transfersize = 1; src_stride = 0; dst_strides = &src_stride; @@ -5407,19 +5404,19 @@ npyiter_copy_from_buffers(NpyIter *iter) } else { op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - src_stride = reduce_outerstrides[iiter]; + src_stride = reduce_outerstrides[iop]; dst_strides = - &NAD_STRIDES(reduce_outeraxisdata)[iiter]; + &NAD_STRIDES(reduce_outeraxisdata)[iop]; dst_coords = &NAD_INDEX(reduce_outeraxisdata); dst_shape = &NAD_SHAPE(reduce_outeraxisdata); ndim_transfer = ndim - reduce_outerdim; } } else { - if (reduce_outerstrides[iiter] == 0) { + if (reduce_outerstrides[iop] == 0) { op_transfersize = NBF_SIZE(bufferdata); - src_stride = strides[iiter]; - dst_strides = &ad_strides[iiter]; + src_stride = strides[iop]; + dst_strides = &ad_strides[iop]; dst_coords = &NAD_INDEX(axisdata); dst_shape = &NAD_SHAPE(axisdata); ndim_transfer = reduce_outerdim ? 
@@ -5427,8 +5424,8 @@ npyiter_copy_from_buffers(NpyIter *iter) } else { op_transfersize = transfersize; - src_stride = strides[iiter]; - dst_strides = &ad_strides[iiter]; + src_stride = strides[iop]; + dst_strides = &ad_strides[iop]; dst_coords = &NAD_INDEX(axisdata); dst_shape = &NAD_SHAPE(axisdata); ndim_transfer = ndim; @@ -5437,8 +5434,8 @@ npyiter_copy_from_buffers(NpyIter *iter) } else { op_transfersize = transfersize; - src_stride = strides[iiter]; - dst_strides = &ad_strides[iiter]; + src_stride = strides[iop]; + dst_strides = &ad_strides[iop]; dst_coords = &NAD_INDEX(axisdata); dst_shape = &NAD_SHAPE(axisdata); ndim_transfer = ndim; @@ -5446,14 +5443,14 @@ npyiter_copy_from_buffers(NpyIter *iter) NPY_IT_DBG_PRINT2("Iterator: Copying buffer to " "operand %d (%d items)\n", - (int)iiter, (int)op_transfersize); + (int)iop, (int)op_transfersize); PyArray_TransferStridedToNDim(ndim_transfer, - ad_ptrs[iiter], dst_strides, axisdata_incr, + ad_ptrs[iop], dst_strides, axisdata_incr, buffer, src_stride, dst_coords, axisdata_incr, dst_shape, axisdata_incr, - op_transfersize, dtypes[iiter]->elsize, + op_transfersize, dtypes[iop]->elsize, stransfer, transferdata); } @@ -5464,13 +5461,13 @@ npyiter_copy_from_buffers(NpyIter *iter) */ else if (stransfer != NULL) { /* Decrement refs only if the pointer was pointing to the buffer */ - npy_intp delta = (ptrs[iiter] - buffer); - if (0 <= delta && delta <= transfersize*dtypes[iiter]->elsize) { + npy_intp delta = (ptrs[iop] - buffer); + if (0 <= delta && delta <= transfersize*dtypes[iop]->elsize) { NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer " - "of operand %d\n", (int)iiter); + "of operand %d\n", (int)iop); /* Decrement refs */ - stransfer(NULL, 0, buffer, dtypes[iiter]->elsize, - transfersize, dtypes[iiter]->elsize, + stransfer(NULL, 0, buffer, dtypes[iop]->elsize, + transfersize, dtypes[iop]->elsize, transferdata); /* * Zero out the memory for safety. 
For instance, @@ -5478,7 +5475,7 @@ npyiter_copy_from_buffers(NpyIter *iter) * array pointing into the buffer, it will get None * values for its references after this. */ - memset(buffer, 0, dtypes[iiter]->elsize*transfersize); + memset(buffer, 0, dtypes[iop]->elsize*transfersize); } } } @@ -5496,7 +5493,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); char *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); @@ -5507,7 +5504,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) PyArrayObject **operands = NIT_OPERANDS(iter); npy_intp *strides = NBF_STRIDES(bufferdata), *ad_strides = NAD_STRIDES(axisdata); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); npy_intp iterindex, iterend, transfersize, @@ -5527,7 +5524,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) npy_bool reuse_reduce_loops = (prev_dataptrs != NULL) && ((itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS) != 0); - npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, niter) / + npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; NPY_IT_DBG_PRINT("Iterator: Copying inputs to buffers\n"); @@ -5542,13 +5539,30 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) /* If last time around, the reduce loop structure was full, we reuse it */ if (reuse_reduce_loops) { + npy_intp full_transfersize; + reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); reduce_innersize = 
NBF_SIZE(bufferdata); NBF_REDUCE_POS(bufferdata) = 0; - transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; + /* + * Try to do make the outersize as big as possible. This allows + * it to shrink when processing the last bit of the outer reduce loop, + * then grow again at the beginnning of the next outer reduce loop. + */ + NBF_REDUCE_OUTERSIZE(bufferdata) = (NAD_SHAPE(reduce_outeraxisdata)- + NAD_INDEX(reduce_outeraxisdata)); + full_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; + /* If the full transfer size doesn't fit in the buffer, truncate it */ + if (full_transfersize > NBF_BUFFERSIZE(bufferdata)) { + NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize; + transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; + } + else { + transfersize = full_transfersize; + } NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize; NPY_IT_DBG_PRINT3("Reused reduce transfersize: %d innersize: %d " @@ -5556,6 +5570,8 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) (int)transfersize, (int)reduce_innersize, (int)NpyIter_GetIterSize(iter)); + NPY_IT_DBG_PRINT1("Reduced reduce outersize: %d", + (int)NBF_REDUCE_OUTERSIZE(bufferdata)); } /* * If there are any reduction operands, we may have to make @@ -5603,40 +5619,40 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) is_onestride = 1; } - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { /* * If the buffer is write-only, these two are NULL, and the buffer * pointers will be set up but the read copy won't be done */ - stransfer = NBF_READTRANSFERFN(bufferdata)[iiter]; - transferdata = NBF_READTRANSFERDATA(bufferdata)[iiter]; - switch (op_itflags[iiter]& + stransfer = NBF_READTRANSFERFN(bufferdata)[iop]; + transferdata = NBF_READTRANSFERDATA(bufferdata)[iop]; + switch (op_itflags[iop]& (NPY_OP_ITFLAG_BUFNEVER| NPY_OP_ITFLAG_CAST| NPY_OP_ITFLAG_REDUCE)) { /* Never need to buffer this operand */ case 
NPY_OP_ITFLAG_BUFNEVER: - ptrs[iiter] = ad_ptrs[iiter]; + ptrs[iop] = ad_ptrs[iop]; if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iiter] = reduce_innersize * - strides[iiter]; - reduce_outerptrs[iiter] = ptrs[iiter]; + reduce_outerstrides[iop] = reduce_innersize * + strides[iop]; + reduce_outerptrs[iop] = ptrs[iop]; } /* - * Should not adjust the stride - ad_strides[iiter] - * could be zero, but strides[iiter] was initialized + * Should not adjust the stride - ad_strides[iop] + * could be zero, but strides[iop] was initialized * to the first non-trivial stride. */ stransfer = NULL; break; /* Never need to buffer this operand */ case NPY_OP_ITFLAG_BUFNEVER|NPY_OP_ITFLAG_REDUCE: - ptrs[iiter] = ad_ptrs[iiter]; - reduce_outerptrs[iiter] = ptrs[iiter]; - reduce_outerstrides[iiter] = 0; + ptrs[iop] = ad_ptrs[iop]; + reduce_outerptrs[iop] = ptrs[iop]; + reduce_outerstrides[iop] = 0; /* - * Should not adjust the stride - ad_strides[iiter] - * could be zero, but strides[iiter] was initialized + * Should not adjust the stride - ad_strides[iop] + * could be zero, but strides[iop] was initialized * to the first non-trivial stride. */ stransfer = NULL; @@ -5650,8 +5666,8 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) * already does that, no need to copy it. 
*/ if (is_onestride) { - ptrs[iiter] = ad_ptrs[iiter]; - strides[iiter] = ad_strides[iiter]; + ptrs[iop] = ad_ptrs[iop]; + strides[iop] = ad_strides[iop]; stransfer = NULL; } /* If some other op is reduced, we have a double reduce loop */ @@ -5660,33 +5676,33 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) (transfersize/reduce_innersize <= NAD_SHAPE(reduce_outeraxisdata) - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iiter] = ad_ptrs[iiter]; - reduce_outerptrs[iiter] = ptrs[iiter]; - strides[iiter] = ad_strides[iiter]; - reduce_outerstrides[iiter] = - NAD_STRIDES(reduce_outeraxisdata)[iiter]; + ptrs[iop] = ad_ptrs[iop]; + reduce_outerptrs[iop] = ptrs[iop]; + strides[iop] = ad_strides[iop]; + reduce_outerstrides[iop] = + NAD_STRIDES(reduce_outeraxisdata)[iop]; stransfer = NULL; } else { /* In this case, the buffer is being used */ - ptrs[iiter] = buffers[iiter]; - strides[iiter] = dtypes[iiter]->elsize; + ptrs[iop] = buffers[iop]; + strides[iop] = dtypes[iop]->elsize; if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iiter] = reduce_innersize * - strides[iiter]; - reduce_outerptrs[iiter] = ptrs[iiter]; + reduce_outerstrides[iop] = reduce_innersize * + strides[iop]; + reduce_outerptrs[iop] = ptrs[iop]; } } break; /* Just a copy, but with a reduction */ case NPY_OP_ITFLAG_REDUCE: - if (ad_strides[iiter] == 0) { - strides[iiter] = 0; + if (ad_strides[iop] == 0) { + strides[iop] = 0; /* It's all in one stride in the inner loop dimension */ if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride\n", (int)iiter); - ptrs[iiter] = ad_ptrs[iiter]; - reduce_outerstrides[iiter] = 0; + NPY_IT_DBG_PRINT1("reduce op %d all one stride\n", (int)iop); + ptrs[iop] = ad_ptrs[iop]; + reduce_outerstrides[iop] = 0; stransfer = NULL; } /* It's all in one stride in the reduce outer loop */ @@ -5695,33 +5711,33 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) NAD_SHAPE(reduce_outeraxisdata) - NAD_INDEX(reduce_outeraxisdata))) { 
NPY_IT_DBG_PRINT1("reduce op %d all one outer stride\n", - (int)iiter); - ptrs[iiter] = ad_ptrs[iiter]; + (int)iop); + ptrs[iop] = ad_ptrs[iop]; /* Outer reduce loop advances by one item */ - reduce_outerstrides[iiter] = - NAD_STRIDES(reduce_outeraxisdata)[iiter]; + reduce_outerstrides[iop] = + NAD_STRIDES(reduce_outeraxisdata)[iop]; stransfer = NULL; } /* In this case, the buffer is being used */ else { - NPY_IT_DBG_PRINT1("reduce op %d must buffer\n", (int)iiter); - ptrs[iiter] = buffers[iiter]; + NPY_IT_DBG_PRINT1("reduce op %d must buffer\n", (int)iop); + ptrs[iop] = buffers[iop]; /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) { - reduce_outerstrides[iiter] = 0; + if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { + reduce_outerstrides[iop] = 0; } /* Outer reduce loop advances by one item */ else { - reduce_outerstrides[iiter] = dtypes[iiter]->elsize; + reduce_outerstrides[iop] = dtypes[iop]->elsize; } } } else if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride in dim 0\n", (int)iiter); - ptrs[iiter] = ad_ptrs[iiter]; - strides[iiter] = ad_strides[iiter]; - reduce_outerstrides[iiter] = 0; + NPY_IT_DBG_PRINT1("reduce op %d all one stride in dim 0\n", (int)iop); + ptrs[iop] = ad_ptrs[iop]; + strides[iop] = ad_strides[iop]; + reduce_outerstrides[iop] = 0; stransfer = NULL; } else { @@ -5730,82 +5746,84 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) (transfersize/reduce_innersize <= NAD_SHAPE(reduce_outeraxisdata) - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iiter] = ad_ptrs[iiter]; - strides[iiter] = ad_strides[iiter]; + ptrs[iop] = ad_ptrs[iop]; + strides[iop] = ad_strides[iop]; /* Outer reduce loop advances by one item */ - reduce_outerstrides[iiter] = - NAD_STRIDES(reduce_outeraxisdata)[iiter]; + reduce_outerstrides[iop] = + NAD_STRIDES(reduce_outeraxisdata)[iop]; stransfer = NULL; } /* In this case, the buffer is being used */ else { - ptrs[iiter] = buffers[iiter]; - 
strides[iiter] = dtypes[iiter]->elsize; + ptrs[iop] = buffers[iop]; + strides[iop] = dtypes[iop]->elsize; - if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) { + if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { /* Reduction in outer reduce loop */ - reduce_outerstrides[iiter] = 0; + reduce_outerstrides[iop] = 0; } else { /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iiter] = reduce_innersize * - dtypes[iiter]->elsize; + reduce_outerstrides[iop] = reduce_innersize * + dtypes[iop]->elsize; } } } - reduce_outerptrs[iiter] = ptrs[iiter]; + reduce_outerptrs[iop] = ptrs[iop]; break; default: - /* In this case, the buffer is being used */ - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE)) { - ptrs[iiter] = buffers[iiter]; - strides[iiter] = dtypes[iiter]->elsize; + /* In this case, the buffer is always being used */ + any_buffered = 1; + + if (!(op_itflags[iop]&NPY_OP_ITFLAG_REDUCE)) { + ptrs[iop] = buffers[iop]; + strides[iop] = dtypes[iop]->elsize; if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iiter] = reduce_innersize * - strides[iiter]; - reduce_outerptrs[iiter] = ptrs[iiter]; + reduce_outerstrides[iop] = reduce_innersize * + strides[iop]; + reduce_outerptrs[iop] = ptrs[iop]; } } /* The buffer is being used with reduction */ else { - ptrs[iiter] = buffers[iiter]; - if (ad_strides[iiter] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride 0\n", (int)iiter); - strides[iiter] = 0; + ptrs[iop] = buffers[iop]; + if (ad_strides[iop] == 0) { + NPY_IT_DBG_PRINT1("cast op %d has innermost stride 0\n", (int)iop); + strides[iop] = 0; /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iiter); - reduce_outerstrides[iiter] = 0; + if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { + NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); + reduce_outerstrides[iop] = 0; } /* Outer reduce loop advances by one 
item */ else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iiter); - reduce_outerstrides[iiter] = dtypes[iiter]->elsize; + NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); + reduce_outerstrides[iop] = dtypes[iop]->elsize; } } else { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride !=0\n", (int)iiter); - strides[iiter] = dtypes[iiter]->elsize; + NPY_IT_DBG_PRINT1("cast op %d has innermost stride !=0\n", (int)iop); + strides[iop] = dtypes[iop]->elsize; - if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iiter); + if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { + NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); /* Reduction in outer reduce loop */ - reduce_outerstrides[iiter] = 0; + reduce_outerstrides[iop] = 0; } else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iiter); + NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iiter] = reduce_innersize * - dtypes[iiter]->elsize; + reduce_outerstrides[iop] = reduce_innersize * + dtypes[iop]->elsize; } } - reduce_outerptrs[iiter] = ptrs[iiter]; + reduce_outerptrs[iop] = ptrs[iop]; } break; } if (stransfer != NULL) { - npy_intp src_itemsize = PyArray_DESCR(operands[iiter])->elsize; + npy_intp src_itemsize = PyArray_DESCR(operands[iop])->elsize; npy_intp op_transfersize; npy_intp dst_stride, *src_strides, *src_coords, *src_shape; @@ -5813,6 +5831,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) npy_bool skip_transfer = 0; + /* If stransfer wasn't set to NULL, buffering is required */ any_buffered = 1; /* @@ -5820,10 +5839,10 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) * set its buffering stride to zero, and just copy * one element. 
*/ - if (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) { - if (ad_strides[iiter] == 0) { - strides[iiter] = 0; - if (reduce_outerstrides[iiter] == 0) { + if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { + if (ad_strides[iop] == 0) { + strides[iop] = 0; + if (reduce_outerstrides[iop] == 0) { op_transfersize = 1; dst_stride = 0; src_strides = &dst_stride; @@ -5839,36 +5858,36 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) * intermediate calculation. */ if (prev_dataptrs && - prev_dataptrs[iiter] == ad_ptrs[iiter]) { + prev_dataptrs[iop] == ad_ptrs[iop]) { NPY_IT_DBG_PRINT1("Iterator: skipping operand %d" " copy because it's a 1-element reduce\n", - (int)iiter); + (int)iop); skip_transfer = 1; } } else { op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - dst_stride = reduce_outerstrides[iiter]; - src_strides = &NAD_STRIDES(reduce_outeraxisdata)[iiter]; + dst_stride = reduce_outerstrides[iop]; + src_strides = &NAD_STRIDES(reduce_outeraxisdata)[iop]; src_coords = &NAD_INDEX(reduce_outeraxisdata); src_shape = &NAD_SHAPE(reduce_outeraxisdata); ndim_transfer = ndim - reduce_outerdim; } } else { - if (reduce_outerstrides[iiter] == 0) { + if (reduce_outerstrides[iop] == 0) { op_transfersize = NBF_SIZE(bufferdata); - dst_stride = strides[iiter]; - src_strides = &ad_strides[iiter]; + dst_stride = strides[iop]; + src_strides = &ad_strides[iop]; src_coords = &NAD_INDEX(axisdata); src_shape = &NAD_SHAPE(axisdata); ndim_transfer = reduce_outerdim ? 
reduce_outerdim : 1; } else { op_transfersize = transfersize; - dst_stride = strides[iiter]; - src_strides = &ad_strides[iiter]; + dst_stride = strides[iop]; + src_strides = &ad_strides[iop]; src_coords = &NAD_INDEX(axisdata); src_shape = &NAD_SHAPE(axisdata); ndim_transfer = ndim; @@ -5877,8 +5896,8 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) } else { op_transfersize = transfersize; - dst_stride = strides[iiter]; - src_strides = &ad_strides[iiter]; + dst_stride = strides[iop]; + src_strides = &ad_strides[iop]; src_coords = &NAD_INDEX(axisdata); src_shape = &NAD_SHAPE(axisdata); ndim_transfer = ndim; @@ -5889,19 +5908,19 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) * and the source pointer for this data didn't change, * we don't have to copy the data again. */ - if (reuse_reduce_loops && prev_dataptrs[iiter] == ad_ptrs[iiter]) { + if (reuse_reduce_loops && prev_dataptrs[iop] == ad_ptrs[iop]) { NPY_IT_DBG_PRINT2("Iterator: skipping operands %d " "copy (%d items) because loops are reused and the data " "pointer didn't change\n", - (int)iiter, (int)op_transfersize); + (int)iop, (int)op_transfersize); skip_transfer = 1; } /* If the data type requires zero-inititialization */ - if (PyDataType_FLAGCHK(dtypes[iiter], NPY_NEEDS_INIT)) { + if (PyDataType_FLAGCHK(dtypes[iop], NPY_NEEDS_INIT)) { NPY_IT_DBG_PRINT("Iterator: Buffer requires init, " "memsetting to 0\n"); - memset(ptrs[iiter], 0, dtypes[iiter]->elsize*op_transfersize); + memset(ptrs[iop], 0, dtypes[iop]->elsize*op_transfersize); /* Can't skip the transfer in this case */ skip_transfer = 0; } @@ -5909,11 +5928,11 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) if (!skip_transfer) { NPY_IT_DBG_PRINT2("Iterator: Copying operand %d to " "buffer (%d items)\n", - (int)iiter, (int)op_transfersize); + (int)iop, (int)op_transfersize); PyArray_TransferNDimToStrided(ndim_transfer, - ptrs[iiter], dst_stride, - ad_ptrs[iiter], src_strides, axisdata_incr, + ptrs[iop], 
dst_stride, + ad_ptrs[iop], src_strides, axisdata_incr, src_coords, axisdata_incr, src_shape, axisdata_incr, op_transfersize, src_itemsize, @@ -5921,13 +5940,13 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) transferdata); } } - else if (ptrs[iiter] == buffers[iiter]) { + else if (ptrs[iop] == buffers[iop]) { /* If the data type requires zero-inititialization */ - if (PyDataType_FLAGCHK(dtypes[iiter], NPY_NEEDS_INIT)) { + if (PyDataType_FLAGCHK(dtypes[iop], NPY_NEEDS_INIT)) { NPY_IT_DBG_PRINT1("Iterator: Write-only buffer for " "operand %d requires init, " - "memsetting to 0\n", (int)iiter); - memset(ptrs[iiter], 0, dtypes[iiter]->elsize*transfersize); + "memsetting to 0\n", (int)iop); + memset(ptrs[iop], 0, dtypes[iop]->elsize*transfersize); } } @@ -5980,7 +5999,7 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; @@ -6000,14 +6019,16 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, return count; } - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_AXISDATA(iter); /* Indicate which REDUCE operands have stride 0 in the inner loop */ strides = NAD_STRIDES(axisdata); - for (iiter = 0; iiter < niter; ++iiter) { - stride0op[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) && - (strides[iiter] == 0); + for (iop = 0; iop < nop; ++iop) { + stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && + (strides[iop] == 0); + NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " + "the inner loop? 
%d\n", iop, (int)stride0op[iop]); } shape = NAD_SHAPE(axisdata); coord = NAD_INDEX(axisdata); @@ -6018,8 +6039,11 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, /* Go forward through axisdata, calculating the space available */ for (idim = 1; idim < ndim && reducespace < count; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + NPY_IT_DBG_PRINT2("Iterator: inner loop reducespace %d, count %d\n", + (int)reducespace, (int)count); + strides = NAD_STRIDES(axisdata); - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { /* * If a reduce stride switched from zero to non-zero, or * vice versa, that's the point where the data will stop @@ -6027,10 +6051,10 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, * buffer starts with an all zero multi-index up to this * point, gives us the reduce_innersize. */ - if((stride0op[iiter] && (strides[iiter] != 0)) || - (!stride0op[iiter] && - (strides[iiter] == 0) && - (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE))) { + if((stride0op[iop] && (strides[iop] != 0)) || + (!stride0op[iop] && + (strides[iop] == 0) && + (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " "buffer to %d\n", (int)reducespace); /* @@ -6056,7 +6080,9 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, } } /* If we broke out of the loop early, we found reduce_innersize */ - if (iiter != niter) { + if (iop != nop) { + NPY_IT_DBG_PRINT2("Iterator: Found first dim not " + "reduce (%d of %d)\n", iop, nop); break; } @@ -6069,8 +6095,12 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, factor *= shape; } - /* If there was any non-zero coordinate, can't do the double loop */ - if (nonzerocoord) { + /* + * If there was any non-zero coordinate, the reduction inner + * loop doesn't fit in the buffersize, or the reduction inner loop + * covered the entire iteration size, can't do the double loop. 
+ */ + if (nonzerocoord || count < reducespace || idim == ndim) { if (reducespace < count) { count = reducespace; } @@ -6086,6 +6116,9 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, *reduce_innersize = reducespace; count /= reducespace; + NPY_IT_DBG_PRINT2("Iterator: reduce_innersize %d count /ed %d\n", + (int)reducespace, (int)count); + /* * Continue through the rest of the dimensions. If there are * two separated reduction axes, we may have to cut the buffer @@ -6096,9 +6129,11 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, factor = 1; /* Indicate which REDUCE operands have stride 0 at the current level */ strides = NAD_STRIDES(axisdata); - for (iiter = 0; iiter < niter; ++iiter) { - stride0op[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) && - (strides[iiter] == 0); + for (iop = 0; iop < nop; ++iop) { + stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && + (strides[iop] == 0); + NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " + "the outer loop? %d\n", iop, (int)stride0op[iop]); } shape = NAD_SHAPE(axisdata); coord = NAD_INDEX(axisdata); @@ -6109,8 +6144,10 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, for (; idim < ndim && reducespace < count; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + NPY_IT_DBG_PRINT2("Iterator: outer loop reducespace %d, count %d\n", + (int)reducespace, (int)count); strides = NAD_STRIDES(axisdata); - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { /* * If a reduce stride switched from zero to non-zero, or * vice versa, that's the point where the data will stop @@ -6118,10 +6155,10 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, * buffer starts with an all zero multi-index up to this * point, gives us the reduce_innersize. 
*/ - if((stride0op[iiter] && (strides[iiter] != 0)) || - (!stride0op[iiter] && - (strides[iiter] == 0) && - (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE))) { + if((stride0op[iop] && (strides[iop] != 0)) || + (!stride0op[iop] && + (strides[iop] == 0) && + (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " "buffer to %d\n", (int)reducespace); /* @@ -6161,7 +6198,7 @@ NpyIter_DebugPrint(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); - int iiter, niter = NIT_NITER(iter); + int iop, nop = NIT_NOP(iter); NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; @@ -6201,17 +6238,17 @@ NpyIter_DebugPrint(NpyIter *iter) printf("REUSE_REDUCE_LOOPS "); printf("\n"); printf("| NDim: %d\n", (int)ndim); - printf("| NIter: %d\n", (int)niter); + printf("| NOp: %d\n", (int)nop); printf("| IterSize: %d\n", (int)NIT_ITERSIZE(iter)); printf("| IterStart: %d\n", (int)NIT_ITERSTART(iter)); printf("| IterEnd: %d\n", (int)NIT_ITEREND(iter)); printf("| IterIndex: %d\n", (int)NIT_ITERINDEX(iter)); printf("| Iterator SizeOf: %d\n", - (int)NIT_SIZEOF_ITERATOR(itflags, ndim, niter)); + (int)NIT_SIZEOF_ITERATOR(itflags, ndim, nop)); printf("| BufferData SizeOf: %d\n", - (int)NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter)); + (int)NIT_BUFFERDATA_SIZEOF(itflags, ndim, nop)); printf("| AxisData SizeOf: %d\n", - (int)NIT_AXISDATA_SIZEOF(itflags, ndim, niter)); + (int)NIT_AXISDATA_SIZEOF(itflags, ndim, nop)); printf("|\n"); printf("| Perm: "); @@ -6220,43 +6257,43 @@ NpyIter_DebugPrint(NpyIter *iter) } printf("\n"); printf("| DTypes: "); - for (iiter = 0; iiter < niter; ++iiter) { - printf("%p ", (void *)NIT_DTYPES(iter)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%p ", (void *)NIT_DTYPES(iter)[iop]); } printf("\n"); printf("| DTypes: "); - for (iiter = 0; iiter < niter; ++iiter) { - if (NIT_DTYPES(iter)[iiter] != NULL) - PyObject_Print((PyObject*)NIT_DTYPES(iter)[iiter], stdout, 0); + for (iop = 0; iop < nop; 
++iop) { + if (NIT_DTYPES(iter)[iop] != NULL) + PyObject_Print((PyObject*)NIT_DTYPES(iter)[iop], stdout, 0); else printf("(nil) "); printf(" "); } printf("\n"); printf("| InitDataPtrs: "); - for (iiter = 0; iiter < niter; ++iiter) { - printf("%p ", (void *)NIT_RESETDATAPTR(iter)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%p ", (void *)NIT_RESETDATAPTR(iter)[iop]); } printf("\n"); printf("| BaseOffsets: "); - for (iiter = 0; iiter < niter; ++iiter) { - printf("%i ", (int)NIT_BASEOFFSETS(iter)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%i ", (int)NIT_BASEOFFSETS(iter)[iop]); } printf("\n"); if (itflags&NPY_ITFLAG_HASINDEX) { printf("| InitIndex: %d\n", - (int)(npy_intp)NIT_RESETDATAPTR(iter)[niter]); + (int)(npy_intp)NIT_RESETDATAPTR(iter)[nop]); } printf("| Operands: "); - for (iiter = 0; iiter < niter; ++iiter) { - printf("%p ", (void *)NIT_OPERANDS(iter)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%p ", (void *)NIT_OPERANDS(iter)[iop]); } printf("\n"); printf("| Operand DTypes: "); - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { PyArray_Descr *dtype; - if (NIT_OPERANDS(iter)[iiter] != NULL) { - dtype = PyArray_DESCR(NIT_OPERANDS(iter)[iiter]); + if (NIT_OPERANDS(iter)[iop] != NULL) { + dtype = PyArray_DESCR(NIT_OPERANDS(iter)[iop]); if (dtype != NULL) PyObject_Print((PyObject *)dtype, stdout, 0); else @@ -6269,19 +6306,19 @@ NpyIter_DebugPrint(NpyIter *iter) } printf("\n"); printf("| OpItFlags:\n"); - for (iiter = 0; iiter < niter; ++iiter) { - printf("| Flags[%d]: ", (int)iiter); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_READ) + for (iop = 0; iop < nop; ++iop) { + printf("| Flags[%d]: ", (int)iop); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_READ) printf("READ "); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE) + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_WRITE) printf("WRITE "); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_CAST) + if 
((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_CAST) printf("CAST "); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFNEVER) + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_BUFNEVER) printf("BUFNEVER "); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_ALIGNED) + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_ALIGNED) printf("ALIGNED "); - if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_REDUCE) + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_REDUCE) printf("REDUCE "); printf("\n"); } @@ -6302,77 +6339,77 @@ NpyIter_DebugPrint(NpyIter *iter) (int)NBF_REDUCE_OUTERDIM(bufferdata)); } printf("| Strides: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%d ", (int)NBF_STRIDES(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%d ", (int)NBF_STRIDES(bufferdata)[iop]); printf("\n"); /* Print the fixed strides when there's no inner loop */ if (itflags&NPY_ITFLAG_EXLOOP) { npy_intp fixedstrides[NPY_MAXDIMS]; printf("| Fixed Strides: "); NpyIter_GetInnerFixedStrideArray(iter, fixedstrides); - for (iiter = 0; iiter < niter; ++iiter) - printf("%d ", (int)fixedstrides[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%d ", (int)fixedstrides[iop]); printf("\n"); } printf("| Ptrs: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_PTRS(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_PTRS(bufferdata)[iop]); printf("\n"); if (itflags&NPY_ITFLAG_REDUCE) { printf("| REDUCE Outer Strides: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%d ", (int)NBF_REDUCE_OUTERSTRIDES(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%d ", (int)NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop]); printf("\n"); printf("| REDUCE Outer Ptrs: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_REDUCE_OUTERPTRS(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_REDUCE_OUTERPTRS(bufferdata)[iop]); printf("\n"); } printf("| ReadTransferFn: "); - for (iiter = 0; iiter 
< niter; ++iiter) - printf("%p ", (void *)NBF_READTRANSFERFN(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_READTRANSFERFN(bufferdata)[iop]); printf("\n"); printf("| ReadTransferData: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_READTRANSFERDATA(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_READTRANSFERDATA(bufferdata)[iop]); printf("\n"); printf("| WriteTransferFn: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_WRITETRANSFERFN(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_WRITETRANSFERFN(bufferdata)[iop]); printf("\n"); printf("| WriteTransferData: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_WRITETRANSFERDATA(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_WRITETRANSFERDATA(bufferdata)[iop]); printf("\n"); printf("| Buffers: "); - for (iiter = 0; iiter < niter; ++iiter) - printf("%p ", (void *)NBF_BUFFERS(bufferdata)[iiter]); + for (iop = 0; iop < nop; ++iop) + printf("%p ", (void *)NBF_BUFFERS(bufferdata)[iop]); printf("\n"); printf("|\n"); } axisdata = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { printf("| AxisData[%d]:\n", (int)idim); printf("| Shape: %d\n", (int)NAD_SHAPE(axisdata)); printf("| Index: %d\n", (int)NAD_INDEX(axisdata)); printf("| Strides: "); - for (iiter = 0; iiter < niter; ++iiter) { - printf("%d ", (int)NAD_STRIDES(axisdata)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%d ", (int)NAD_STRIDES(axisdata)[iop]); } printf("\n"); if (itflags&NPY_ITFLAG_HASINDEX) { - printf("| Index Stride: %d\n", (int)NAD_STRIDES(axisdata)[niter]); + printf("| Index Stride: %d\n", (int)NAD_STRIDES(axisdata)[nop]); } printf("| Ptrs: "); - for (iiter = 0; iiter < 
niter; ++iiter) { - printf("%p ", (void *)NAD_PTRS(axisdata)[iiter]); + for (iop = 0; iop < nop; ++iop) { + printf("%p ", (void *)NAD_PTRS(axisdata)[iop]); } printf("\n"); if (itflags&NPY_ITFLAG_HASINDEX) { printf("| Index Value: %d\n", - (int)((npy_intp*)NAD_PTRS(axisdata))[niter]); + (int)((npy_intp*)NAD_PTRS(axisdata))[nop]); } } diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index 3d13d9ec43a9..5227f9c1e9fe 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -443,9 +443,9 @@ NpyIter_OpFlagsConverter(PyObject *op_flags_in, static int npyiter_convert_op_flags_array(PyObject *op_flags_in, - npy_uint32 *op_flags_array, npy_intp niter) + npy_uint32 *op_flags_array, npy_intp nop) { - npy_intp iiter; + npy_intp iop; if (!PyTuple_Check(op_flags_in) && !PyList_Check(op_flags_in)) { PyErr_SetString(PyExc_ValueError, @@ -453,22 +453,22 @@ npyiter_convert_op_flags_array(PyObject *op_flags_in, return 0; } - if (PySequence_Size(op_flags_in) != niter) { + if (PySequence_Size(op_flags_in) != nop) { goto try_single_flags; } - for (iiter = 0; iiter < niter; ++iiter) { - PyObject *f = PySequence_GetItem(op_flags_in, iiter); + for (iop = 0; iop < nop; ++iop) { + PyObject *f = PySequence_GetItem(op_flags_in, iop); if (f == NULL) { return 0; } /* If the first item is a string, try as one set of flags */ - if (iiter == 0 && (PyBytes_Check(f) || PyUnicode_Check(f))) { + if (iop == 0 && (PyBytes_Check(f) || PyUnicode_Check(f))) { Py_DECREF(f); goto try_single_flags; } if (NpyIter_OpFlagsConverter(f, - &op_flags_array[iiter]) != 1) { + &op_flags_array[iop]) != 1) { Py_DECREF(f); return 0; } @@ -484,8 +484,8 @@ npyiter_convert_op_flags_array(PyObject *op_flags_in, return 0; } - for (iiter = 1; iiter < niter; ++iiter) { - op_flags_array[iiter] = op_flags_array[0]; + for (iop = 1; iop < nop; ++iop) { + op_flags_array[iop] = op_flags_array[0]; } return 1; @@ -494,33 +494,33 @@ 
npyiter_convert_op_flags_array(PyObject *op_flags_in, static int npyiter_convert_dtypes(PyObject *op_dtypes_in, PyArray_Descr **op_dtypes, - npy_intp niter) + npy_intp nop) { - npy_intp iiter; + npy_intp iop; /* * If the input isn't a tuple of dtypes, try converting it as-is * to a dtype, and replicating to all operands. */ if ((!PyTuple_Check(op_dtypes_in) && !PyList_Check(op_dtypes_in)) || - PySequence_Size(op_dtypes_in) != niter) { + PySequence_Size(op_dtypes_in) != nop) { goto try_single_dtype; } - for (iiter = 0; iiter < niter; ++iiter) { - PyObject *dtype = PySequence_GetItem(op_dtypes_in, iiter); + for (iop = 0; iop < nop; ++iop) { + PyObject *dtype = PySequence_GetItem(op_dtypes_in, iop); if (dtype == NULL) { npy_intp i; - for (i = 0; i < iiter; ++i ) { + for (i = 0; i < iop; ++i ) { Py_XDECREF(op_dtypes[i]); } return 0; } /* Try converting the object to a descr */ - if (PyArray_DescrConverter2(dtype, &op_dtypes[iiter]) != 1) { + if (PyArray_DescrConverter2(dtype, &op_dtypes[iop]) != 1) { npy_intp i; - for (i = 0; i < iiter; ++i ) { + for (i = 0; i < iop; ++i ) { Py_XDECREF(op_dtypes[i]); } Py_DECREF(dtype); @@ -535,9 +535,9 @@ npyiter_convert_dtypes(PyObject *op_dtypes_in, try_single_dtype: if (PyArray_DescrConverter2(op_dtypes_in, &op_dtypes[0]) == 1) { - for (iiter = 1; iiter < niter; ++iiter) { - op_dtypes[iiter] = op_dtypes[0]; - Py_XINCREF(op_dtypes[iiter]); + for (iop = 1; iop < nop; ++iop) { + op_dtypes[iop] = op_dtypes[0]; + Py_XINCREF(op_dtypes[iop]); } return 1; } @@ -546,14 +546,14 @@ npyiter_convert_dtypes(PyObject *op_dtypes_in, } static int -npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp niter, +npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp nop, int **op_axes, int *oa_ndim) { PyObject *a; - int iiter; + int iop; if ((!PyTuple_Check(op_axes_in) && !PyList_Check(op_axes_in)) || - PySequence_Size(op_axes_in) != niter) { + PySequence_Size(op_axes_in) != nop) { PyErr_SetString(PyExc_ValueError, "op_axes must be a tuple/list 
matching the number of ops"); return 0; @@ -562,14 +562,14 @@ npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp niter, *oa_ndim = 0; /* Copy the tuples into op_axes */ - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { int idim; - a = PySequence_GetItem(op_axes_in, iiter); + a = PySequence_GetItem(op_axes_in, iop); if (a == NULL) { return 0; } if (a == Py_None) { - op_axes[iiter] = NULL; + op_axes[iop] = NULL; } else { if (!PyTuple_Check(a) && !PyList_Check(a)) { PyErr_SetString(PyExc_ValueError, @@ -605,11 +605,11 @@ npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp niter, } /* numpy.newaxis is None */ if (v == Py_None) { - op_axes[iiter][idim] = -1; + op_axes[iop][idim] = -1; } else { - op_axes[iiter][idim] = PyInt_AsLong(v); - if (op_axes[iiter][idim]==-1 && + op_axes[iop][idim] = PyInt_AsLong(v); + if (op_axes[iop][idim]==-1 && PyErr_Occurred()) { Py_DECREF(a); Py_DECREF(v); @@ -634,34 +634,34 @@ npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp niter, /* * Converts the operand array and op_flags array into the form NpyIter_AdvancedNew - * needs. Sets niter, and on success, each op[i] owns a reference + * needs. Sets nop, and on success, each op[i] owns a reference * to an array object. 
*/ static int npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in, PyArrayObject **op, npy_uint32 *op_flags, - int *niter_out) + int *nop_out) { - int iiter, niter; + int iop, nop; - /* niter and op */ + /* nop and op */ if (PyTuple_Check(op_in) || PyList_Check(op_in)) { - niter = PySequence_Size(op_in); - if (niter == 0) { + nop = PySequence_Size(op_in); + if (nop == 0) { PyErr_SetString(PyExc_ValueError, "Must provide at least one operand"); return 0; } - if (niter > NPY_MAXARGS) { + if (nop > NPY_MAXARGS) { PyErr_SetString(PyExc_ValueError, "Too many operands"); return 0; } - for (iiter = 0; iiter < niter; ++iiter) { - PyObject *item = PySequence_GetItem(op_in, iiter); + for (iop = 0; iop < nop; ++iop) { + PyObject *item = PySequence_GetItem(op_in, iop); if (item == NULL) { npy_intp i; - for (i = 0; i < iiter; ++i) { + for (i = 0; i < iop; ++i) { Py_XDECREF(op[i]); } return 0; @@ -671,53 +671,53 @@ npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in, item = NULL; } /* This is converted to an array after op flags are retrieved */ - op[iiter] = (PyArrayObject *)item; + op[iop] = (PyArrayObject *)item; } } else { - niter = 1; + nop = 1; /* Is converted to an array after op flags are retrieved */ Py_INCREF(op_in); op[0] = (PyArrayObject *)op_in; } - *niter_out = niter; + *nop_out = nop; /* op_flags */ if (op_flags_in == NULL || op_flags_in == Py_None) { - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { /* * By default, make NULL operands writeonly and flagged for * allocation, and everything else readonly. To write * to a provided operand, you must specify the write flag manually. 
*/ - if (op[iiter] == NULL) { - op_flags[iiter] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE; + if (op[iop] == NULL) { + op_flags[iop] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE; } else { - op_flags[iiter] = NPY_ITER_READONLY; + op_flags[iop] = NPY_ITER_READONLY; } } } else if (npyiter_convert_op_flags_array(op_flags_in, - op_flags, niter) != 1) { - for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op[iiter]); + op_flags, nop) != 1) { + for (iop = 0; iop < nop; ++iop) { + Py_XDECREF(op[iop]); } - *niter_out = 0; + *nop_out = 0; return 0; } /* Now that we have the flags - convert all the ops to arrays */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] != NULL) { + for (iop = 0; iop < nop; ++iop) { + if (op[iop] != NULL) { PyArrayObject *ao; int fromanyflags = 0; - if (op_flags[iiter]&(NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) { + if (op_flags[iop]&(NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) { fromanyflags = NPY_UPDATEIFCOPY; } - ao = (PyArrayObject *)PyArray_FromAny((PyObject *)op[iiter], + ao = (PyArrayObject *)PyArray_FromAny((PyObject *)op[iop], NULL, 0, 0, fromanyflags, NULL); if (ao == NULL) { if (PyErr_Occurred() && @@ -727,14 +727,14 @@ npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in, "but is an object which cannot be written " "back to via UPDATEIFCOPY"); } - for (iiter = 0; iiter < niter; ++iiter) { - Py_DECREF(op[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_DECREF(op[iop]); } - *niter_out = 0; + *nop_out = 0; return 0; } - Py_DECREF(op[iiter]); - op[iiter] = ao; + Py_DECREF(op[iop]); + op[iop] = ao; } } @@ -752,7 +752,7 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) PyObject *op_in = NULL, *op_flags_in = NULL, *op_dtypes_in = NULL, *op_axes_in = NULL; - int iiter, niter = 0; + int iop, nop = 0; PyArrayObject *op[NPY_MAXARGS]; npy_uint32 flags = 0; NPY_ORDER order = NPY_KEEPORDER; @@ -791,7 +791,7 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) memset(op_request_dtypes, 0, 
sizeof(op_request_dtypes)); /* op and op_flags */ - if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &niter) + if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &nop) != 1) { goto fail; } @@ -799,18 +799,18 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) /* op_request_dtypes */ if (op_dtypes_in != NULL && op_dtypes_in != Py_None && npyiter_convert_dtypes(op_dtypes_in, - op_request_dtypes, niter) != 1) { + op_request_dtypes, nop) != 1) { goto fail; } /* op_axes */ if (op_axes_in != NULL && op_axes_in != Py_None) { /* Initialize to point to the op_axes arrays */ - for (iiter = 0; iiter < niter; ++iiter) { - op_axes[iiter] = op_axes_arrays[iiter]; + for (iop = 0; iop < nop; ++iop) { + op_axes[iop] = op_axes_arrays[iop]; } - if (npyiter_convert_op_axes(op_axes_in, niter, + if (npyiter_convert_op_axes(op_axes_in, nop, op_axes, &oa_ndim) != 1) { goto fail; } @@ -833,7 +833,7 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) itershape.ptr = NULL; } - self->iter = NpyIter_AdvancedNew(niter, op, flags, order, casting, op_flags, + self->iter = NpyIter_AdvancedNew(nop, op, flags, order, casting, op_flags, op_request_dtypes, oa_ndim, oa_ndim > 0 ? 
op_axes : NULL, itershape.ptr, @@ -860,9 +860,9 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) } /* Release the references we got to the ops and dtypes */ - for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op[iiter]); - Py_XDECREF(op_request_dtypes[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_XDECREF(op[iop]); + Py_XDECREF(op_request_dtypes[iop]); } return 0; @@ -871,9 +871,9 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds) if (itershape.ptr != NULL) { PyDimMem_FREE(itershape.ptr); } - for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op[iiter]); - Py_XDECREF(op_request_dtypes[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_XDECREF(op[iop]); + Py_XDECREF(op_request_dtypes[iop]); } return -1; } @@ -890,7 +890,7 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), PyObject *op_in = NULL, *axes_in = NULL, *op_flags_in = NULL, *op_dtypes_in = NULL; - int iiter, niter = 0, inest, nnest = 0; + int iop, nop = 0, inest, nnest = 0; PyArrayObject *op[NPY_MAXARGS]; npy_uint32 flags = 0, flags_inner = 0; NPY_ORDER order = NPY_KEEPORDER; @@ -986,20 +986,20 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), } /* op and op_flags */ - if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &niter) + if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &nop) != 1) { return NULL; } /* Set the dtypes to all NULL to start as well */ - memset(op_request_dtypes, 0, sizeof(op_request_dtypes[0])*niter); + memset(op_request_dtypes, 0, sizeof(op_request_dtypes[0])*nop); memset(op_request_dtypes_inner, 0, - sizeof(op_request_dtypes_inner[0])*niter); + sizeof(op_request_dtypes_inner[0])*nop); /* op_request_dtypes */ if (op_dtypes_in != NULL && op_dtypes_in != Py_None && npyiter_convert_dtypes(op_dtypes_in, - op_request_dtypes, niter) != 1) { + op_request_dtypes, nop) != 1) { goto fail; } @@ -1018,16 +1018,16 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), * to indicate exactly the allocated outputs. 
Also, separate * the inner loop flags. */ - for (iiter = 0; iiter < niter; ++iiter) { - if ((op_flags[iiter]&NPY_ITER_ALLOCATE) && op[iiter] != NULL) { - op_flags[iiter] &= ~NPY_ITER_ALLOCATE; + for (iop = 0; iop < nop; ++iop) { + if ((op_flags[iop]&NPY_ITER_ALLOCATE) && op[iop] != NULL) { + op_flags[iop] &= ~NPY_ITER_ALLOCATE; } /* * Clear any flags allowing copies or output allocation for * the inner loop. */ - op_flags_inner[iiter] = op_flags[iiter] & ~(NPY_ITER_COPY| + op_flags_inner[iop] = op_flags[iop] & ~(NPY_ITER_COPY| NPY_ITER_UPDATEIFCOPY| NPY_ITER_ALLOCATE); /* @@ -1036,12 +1036,12 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), * for the outer loops. */ if ((flags&(NPY_ITER_BUFFERED)) && - !(op_flags[iiter]&(NPY_ITER_COPY| + !(op_flags[iop]&(NPY_ITER_COPY| NPY_ITER_UPDATEIFCOPY| NPY_ITER_ALLOCATE))) { - op_flags[iiter] &= ~(NPY_ITER_NBO|NPY_ITER_ALIGNED|NPY_ITER_CONTIG); - op_request_dtypes_inner[iiter] = op_request_dtypes[iiter]; - op_request_dtypes[iiter] = NULL; + op_flags[iop] &= ~(NPY_ITER_NBO|NPY_ITER_ALIGNED|NPY_ITER_CONTIG); + op_request_dtypes_inner[iop] = op_request_dtypes[iop]; + op_request_dtypes[iop] = NULL; } } @@ -1052,33 +1052,33 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), for (inest = 0; inest < nnest; ++inest) { NewNpyArrayIterObject *iter; - int *op_axes_niter[NPY_MAXARGS]; + int *op_axes_nop[NPY_MAXARGS]; /* * All the operands' op_axes are the same, except for * allocated outputs. 
*/ - for (iiter = 0; iiter < niter; ++iiter) { - if (op_flags[iiter]&NPY_ITER_ALLOCATE) { + for (iop = 0; iop < nop; ++iop) { + if (op_flags[iop]&NPY_ITER_ALLOCATE) { if (inest == 0) { - op_axes_niter[iiter] = NULL; + op_axes_nop[iop] = NULL; } else { - op_axes_niter[iiter] = negones; + op_axes_nop[iop] = negones; } } else { - op_axes_niter[iiter] = nested_op_axes[inest]; + op_axes_nop[iop] = nested_op_axes[inest]; } } /* printf("\n"); - for (iiter = 0; iiter < niter; ++iiter) { + for (iop = 0; iop < nop; ++iop) { npy_intp i; for (i = 0; i < nested_naxes[inest]; ++i) { - printf("%d ", (int)op_axes_niter[iiter][i]); + printf("%d ", (int)op_axes_nop[iop][i]); } printf("\n"); } @@ -1092,17 +1092,17 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), } if (inest < nnest-1) { - iter->iter = NpyIter_AdvancedNew(niter, op, flags, order, + iter->iter = NpyIter_AdvancedNew(nop, op, flags, order, casting, op_flags, op_request_dtypes, - nested_naxes[inest], op_axes_niter, + nested_naxes[inest], op_axes_nop, NULL, 0); } else { - iter->iter = NpyIter_AdvancedNew(niter, op, flags_inner, order, + iter->iter = NpyIter_AdvancedNew(nop, op, flags_inner, order, casting, op_flags_inner, op_request_dtypes_inner, - nested_naxes[inest], op_axes_niter, + nested_naxes[inest], op_axes_nop, NULL, buffersize); } @@ -1130,18 +1130,18 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), */ if (inest == 0) { PyArrayObject **operands = NpyIter_GetOperandArray(iter->iter); - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] != operands[iiter]) { - Py_XDECREF(op[iiter]); - op[iiter] = operands[iiter]; - Py_INCREF(op[iiter]); + for (iop = 0; iop < nop; ++iop) { + if (op[iop] != operands[iop]) { + Py_XDECREF(op[iop]); + op[iop] = operands[iop]; + Py_INCREF(op[iop]); } /* * Clear any flags allowing copies for * the rest of the iterators */ - op_flags[iiter] &= ~(NPY_ITER_COPY| + op_flags[iop] &= ~(NPY_ITER_COPY| NPY_ITER_UPDATEIFCOPY); } /* Clear the common dtype flag for the rest of the 
iterators */ @@ -1152,10 +1152,10 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), } /* Release our references to the ops and dtypes */ - for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op[iiter]); - Py_XDECREF(op_request_dtypes[iiter]); - Py_XDECREF(op_request_dtypes_inner[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_XDECREF(op[iop]); + Py_XDECREF(op_request_dtypes[iop]); + Py_XDECREF(op_request_dtypes_inner[iop]); } /* Set up the nested child references */ @@ -1183,10 +1183,10 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self), return ret; fail: - for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op[iiter]); - Py_XDECREF(op_request_dtypes[iiter]); - Py_XDECREF(op_request_dtypes_inner[iiter]); + for (iop = 0; iop < nop; ++iop) { + Py_XDECREF(op[iop]); + Py_XDECREF(op_request_dtypes[iop]); + Py_XDECREF(op_request_dtypes_inner[iop]); } return NULL; } @@ -1416,9 +1416,7 @@ static PyObject *npyiter_value_get(NewNpyArrayIterObject *self) { PyObject *ret; - npy_intp iiter, niter; - PyArray_Descr **dtypes; - char **dataptrs; + npy_intp iop, nop; if (self->iter == NULL || self->finished) { PyErr_SetString(PyExc_ValueError, @@ -1426,26 +1424,24 @@ static PyObject *npyiter_value_get(NewNpyArrayIterObject *self) return NULL; } - niter = NpyIter_GetNIter(self->iter); - dtypes = self->dtypes; - dataptrs = self->dataptrs; + nop = NpyIter_GetNOp(self->iter); /* Return an array or tuple of arrays with the values */ - if (niter == 1) { + if (nop == 1) { ret = npyiter_seq_item(self, 0); } else { - ret = PyTuple_New(niter); + ret = PyTuple_New(nop); if (ret == NULL) { return NULL; } - for (iiter = 0; iiter < niter; ++iiter) { - PyObject *a = npyiter_seq_item(self, iiter); + for (iop = 0; iop < nop; ++iop) { + PyObject *a = npyiter_seq_item(self, iop); if (a == NULL) { Py_DECREF(ret); return NULL; } - PyTuple_SET_ITEM(ret, iiter, a); + PyTuple_SET_ITEM(ret, iop, a); } } @@ -1456,7 +1452,7 @@ static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self) { 
PyObject *ret; - npy_intp iiter, niter; + npy_intp iop, nop; PyArrayObject **operands; if (self->iter == NULL) { @@ -1465,18 +1461,18 @@ static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self) return NULL; } - niter = NpyIter_GetNIter(self->iter); + nop = NpyIter_GetNOp(self->iter); operands = self->operands; - ret = PyTuple_New(niter); + ret = PyTuple_New(nop); if (ret == NULL) { return NULL; } - for (iiter = 0; iiter < niter; ++iiter) { - PyObject *operand = (PyObject *)operands[iiter]; + for (iop = 0; iop < nop; ++iop) { + PyObject *operand = (PyObject *)operands[iop]; Py_INCREF(operand); - PyTuple_SET_ITEM(ret, iiter, operand); + PyTuple_SET_ITEM(ret, iop, operand); } return ret; @@ -1486,7 +1482,7 @@ static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self) { PyObject *ret; - npy_intp iiter, niter; + npy_intp iop, nop; if (self->iter == NULL) { PyErr_SetString(PyExc_ValueError, @@ -1494,20 +1490,20 @@ static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self) return NULL; } - niter = NpyIter_GetNIter(self->iter); + nop = NpyIter_GetNOp(self->iter); - ret = PyTuple_New(niter); + ret = PyTuple_New(nop); if (ret == NULL) { return NULL; } - for (iiter = 0; iiter < niter; ++iiter) { - PyArrayObject *view = NpyIter_GetIterView(self->iter, iiter); + for (iop = 0; iop < nop; ++iop) { + PyArrayObject *view = NpyIter_GetIterView(self->iter, iop); if (view == NULL) { Py_DECREF(ret); return NULL; } - PyTuple_SET_ITEM(ret, iiter, (PyObject *)view); + PyTuple_SET_ITEM(ret, iop, (PyObject *)view); } return ret; @@ -1909,7 +1905,7 @@ static PyObject *npyiter_dtypes_get(NewNpyArrayIterObject *self) { PyObject *ret; - npy_intp iiter, niter; + npy_intp iop, nop; PyArray_Descr **dtypes; if (self->iter == NULL) { @@ -1918,18 +1914,18 @@ static PyObject *npyiter_dtypes_get(NewNpyArrayIterObject *self) return NULL; } - niter = NpyIter_GetNIter(self->iter); + nop = NpyIter_GetNOp(self->iter); - ret = PyTuple_New(niter); + ret = PyTuple_New(nop); if (ret == 
NULL) { return NULL; } dtypes = self->dtypes; - for (iiter = 0; iiter < niter; ++iiter) { - PyArray_Descr *dtype = dtypes[iiter]; + for (iop = 0; iop < nop; ++iop) { + PyArray_Descr *dtype = dtypes[iop]; Py_INCREF(dtype); - PyTuple_SET_ITEM(ret, iiter, (PyObject *)dtype); + PyTuple_SET_ITEM(ret, iop, (PyObject *)dtype); } return ret; @@ -1946,7 +1942,7 @@ static PyObject *npyiter_ndim_get(NewNpyArrayIterObject *self) return PyInt_FromLong(NpyIter_GetNDim(self->iter)); } -static PyObject *npyiter_niter_get(NewNpyArrayIterObject *self) +static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self) { if (self->iter == NULL) { PyErr_SetString(PyExc_ValueError, @@ -1954,7 +1950,7 @@ static PyObject *npyiter_niter_get(NewNpyArrayIterObject *self) return NULL; } - return PyInt_FromLong(NpyIter_GetNIter(self->iter)); + return PyInt_FromLong(NpyIter_GetNOp(self->iter)); } static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self) @@ -1985,7 +1981,7 @@ npyiter_seq_length(NewNpyArrayIterObject *self) return 0; } else { - return NpyIter_GetNIter(self->iter); + return NpyIter_GetNOp(self->iter); } } @@ -1995,7 +1991,7 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i) PyObject *ret; npy_intp ret_ndim; - npy_intp niter, innerloopsize, innerstride; + npy_intp nop, innerloopsize, innerstride; char *dataptr; PyArray_Descr *dtype; @@ -2012,8 +2008,8 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i) return NULL; } - niter = NpyIter_GetNIter(self->iter); - if (i < 0 || i >= niter) { + nop = NpyIter_GetNOp(self->iter); + if (i < 0 || i >= nop) { PyErr_Format(PyExc_IndexError, "Iterator operand index %d is out of bounds", (int)i); return NULL; @@ -2067,7 +2063,7 @@ npyiter_seq_slice(NewNpyArrayIterObject *self, Py_ssize_t ilow, Py_ssize_t ihigh) { PyObject *ret; - npy_intp niter; + npy_intp nop; Py_ssize_t i; if (self->iter == NULL || self->finished) { @@ -2083,18 +2079,18 @@ npyiter_seq_slice(NewNpyArrayIterObject *self, return NULL; } - niter = 
NpyIter_GetNIter(self->iter); + nop = NpyIter_GetNOp(self->iter); if (ilow < 0) { ilow = 0; } - else if (ilow >= niter) { - ilow = niter-1; + else if (ilow >= nop) { + ilow = nop-1; } if (ihigh < ilow) { ihigh = ilow; } - else if (ihigh > niter) { - ihigh = niter; + else if (ihigh > nop) { + ihigh = nop; } ret = PyTuple_New(ihigh-ilow); @@ -2116,7 +2112,7 @@ NPY_NO_EXPORT int npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v) { - npy_intp niter, innerloopsize, innerstride; + npy_intp nop, innerloopsize, innerstride; char *dataptr; PyArray_Descr *dtype; PyArrayObject *tmp; @@ -2141,8 +2137,8 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v) return -1; } - niter = NpyIter_GetNIter(self->iter); - if (i < 0 || i >= niter) { + nop = NpyIter_GetNOp(self->iter); + if (i < 0 || i >= nop) { PyErr_Format(PyExc_IndexError, "Iterator operand index %d is out of bounds", (int)i); return -1; @@ -2184,7 +2180,7 @@ static int npyiter_seq_ass_slice(NewNpyArrayIterObject *self, Py_ssize_t ilow, Py_ssize_t ihigh, PyObject *v) { - npy_intp niter; + npy_intp nop; Py_ssize_t i; if (v == NULL) { @@ -2206,18 +2202,18 @@ npyiter_seq_ass_slice(NewNpyArrayIterObject *self, Py_ssize_t ilow, return -1; } - niter = NpyIter_GetNIter(self->iter); + nop = NpyIter_GetNOp(self->iter); if (ilow < 0) { ilow = 0; } - else if (ilow >= niter) { - ilow = niter-1; + else if (ilow >= nop) { + ilow = nop-1; } if (ihigh < ilow) { ihigh = ilow; } - else if (ihigh > niter) { - ihigh = niter; + else if (ihigh > nop) { + ihigh = nop; } if (!PySequence_Check(v) || PySequence_Size(v) != ihigh-ilow) { @@ -2268,7 +2264,7 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op) else if (PySlice_Check(op)) { Py_ssize_t istart = 0, iend = 0, istep = 0; if (PySlice_GetIndices((PySliceObject *)op, - NpyIter_GetNIter(self->iter), + NpyIter_GetNOp(self->iter), &istart, &iend, &istep) < 0) { return NULL; } @@ -2313,7 +2309,7 @@ 
npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op, else if (PySlice_Check(op)) { Py_ssize_t istart = 0, iend = 0, istep = 0; if (PySlice_GetIndices((PySliceObject *)op, - NpyIter_GetNIter(self->iter), + NpyIter_GetNOp(self->iter), &istart, &iend, &istep) < 0) { return -1; } @@ -2395,8 +2391,8 @@ static PyGetSetDef npyiter_getsets[] = { {"ndim", (getter)npyiter_ndim_get, NULL, NULL, NULL}, - {"niter", - (getter)npyiter_niter_get, + {"nop", + (getter)npyiter_nop_get, NULL, NULL, NULL}, {"itersize", (getter)npyiter_itersize_get, diff --git a/numpy/core/src/multiarray/nditer_pywrap.h b/numpy/core/src/multiarray/nditer_pywrap.h index 35e32254175e..49eb5d89de00 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.h +++ b/numpy/core/src/multiarray/nditer_pywrap.h @@ -1,5 +1,5 @@ -#ifndef __NEW_ITERATOR_PYWRAP_H -#define __NEW_ITERATOR_PYWRAP_H +#ifndef __NDITER_PYWRAP_H +#define __NDITER_PYWRAP_H NPY_NO_EXPORT PyObject * NpyIter_NestedIters(PyObject *NPY_UNUSED(self), diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index eff71d12d1d4..8a0da6bf6cab 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -1217,12 +1217,11 @@ gentype_byteswap(PyObject *self, PyObject *args) else { /* get the data, copyswap it and pass it to a new Array scalar */ char *data; - int numbytes; PyArray_Descr *descr; PyObject *new; char *newmem; - numbytes = gentype_getreadbuf(self, 0, (void **)&data); + gentype_getreadbuf(self, 0, (void **)&data); descr = PyArray_DescrFromScalar(self); newmem = _pya_malloc(descr->elsize); if (newmem == NULL) { @@ -3275,21 +3274,21 @@ NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = { * type numbers. Note that signed integers are mapped to INTNEG_SCALAR, * which is different than what PyArray_ScalarKind returns. 
*/ -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_scalar_kinds_table[NPY_NTYPES]; /* * This table maps a scalar kind (excluding NPY_NOSCALAR) * to the smallest type number of that kind. */ -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_smallest_type_of_kind_table[NPY_NSCALARKINDS]; /* * This table gives the type of the same kind, but next in the sequence * of sizes. */ -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_next_larger_type_table[NPY_NTYPES]; /* @@ -3303,7 +3302,7 @@ _npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES]; * This table gives the smallest-size and smallest-kind type to which * the input types may be safely cast, according to _npy_can_cast_safely. */ -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES]; #endif diff --git a/numpy/core/src/multiarray/scalartypes.h b/numpy/core/src/multiarray/scalartypes.h index 53850947a98b..7397a97e0ff7 100644 --- a/numpy/core/src/multiarray/scalartypes.h +++ b/numpy/core/src/multiarray/scalartypes.h @@ -5,24 +5,24 @@ #ifdef NPY_ENABLE_SEPARATE_COMPILATION extern NPY_NO_EXPORT unsigned char _npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES]; -extern NPY_NO_EXPORT char +extern NPY_NO_EXPORT signed char _npy_scalar_kinds_table[NPY_NTYPES]; -extern NPY_NO_EXPORT char +extern NPY_NO_EXPORT signed char _npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES]; -extern NPY_NO_EXPORT char +extern NPY_NO_EXPORT signed char _npy_smallest_type_of_kind_table[NPY_NSCALARKINDS]; -extern NPY_NO_EXPORT char +extern NPY_NO_EXPORT signed char _npy_next_larger_type_table[NPY_NTYPES]; #else NPY_NO_EXPORT unsigned char _npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES]; -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_scalar_kinds_table[NPY_NTYPES]; -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES]; -NPY_NO_EXPORT char +NPY_NO_EXPORT signed char _npy_smallest_type_of_kind_table[NPY_NSCALARKINDS]; -NPY_NO_EXPORT char +NPY_NO_EXPORT 
signed char _npy_next_larger_type_table[NPY_NTYPES]; #endif diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c index dd2ea48eb912..e8865d065278 100644 --- a/numpy/core/src/multiarray/sequence.c +++ b/numpy/core/src/multiarray/sequence.c @@ -72,6 +72,13 @@ array_slice(PyArrayObject *self, Py_ssize_t ilow, self->nd, self->dimensions, self->strides, data, self->flags, (PyObject *)self); + /* DISTNUMPY */ + if(PyDistArray_IsDist(self)) + { + dndslice slice = {ilow, 1, self->dimensions[0]}; + if(PyDistArray_NewViewArray(self, r, 1, &slice) == -1) + return NULL; + } self->dimensions[0] = l; if (r == NULL) { return NULL; diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 59722cef48c6..930c91ca1d03 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -442,7 +442,11 @@ _extract_pyvals(PyObject *ref, char *name, int *bufsize, -/*UFUNC_API*/ +/*UFUNC_API + * + * On return, if errobj is populated with a non-NULL value, the caller + * owns a new reference to errobj. + */ NPY_NO_EXPORT int PyUFunc_GetPyValues(char *name, int *bufsize, int *errmask, PyObject **errobj) { @@ -970,13 +974,13 @@ ufunc_loop_matches(PyUFuncObject *self, NPY_CASTING input_casting, NPY_CASTING output_casting, int any_object, - int all_inputs_scalar, + int use_min_scalar, int *types, int *out_no_castable_output, char *out_err_src_typecode, char *out_err_dst_typecode) { - npy_intp i, nin = self->nin, niter = nin + self->nout; + npy_intp i, nin = self->nin, nop = nin + self->nout; /* * First check if all the inputs can be safely cast @@ -1014,7 +1018,7 @@ ufunc_loop_matches(PyUFuncObject *self, * If all the inputs are scalars, use the regular * promotion rules, not the special value-checking ones. 
*/ - if (all_inputs_scalar) { + if (!use_min_scalar) { if (!PyArray_CanCastTypeTo(PyArray_DESCR(op[i]), tmp, input_casting)) { Py_DECREF(tmp); @@ -1035,7 +1039,7 @@ ufunc_loop_matches(PyUFuncObject *self, * If all the inputs were ok, then check casting back to the * outputs. */ - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { if (op[i] != NULL) { PyArray_Descr *tmp = PyArray_DescrFromType(types[i]); if (tmp == NULL) { @@ -1065,11 +1069,11 @@ set_ufunc_loop_data_types(PyUFuncObject *self, PyArrayObject **op, int *types, npy_intp buffersize, int *out_trivial_loop_ok) { - npy_intp i, nin = self->nin, niter = nin + self->nout; + npy_intp i, nin = self->nin, nop = nin + self->nout; *out_trivial_loop_ok = 1; /* Fill the dtypes array */ - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { out_dtype[i] = PyArray_DescrFromType(types[i]); if (out_dtype[i] == NULL) { return -1; @@ -1119,7 +1123,7 @@ find_ufunc_matching_userloop(PyUFuncObject *self, NPY_CASTING output_casting, npy_intp buffersize, int any_object, - int all_inputs_scalar, + int use_min_scalar, PyArray_Descr **out_dtype, PyUFuncGenericFunction *out_innerloop, void **out_innerloopdata, @@ -1155,7 +1159,7 @@ find_ufunc_matching_userloop(PyUFuncObject *self, int *types = funcdata->arg_types; switch (ufunc_loop_matches(self, op, input_casting, output_casting, - any_object, all_inputs_scalar, + any_object, use_min_scalar, types, out_no_castable_output, out_err_src_typecode, out_err_dst_typecode)) { @@ -1197,13 +1201,13 @@ find_ufunc_specified_userloop(PyUFuncObject *self, NPY_CASTING casting, npy_intp buffersize, int any_object, - int all_inputs_scalar, + int use_min_scalar, PyArray_Descr **out_dtype, PyUFuncGenericFunction *out_innerloop, void **out_innerloopdata, int *out_trivial_loop_ok) { - npy_intp i, j, nin = self->nin, niter = nin + self->nout; + npy_intp i, j, nin = self->nin, nop = nin + self->nout; PyUFunc_Loop1d *funcdata; /* Use this to try to avoid repeating the same userdef 
loop search */ @@ -1233,8 +1237,8 @@ find_ufunc_specified_userloop(PyUFuncObject *self, int *types = funcdata->arg_types; int matched = 1; - if (n_specified == niter) { - for (j = 0; j < niter; ++j) { + if (n_specified == nop) { + for (j = 0; j < nop; ++j) { if (types[j] != specified_types[j]) { matched = 0; break; @@ -1251,7 +1255,7 @@ find_ufunc_specified_userloop(PyUFuncObject *self, switch (ufunc_loop_matches(self, op, casting, casting, - any_object, all_inputs_scalar, + any_object, use_min_scalar, types, &no_castable_output, &err_src_typecode, &err_dst_typecode)) { @@ -1291,6 +1295,74 @@ find_ufunc_specified_userloop(PyUFuncObject *self, return 0; } +/* + * Provides an ordering for the dtype 'kind' character codes, to help + * determine when to use the min_scalar_type function. This groups + * 'kind' into boolean, integer, floating point, and everything else. + */ + +static int +dtype_kind_to_simplified_ordering(char kind) +{ + switch (kind) { + /* Boolean kind */ + case 'b': + return 0; + /* Unsigned int kind */ + case 'u': + /* Signed int kind */ + case 'i': + return 1; + /* Float kind */ + case 'f': + /* Complex kind */ + case 'c': + return 2; + /* Anything else */ + default: + return 3; + } +} + +static int +should_use_min_scalar(PyArrayObject **op, int nop) +{ + int i, use_min_scalar, kind; + int all_scalars = 1, max_scalar_kind = -1, max_array_kind = -1; + + /* + * Determine if there are any scalars, and if so, whether + * the maximum "kind" of the scalars surpasses the maximum + * "kind" of the arrays + */ + use_min_scalar = 0; + if (nop > 1) { + for(i = 0; i < nop; ++i) { + kind = dtype_kind_to_simplified_ordering( + PyArray_DESCR(op[i])->kind); + if (PyArray_NDIM(op[i]) == 0) { + if (kind > max_scalar_kind) { + max_scalar_kind = kind; + } + } + else { + all_scalars = 0; + if (kind > max_array_kind) { + max_array_kind = kind; + } + + } + } + + /* Indicate whether to use the min_scalar_type function */ + if (!all_scalars && max_array_kind >= 
max_scalar_kind) { + use_min_scalar = 1; + } + } + + return use_min_scalar; +} + /* * Does a linear search for the best inner loop of the ufunc. * When op[i] is a scalar or a one dimensional array smaller than @@ -1313,29 +1385,23 @@ find_best_ufunc_inner_loop(PyUFuncObject *self, void **out_innerloopdata, int *out_trivial_loop_ok) { - npy_intp i, j, nin = self->nin, niter = nin + self->nout; + npy_intp i, j, nin = self->nin, nop = nin + self->nout; int types[NPY_MAXARGS]; char *ufunc_name; - int no_castable_output, all_inputs_scalar; + int no_castable_output, use_min_scalar; /* For making a better error message on coercion error */ char err_dst_typecode = '-', err_src_typecode = '-'; ufunc_name = self->name ? self->name : "(unknown)"; - /* Check whether all the inputs are scalar */ - all_inputs_scalar = 1; - for(i = 0; i < nin; ++i) { - if (PyArray_NDIM(op[i]) > 0) { - all_inputs_scalar = 0; - } - } + use_min_scalar = should_use_min_scalar(op, nin); /* If the ufunc has userloops, search for them. 
*/ if (self->userloops) { switch (find_ufunc_matching_userloop(self, op, input_casting, output_casting, - buffersize, any_object, all_inputs_scalar, + buffersize, any_object, use_min_scalar, out_dtype, out_innerloop, out_innerloopdata, out_trivial_loop_ok, &no_castable_output, &err_src_typecode, @@ -1370,14 +1436,14 @@ find_best_ufunc_inner_loop(PyUFuncObject *self, char *orig_types = self->types + i*self->nargs; /* Copy the types into an int array for matching */ - for (j = 0; j < niter; ++j) { + for (j = 0; j < nop; ++j) { types[j] = orig_types[j]; } NPY_UF_DBG_PRINT1("Trying function loop %d\n", (int)i); switch (ufunc_loop_matches(self, op, input_casting, output_casting, - any_object, all_inputs_scalar, + any_object, use_min_scalar, types, &no_castable_output, &err_src_typecode, &err_dst_typecode)) { @@ -1448,32 +1514,26 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, void **out_innerloopdata, int *out_trivial_loop_ok) { - npy_intp i, j, n, nin = self->nin, niter = nin + self->nout; + npy_intp i, j, n, nin = self->nin, nop = nin + self->nout; int n_specified = 0; int specified_types[NPY_MAXARGS], types[NPY_MAXARGS]; char *ufunc_name; - int no_castable_output, all_inputs_scalar; + int no_castable_output, use_min_scalar; /* For making a better error message on coercion error */ char err_dst_typecode = '-', err_src_typecode = '-'; ufunc_name = self->name ? self->name : "(unknown)"; - /* Check whether all the inputs are scalar */ - all_inputs_scalar = 1; - for(i = 0; i < nin; ++i) { - if (PyArray_NDIM(op[i]) > 0) { - all_inputs_scalar = 0; - } - } + use_min_scalar = should_use_min_scalar(op, nin); /* Fill in specified_types from the tuple or string */ if (PyTuple_Check(type_tup)) { n = PyTuple_GET_SIZE(type_tup); - if (n != 1 && n != niter) { + if (n != 1 && n != nop) { PyErr_Format(PyExc_ValueError, "a type-tuple must be specified " \ - "of length 1 or %d for ufunc '%s'", (int)niter, + "of length 1 or %d for ufunc '%s'", (int)nop, self->name ? 
self->name : "(unknown)"); return -1; } @@ -1507,7 +1567,7 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, Py_XDECREF(str_obj); return -1; } - if (length != 1 && (length != niter + 2 || + if (length != 1 && (length != nop + 2 || str[nin] != '-' || str[nin+1] != '>')) { PyErr_Format(PyExc_ValueError, "a type-string for %s, " \ @@ -1534,9 +1594,9 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, } else { PyArray_Descr *dtype; - n_specified = (int)niter; + n_specified = (int)nop; - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { npy_intp istr = i < nin ? i : i+2; dtype = PyArray_DescrFromType(str[istr]); @@ -1559,7 +1619,7 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, switch (find_ufunc_specified_userloop(self, n_specified, specified_types, op, casting, - buffersize, any_object, all_inputs_scalar, + buffersize, any_object, use_min_scalar, out_dtype, out_innerloop, out_innerloopdata, out_trivial_loop_ok)) { /* Error */ @@ -1579,12 +1639,12 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, NPY_UF_DBG_PRINT1("Trying function loop %d\n", (int)i); /* Copy the types into an int array for matching */ - for (j = 0; j < niter; ++j) { + for (j = 0; j < nop; ++j) { types[j] = orig_types[j]; } - if (n_specified == niter) { - for (j = 0; j < niter; ++j) { + if (n_specified == nop) { + for (j = 0; j < nop; ++j) { if (types[j] != specified_types[j]) { matched = 0; break; @@ -1604,7 +1664,7 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self, NPY_UF_DBG_PRINT("It matches, confirming type casting\n"); switch (ufunc_loop_matches(self, op, casting, casting, - any_object, all_inputs_scalar, + any_object, use_min_scalar, types, &no_castable_output, &err_src_typecode, &err_dst_typecode)) { @@ -1783,7 +1843,7 @@ iterator_loop(PyUFuncObject *self, void *innerloopdata) { npy_intp i, nin = self->nin, nout = self->nout; - npy_intp niter = nin + nout; + npy_intp nop = nin + nout; npy_uint32 op_flags[NPY_MAXARGS]; NpyIter *iter; char 
*baseptrs[NPY_MAXARGS]; @@ -1803,7 +1863,7 @@ iterator_loop(PyUFuncObject *self, op_flags[i] = NPY_ITER_READONLY| NPY_ITER_ALIGNED; } - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { op_flags[i] = NPY_ITER_WRITEONLY| NPY_ITER_ALIGNED| NPY_ITER_ALLOCATE| @@ -1816,7 +1876,7 @@ iterator_loop(PyUFuncObject *self, * were already checked, we use the casting rule 'unsafe' which * is faster to calculate. */ - iter = NpyIter_AdvancedNew(niter, op, + iter = NpyIter_AdvancedNew(nop, op, NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK| NPY_ITER_ZEROSIZE_OK| @@ -1834,7 +1894,7 @@ iterator_loop(PyUFuncObject *self, /* Copy any allocated outputs */ op_it = NpyIter_GetOperandArray(iter); - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { if (op[i] == NULL) { op[i] = op_it[i]; Py_INCREF(op[i]); @@ -1857,7 +1917,7 @@ iterator_loop(PyUFuncObject *self, for (i = 0; i < nin; ++i) { baseptrs[i] = PyArray_BYTES(op_it[i]); } - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { baseptrs[i] = PyArray_BYTES(op[i]); } if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) { @@ -2074,7 +2134,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, PyArrayObject **op) { int nin, nout; - int i, idim, niter; + int i, idim, nop; char *ufunc_name; int retval = -1, any_object = 0, subok = 1; NPY_CASTING input_casting; @@ -2116,7 +2176,6 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, int trivial_loop_ok = 0; - /* TODO: For 1.6, the default should probably be NPY_CORDER */ NPY_ORDER order = NPY_KEEPORDER; /* * Many things in NumPy do unsafe casting (doing int += float, etc). @@ -2134,14 +2193,14 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, nin = self->nin; nout = self->nout; - niter = nin + nout; + nop = nin + nout; ufunc_name = self->name ? 
self->name : ""; NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s\n", ufunc_name); /* Initialize all the operands and dtypes to NULL */ - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { op[i] = NULL; dtype[i] = NULL; arr_prep[i] = NULL; @@ -2176,7 +2235,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, /* Fill in op_axes for all the operands */ core_dim_ixs_size = 0; core_dim_ixs = self->core_dim_ixs; - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { int n; if (op[i]) { /* @@ -2281,7 +2340,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, printf(" "); } printf("\noutput types:\n"); - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { PyObject_Print((PyObject *)dtype[i], stdout, 0); printf(" "); } @@ -2318,7 +2377,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, NPY_ITER_COPY| NPY_ITER_ALIGNED; } - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { op_flags[i] = NPY_ITER_READWRITE| NPY_ITER_UPDATEIFCOPY| NPY_ITER_ALIGNED| @@ -2327,7 +2386,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, } /* Create the iterator */ - iter = NpyIter_AdvancedNew(niter, op, NPY_ITER_MULTI_INDEX| + iter = NpyIter_AdvancedNew(nop, op, NPY_ITER_MULTI_INDEX| NPY_ITER_REFS_OK| NPY_ITER_REDUCE_OK, order, NPY_UNSAFE_CASTING, op_flags, @@ -2338,7 +2397,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, } /* Fill in any allocated outputs */ - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { if (op[i] == NULL) { op[i] = NpyIter_GetOperandArray(iter)[i]; Py_INCREF(op[i]); @@ -2350,10 +2409,10 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, * buffering, the strides are fixed throughout the looping. 
*/ inner_strides = (npy_intp *)_pya_malloc( - NPY_SIZEOF_INTP * (niter+core_dim_ixs_size)); - /* The strides after the first niter match core_dim_ixs */ + NPY_SIZEOF_INTP * (nop+core_dim_ixs_size)); + /* The strides after the first nop match core_dim_ixs */ core_dim_ixs = self->core_dim_ixs; - inner_strides_tmp = inner_strides + niter; + inner_strides_tmp = inner_strides + nop; for (idim = 0; idim < self->core_num_dim_ix; ++idim) { ax_strides_tmp[idim] = NpyIter_GetAxisStrideArray(iter, broadcast_ndim+idim); @@ -2362,7 +2421,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, goto fail; } } - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { for (idim = 0; idim < self->core_num_dims[i]; ++idim) { inner_strides_tmp[idim] = ax_strides_tmp[core_dim_ixs[idim]][i]; } @@ -2397,15 +2456,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, } /* - * The first niter strides are for the inner loop (but only can + * The first nop strides are for the inner loop (but only can * copy them after removing the core axes */ memcpy(inner_strides, NpyIter_GetInnerStrideArray(iter), - NPY_SIZEOF_INTP * niter); + NPY_SIZEOF_INTP * nop); #if 0 printf("strides: "); - for (i = 0; i < niter+core_dim_ixs_size; ++i) { + for (i = 0; i < nop+core_dim_ixs_size; ++i) { printf("%d ", (int)inner_strides[i]); } printf("\n"); @@ -2448,7 +2507,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, _pya_free(inner_strides); NpyIter_Deallocate(iter); /* The caller takes ownership of all the references in op */ - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { Py_XDECREF(dtype[i]); Py_XDECREF(arr_prep[i]); } @@ -2468,7 +2527,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self, if (iter != NULL) { NpyIter_Deallocate(iter); } - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { Py_XDECREF(op[i]); op[i] = NULL; Py_XDECREF(dtype[i]); @@ -2492,7 +2551,7 @@ PyUFunc_GenericFunction(PyUFuncObject *self, PyArrayObject **op) { int nin, nout; - int i, niter; + int i, nop; char 
*ufunc_name; int retval = -1, any_object = 0, subok = 1; NPY_CASTING input_casting; @@ -2541,14 +2600,14 @@ PyUFunc_GenericFunction(PyUFuncObject *self, nin = self->nin; nout = self->nout; - niter = nin + nout; + nop = nin + nout; ufunc_name = self->name ? self->name : ""; NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s\n", ufunc_name); /* Initialize all the operands and dtypes to NULL */ - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { op[i] = NULL; dtype[i] = NULL; arr_prep[i] = NULL; @@ -2629,7 +2688,7 @@ PyUFunc_GenericFunction(PyUFuncObject *self, printf(" "); } printf("\noutput types:\n"); - for (i = nin; i < niter; ++i) { + for (i = nin; i < nop; ++i) { PyObject_Print((PyObject *)dtype[i], stdout, 0); printf(" "); } @@ -2678,7 +2737,7 @@ PyUFunc_GenericFunction(PyUFuncObject *self, } /* The caller takes ownership of all the references in op */ - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { Py_XDECREF(dtype[i]); Py_XDECREF(arr_prep[i]); } @@ -2692,7 +2751,7 @@ PyUFunc_GenericFunction(PyUFuncObject *self, fail: NPY_UF_DBG_PRINT1("Returning failure code %d\n", retval); - for (i = 0; i < niter; ++i) { + for (i = 0; i < nop; ++i) { Py_XDECREF(op[i]); op[i] = NULL; Py_XDECREF(dtype[i]); @@ -3087,8 +3146,6 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, NpyIter_IterNextFunc *iternext; char **dataptr; - npy_intp *stride; - npy_intp *count_ptr; int itemsize = op_dtypes[0]->elsize; @@ -3098,8 +3155,6 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, goto fail; } dataptr = NpyIter_GetDataPtrArray(iter); - stride = NpyIter_GetInnerStrideArray(iter); - count_ptr = NpyIter_GetInnerLoopSizePtr(iter); /* Execute the loop with two nested iterators */ @@ -3383,6 +3438,9 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, if (iter_inner != NULL) { NpyIter_Deallocate(iter_inner); } + + Py_XDECREF(errobj); + return (PyObject *)out; fail: @@ -3634,8 +3692,6 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, 
PyArrayObject *ind, char **dataptr; npy_intp count_m1; npy_intp stride0, stride1; - npy_intp *stride; - npy_intp *count_ptr; npy_intp stride0_ind = PyArray_STRIDE(op[0], axis); int itemsize = op_dtypes[0]->elsize; @@ -3646,9 +3702,6 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind, goto fail; } dataptr = NpyIter_GetDataPtrArray(iter); - stride = NpyIter_GetInnerStrideArray(iter); - count_ptr = NpyIter_GetInnerLoopSizePtr(iter); - /* Execute the loop with just the outer iterator */ count_m1 = PyArray_DIM(op[1], axis)-1; @@ -3771,6 +3824,9 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind, if (iter != NULL) { NpyIter_Deallocate(iter); } + + Py_XDECREF(errobj); + return (PyObject *)out; fail: @@ -4410,6 +4466,7 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data, self->core_signature = NULL; if (signature != NULL) { if (_parse_signature(self, signature) != 0) { + Py_DECREF(self); return NULL; } } diff --git a/numpy/core/src/umath/umath_tests.c.src b/numpy/core/src/umath/umath_tests.c.src index 8294220903b3..cb1d541f5c16 100644 --- a/numpy/core/src/umath/umath_tests.c.src +++ b/numpy/core/src/umath/umath_tests.c.src @@ -271,6 +271,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args) } if (f == NULL) return NULL; core_enabled = ((PyUFuncObject*)f)->core_enabled; + Py_DECREF(f); return Py_BuildValue("i", core_enabled); } diff --git a/numpy/core/src/umath/umathmodule.c.src b/numpy/core/src/umath/umathmodule.c.src index 7a76c2b3ea30..8d081f85b239 100644 --- a/numpy/core/src/umath/umathmodule.c.src +++ b/numpy/core/src/umath/umathmodule.c.src @@ -94,7 +94,6 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS fname_len = 1; PyErr_Clear(); } - Py_XDECREF(pyname); /* * self->ptr holds a pointer for enough memory for @@ -119,6 +118,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS self->ptr = 
_pya_malloc(offset[0] + offset[1] + sizeof(void *) + (fname_len + 14)); if (self->ptr == NULL) { + Py_XDECREF(pyname); return PyErr_NoMemory(); } Py_INCREF(function); @@ -139,6 +139,8 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS memcpy(str+fname_len, " (vectorized)", 14); self->name = str; + Py_XDECREF(pyname); + /* Do a better job someday */ self->doc = "dynamic ufunc based on a python function"; diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py new file mode 100644 index 000000000000..255ef45655c8 --- /dev/null +++ b/numpy/core/tests/test_api.py @@ -0,0 +1,28 @@ +import sys + +import numpy as np +from numpy.testing import * +from numpy.testing.utils import WarningManager +import warnings + +def test_fastCopyAndTranspose(): + # 0D array + a = np.array(2) + b = np.fastCopyAndTranspose(a) + assert_equal(b, a.T) + assert_(b.flags.owndata) + + # 1D array + a = np.array([3,2,7,0]) + b = np.fastCopyAndTranspose(a) + assert_equal(b, a.T) + assert_(b.flags.owndata) + + # 2D array + a = np.arange(6).reshape(2,3) + b = np.fastCopyAndTranspose(a) + assert_equal(b, a.T) + assert_(b.flags.owndata) + +if __name__ == "__main__": + run_module_suite() diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py index 954869727eda..c7b69a09f5ed 100644 --- a/numpy/core/tests/test_arrayprint.py +++ b/numpy/core/tests/test_arrayprint.py @@ -6,5 +6,52 @@ def test_nan_inf(self): x = np.array([np.nan, np.inf]) assert_equal(repr(x), 'array([ nan, inf])') +class TestComplexArray(TestCase): + def test_str(self): + rvals = [0, 1, -1, np.inf, -np.inf, np.nan] + cvals = [complex(rp, ip) for rp in rvals for ip in rvals] + dtypes = [np.complex64, np.cdouble, np.clongdouble] + actual = [str(np.array([c], dt)) for c in cvals for dt in dtypes] + wanted = [ + '[ 0.+0.j]', '[ 0.+0.j]', '[ 0.0+0.0j]', + '[ 0.+1.j]', '[ 0.+1.j]', '[ 0.0+1.0j]', + '[ 0.-1.j]', '[ 0.-1.j]', '[ 0.0-1.0j]', + '[ 0.+infj]', '[ 0.+infj]', 
'[ 0.0+infj]', + '[ 0.-infj]', '[ 0.-infj]', '[ 0.0-infj]', + '[ 0.+nanj]', '[ 0.+nanj]', '[ 0.0+nanj]', + '[ 1.+0.j]', '[ 1.+0.j]', '[ 1.0+0.0j]', + '[ 1.+1.j]', '[ 1.+1.j]', '[ 1.0+1.0j]', + '[ 1.-1.j]', '[ 1.-1.j]', '[ 1.0-1.0j]', + '[ 1.+infj]', '[ 1.+infj]', '[ 1.0+infj]', + '[ 1.-infj]', '[ 1.-infj]', '[ 1.0-infj]', + '[ 1.+nanj]', '[ 1.+nanj]', '[ 1.0+nanj]', + '[-1.+0.j]', '[-1.+0.j]', '[-1.0+0.0j]', + '[-1.+1.j]', '[-1.+1.j]', '[-1.0+1.0j]', + '[-1.-1.j]', '[-1.-1.j]', '[-1.0-1.0j]', + '[-1.+infj]', '[-1.+infj]', '[-1.0+infj]', + '[-1.-infj]', '[-1.-infj]', '[-1.0-infj]', + '[-1.+nanj]', '[-1.+nanj]', '[-1.0+nanj]', + '[ inf+0.j]', '[ inf+0.j]', '[ inf+0.0j]', + '[ inf+1.j]', '[ inf+1.j]', '[ inf+1.0j]', + '[ inf-1.j]', '[ inf-1.j]', '[ inf-1.0j]', + '[ inf+infj]', '[ inf+infj]', '[ inf+infj]', + '[ inf-infj]', '[ inf-infj]', '[ inf-infj]', + '[ inf+nanj]', '[ inf+nanj]', '[ inf+nanj]', + '[-inf+0.j]', '[-inf+0.j]', '[-inf+0.0j]', + '[-inf+1.j]', '[-inf+1.j]', '[-inf+1.0j]', + '[-inf-1.j]', '[-inf-1.j]', '[-inf-1.0j]', + '[-inf+infj]', '[-inf+infj]', '[-inf+infj]', + '[-inf-infj]', '[-inf-infj]', '[-inf-infj]', + '[-inf+nanj]', '[-inf+nanj]', '[-inf+nanj]', + '[ nan+0.j]', '[ nan+0.j]', '[ nan+0.0j]', + '[ nan+1.j]', '[ nan+1.j]', '[ nan+1.0j]', + '[ nan-1.j]', '[ nan-1.j]', '[ nan-1.0j]', + '[ nan+infj]', '[ nan+infj]', '[ nan+infj]', + '[ nan-infj]', '[ nan-infj]', '[ nan-infj]', + '[ nan+nanj]', '[ nan+nanj]', '[ nan+nanj]'] + + for res, val in zip(actual, wanted): + assert_(res == val) + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py index 7f29889c2a70..d0161b366c78 100644 --- a/numpy/core/tests/test_einsum.py +++ b/numpy/core/tests/test_einsum.py @@ -421,6 +421,12 @@ def check_einsum_sums(self, dtype): assert_equal(b, np.sum(a)) assert_equal(b.dtype, np.dtype(dtype)) + # A case which was failing (ticket #1885) + p = np.arange(2) + 1 + q = np.arange(4).reshape(2,2) + 3 + r = 
np.arange(4).reshape(2,2) + 7 + assert_equal(np.einsum('z,mz,zm->', p, q, r), 253) + def test_einsum_sums_int8(self): self.check_einsum_sums('i1'); @@ -466,5 +472,18 @@ def test_einsum_sums_cfloat128(self): def test_einsum_sums_clongdouble(self): self.check_einsum_sums(np.clongdouble); + def test_einsum_misc(self): + # This call used to crash because of a bug in + # PyArray_FillWithZero + a = np.ones((1,2)) + b = np.ones((2,2,1)) + assert_equal(np.einsum('ij...,j...->i...',a,b), [[[2],[2]]]) + + # The iterator had an issue with buffering this reduction + a = np.ones((5, 12, 4, 2, 3), np.int64) + b = np.ones((5, 12, 11), np.int64) + assert_equal(np.einsum('ijklm,ijn,ijn->',a,b,b), + np.einsum('ijklm,ijn->',a,b)) + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_iterator.py b/numpy/core/tests/test_iterator.py index e5a073e12ef3..25194c9be377 100644 --- a/numpy/core/tests/test_iterator.py +++ b/numpy/core/tests/test_iterator.py @@ -1666,12 +1666,15 @@ def test_iter_buffered_cast_structured_type(): # object -> struct type sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2,3)), ('d', 'O')] - a = np.arange(3, dtype='O') + 0.5 + a = np.zeros((3,), dtype='O') + a[0] = (0.5,0.5,[[0.5,0.5,0.5],[0.5,0.5,0.5]],0.5) + a[1] = (1.5,1.5,[[1.5,1.5,1.5],[1.5,1.5,1.5]],1.5) + a[2] = (2.5,2.5,[[2.5,2.5,2.5],[2.5,2.5,2.5]],2.5) rc = sys.getrefcount(a[0]) i = nditer(a, ['buffered','refs_ok'], ['readonly'], casting='unsafe', op_dtypes=sdt) - vals = [np.array(x) for x in i] + vals = [x.copy() for x in i] assert_equal(vals[0]['a'], 0.5) assert_equal(vals[0]['b'], 0) assert_equal(vals[0]['c'], [[(0.5)]*3]*2) @@ -2218,6 +2221,25 @@ def test_iter_reduction(): assert_equal(i.operands[1].ndim, 0) assert_equal(i.operands[1], np.sum(a)) + # This is a tricky reduction case for the buffering double loop + # to handle + a = np.ones((2,3,5)) + it1 = nditer([a,None], ['reduce_ok','external_loop'], + [['readonly'], ['readwrite','allocate']], + op_axes=[None,[0,-1,1]]) + 
it2 = nditer([a,None], ['reduce_ok','external_loop', + 'buffered','delay_bufalloc'], + [['readonly'], ['readwrite','allocate']], + op_axes=[None,[0,-1,1]], buffersize=10) + it1.operands[1].fill(0) + it2.operands[1].fill(0) + it2.reset() + for x in it1: + x[1][...] += x[0] + for x in it2: + x[1][...] += x[0] + assert_equal(it1.operands[1], it2.operands[1]) + assert_equal(it2.operands[1].sum(), a.size) def test_iter_buffering_reduction(): # Test doing buffered reductions with the iterator @@ -2248,5 +2270,16 @@ def test_iter_buffering_reduction(): y[...] += x assert_equal(b, np.sum(a, axis=1)) + # Iterator inner double loop was wrong on this one + p = np.arange(2) + 1 + it = np.nditer([p,None], + ['delay_bufalloc','reduce_ok','buffered','external_loop'], + [['readonly'],['readwrite','allocate']], + op_axes=[[-1,0],[-1,-1]], + itershape=(2,2)) + it.operands[1].fill(0) + it.reset() + assert_equal(it[0], [1,2,1,2]) + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 811fb33ab4e4..dbf8618bb146 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -1020,7 +1020,7 @@ def test_max_or_min(self): assert np.all(x <= 4) -class TestPutmask(TestCase): +class TestPutmask: def tst_basic(self,x,T,mask,val): np.putmask(x,mask,val) assert np.all(x[mask] == T(val)) @@ -1039,8 +1039,7 @@ def test_ip_types(self): yield self.tst_basic,x.copy().astype(T),T,mask,val def test_mask_size(self): - self.assertRaises(ValueError, np.putmask, - np.array([1,2,3]), [True], 5) + assert_raises(ValueError, np.putmask, np.array([1,2,3]), [True], 5) def tst_byteorder(self,dtype): x = np.array([1,2,3],dtype) @@ -1067,7 +1066,7 @@ def test_masked_array(self): pass -class TestTake(TestCase): +class TestTake: def tst_basic(self,x): ind = range(x.shape[0]) assert_array_equal(x.take(ind, axis=0), x) @@ -1085,8 +1084,8 @@ def test_ip_types(self): def test_raise(self): x = 
np.random.random(24)*100 x.shape = 2,3,4 - self.assertRaises(IndexError, x.take, [0,1,2], axis=0) - self.assertRaises(IndexError, x.take, [-3], axis=0) + assert_raises(IndexError, x.take, [0,1,2], axis=0) + assert_raises(IndexError, x.take, [-3], axis=0) assert_array_equal(x.take([-1], axis=0)[0], x[1]) def test_clip(self): @@ -1149,6 +1148,11 @@ def tearDown(self): os.unlink(self.filename) #tmp_file.close() + def test_bool_fromstring(self): + v = np.array([True,False,True,False], dtype=np.bool_) + y = np.fromstring('1 0 -2.3 0.0', sep=' ', dtype=np.bool_) + assert_array_equal(v, y) + def test_empty_files_binary(self): f = open(self.filename, 'w') f.close() @@ -1251,7 +1255,7 @@ def test_big_binary(self): # check only start and end for speed: assert_((a[:n] == testbytes).all()) assert_((a[-n:] == testbytes).all()) - except MemoryError: + except (MemoryError, ValueError): pass def test_string(self): @@ -1326,7 +1330,7 @@ def test_locale(self): in_foreign_locale(self.test_tofile_format)() -class TestFromBuffer(TestCase): +class TestFromBuffer: def tst_basic(self,buffer,expected,kwargs): assert_array_equal(np.frombuffer(buffer,**kwargs),expected) @@ -1898,6 +1902,17 @@ def VV(n): self._check('^ixxxx', [('f0', 'i'), ('', 'V4')]) self._check('^i7x', [('f0', 'i'), ('', 'V7')]) + def test_native_padding_3(self): + dt = np.dtype([('a', 'b'), ('b', 'i'), ('sub', np.dtype('b,i')), ('c', 'i')], align=True) + self._check("T{b:a:xxxi:b:T{b:f0:=i:f1:}:sub:xxxi:c:}", dt) + + dt = np.dtype([('a', 'b'), ('b', 'i'), ('c', 'b'), ('d', 'b'), ('e', 'b'), ('sub', np.dtype('b,i', align=True))]) + self._check("T{b:a:=i:b:b:c:b:d:b:e:T{b:f0:xxxi:f1:}:sub:}", dt) + + def test_padding_with_array_inside_struct(self): + dt = np.dtype([('a', 'b'), ('b', 'i'), ('c', 'b', (3,)), ('d', 'i')], align=True) + self._check("T{b:a:xxxi:b:3b:c:xi:d:}", dt) + def test_byteorder_inside_struct(self): # The byte order after @T{=i} should be '=', not '@'. 
# Check this by noting the absence of native alignment. @@ -1954,9 +1969,11 @@ def test_roundtrip(self): ('l', 'S4'), ('m', 'U4'), ('n', 'V3'), - ('o', '?')] + ('o', '?'), + ('p', np.half), + ] x = np.array([(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - asbytes('aaaa'), 'bbbb', asbytes('xxx'), True)], + asbytes('aaaa'), 'bbbb', asbytes('xxx'), True, 1.0)], dtype=dt) self._check_roundtrip(x) @@ -1988,6 +2005,25 @@ def test_roundtrip(self): x = np.array([1,2,3], dtype='0],a[1][V>0],a[2][V>0]]) == a[:,V>0]).all() + assert_((array([a[0][V>0],a[1][V>0],a[2][V>0]]) == a[:,V>0]).all()) class TestBinaryRepr(TestCase): @@ -559,55 +632,55 @@ def test_negative(self): class TestArrayComparisons(TestCase): def test_array_equal(self): res = array_equal(array([1,2]), array([1,2])) - assert res - assert type(res) is bool + assert_(res) + assert_(type(res) is bool) res = array_equal(array([1,2]), array([1,2,3])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equal(array([1,2]), array([3,4])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equal(array([1,2]), array([1,3])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) def test_array_equiv(self): res = array_equiv(array([1,2]), array([1,2])) - assert res - assert type(res) is bool + assert_(res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([1,2,3])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([3,4])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([1,3])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equiv(array([1,1]), array([1])) - assert res - assert type(res) is bool + assert_(res) + assert_(type(res) is bool) res = 
array_equiv(array([1,1]), array([[1],[1]])) - assert res - assert type(res) is bool + assert_(res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([2])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([[1],[2]])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) res = array_equiv(array([1,2]), array([[1,2,3],[4,5,6],[7,8,9]])) - assert not res - assert type(res) is bool + assert_(not res) + assert_(type(res) is bool) def assert_array_strict_equal(x, y): assert_array_equal(x, y) # Check flags - assert x.flags == y.flags + assert_(x.flags == y.flags) # check endianness - assert x.dtype.isnative == y.dtype.isnative + assert_(x.dtype.isnative == y.dtype.isnative) class TestClip(TestCase): @@ -647,7 +720,7 @@ def _neg_byteorder(self, a): def _generate_non_native_data(self, n, m): data = randn(n, m) data = self._neg_byteorder(data) - assert not data.dtype.isnative + assert_(not data.dtype.isnative) return data def _generate_int_data(self, n, m): @@ -699,7 +772,7 @@ def test_simple_nonnative(self): a = self._generate_data(self.nr, self.nc) m = -0.5 M = self._neg_byteorder(0.6) - assert not M.dtype.isnative + assert_(not M.dtype.isnative) ac = self.fastclip(a, m, M) act = self.clip(a, m, M) assert_array_equal(ac, act) @@ -727,8 +800,8 @@ def test_clip_non_contig(self): """Test clip for non contiguous native input and native scalar min/max.""" a = self._generate_data(self.nr * 2, self.nc * 3) a = a[::2, ::3] - assert not a.flags['F_CONTIGUOUS'] - assert not a.flags['C_CONTIGUOUS'] + assert_(not a.flags['F_CONTIGUOUS']) + assert_(not a.flags['C_CONTIGUOUS']) ac = self.fastclip(a, -1.6, 1.7) act = self.clip(a, -1.6, 1.7) assert_array_strict_equal(ac, act) @@ -812,8 +885,8 @@ def test_noncontig_inplace(self): """Test non contiguous double input with double scalar min/max in-place.""" a = self._generate_data(self.nr * 2, self.nc * 3) a = 
a[::2, ::3] - assert not a.flags['F_CONTIGUOUS'] - assert not a.flags['C_CONTIGUOUS'] + assert_(not a.flags['F_CONTIGUOUS']) + assert_(not a.flags['C_CONTIGUOUS']) ac = a.copy() m = -0.5 M = 0.6 @@ -883,7 +956,7 @@ def test_type_cast_07(self): m = -0.5 * ones(a.shape) M = 1. a_s = self._neg_byteorder(a) - assert not a_s.dtype.isnative + assert_(not a_s.dtype.isnative) act = a_s.clip(m, M) ac = self.fastclip(a_s, m, M) assert_array_strict_equal(ac, act) @@ -894,7 +967,7 @@ def test_type_cast_08(self): m = -0.5 M = 1. a_s = self._neg_byteorder(a) - assert not a_s.dtype.isnative + assert_(not a_s.dtype.isnative) ac = self.fastclip(a_s, m , M) act = a_s.clip(m, M) assert_array_strict_equal(ac, act) @@ -905,7 +978,7 @@ def test_type_cast_09(self): m = -0.5 * ones(a.shape) M = 1. m_s = self._neg_byteorder(m) - assert not m_s.dtype.isnative + assert_(not m_s.dtype.isnative) ac = self.fastclip(a, m_s , M) act = self.clip(a, m_s, M) assert_array_strict_equal(ac, act) @@ -1030,15 +1103,15 @@ def test_clip_func_takes_out(self): self.assertTrue(a2 is a) -class test_allclose_inf(TestCase): +class TestAllclose: rtol = 1e-5 atol = 1e-8 def tst_allclose(self,x,y): - assert allclose(x,y), "%s and %s not close" % (x,y) + assert_(allclose(x,y), "%s and %s not close" % (x,y)) def tst_not_allclose(self,x,y): - assert not allclose(x,y), "%s and %s shouldn't be close" % (x,y) + assert_(not allclose(x,y), "%s and %s shouldn't be close" % (x,y)) def test_ip_allclose(self): """Parametric test factory.""" @@ -1053,7 +1126,9 @@ def test_ip_allclose(self): ([1], [1+rtol+atol]), (arr, arr + arr*rtol), (arr, arr + arr*rtol + atol*2), - (aran, aran + aran*rtol),] + (aran, aran + aran*rtol), + (inf, inf), + (inf, [inf])] for (x,y) in data: yield (self.tst_allclose,x,y) @@ -1152,6 +1227,8 @@ def check_like_function(self, like_function, value): assert_equal(dz.shape, d.shape) assert_equal(array(dz.strides)*d.dtype.itemsize, array(d.strides)*dz.dtype.itemsize) + assert_equal(d.flags.c_contiguous, 
dz.flags.c_contiguous) + assert_equal(d.flags.f_contiguous, dz.flags.f_contiguous) if dtype is None: assert_equal(dz.dtype, d.dtype) else: diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py index 8a50ca68039a..f96a5452dc91 100644 --- a/numpy/core/tests/test_records.py +++ b/numpy/core/tests/test_records.py @@ -41,6 +41,8 @@ def test_recarray_fromfile(self): fd = open(filename, 'rb') fd.seek(2880 * 2) r = np.rec.fromfile(fd, formats='f8,i4,a5', shape=3, byteorder='big') + fd.seek(2880 * 2) + r = np.rec.array(fd, formats='f8,i4,a5', shape=3, byteorder='big') def test_recarray_from_obj(self): count = 10 @@ -135,16 +137,11 @@ def assign_invalid_column(x): self.assertRaises(AttributeError, assign_invalid_column, a) def test_out_of_order_fields(self): - """Ticket #1431. Current behavior deprecated in numpy 1.5""" + """Ticket #1431.""" x = self.data[['col1', 'col2']] y = self.data[['col2', 'col1']] - # make sure change is applied in 1.6/2.0 - if np.version.short_version[:3] == '1.5': - assert_array_equal(x, y) - elif float(np.version.short_version[:3]) >= 1.6 and np.version.release: - assert_(y[0][0] == 4) + assert_equal(x[0][0], y[0][1]) -warnings.filterwarnings('ignore', message="Out of order field selection on recarrays") def test_find_duplicate(): l1 = [1, 2, 3, 4, 5, 6] diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index 4c02ee60fdbf..9fb0fd4e3438 100644 --- a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -1,6 +1,7 @@ from StringIO import StringIO import pickle import sys +import platform import gc import copy from os import path @@ -138,7 +139,8 @@ def test_mem_dtype_align(self,level=rlevel): self.assertRaises(TypeError,np.dtype, {'names':['a'],'formats':['foo']},align=1) - @dec.knownfailureif(sys.version_info[0] >= 3, + @dec.knownfailureif((sys.version_info[0] >= 3) or + (sys.platform == "win32" and platform.architecture()[0] == "64bit"), "numpy.intp('0xff', 
16) not supported on Py3, " "as it does not inherit from Python int") def test_intp(self,level=rlevel): @@ -1151,7 +1153,8 @@ def test_array_from_sequence_scalar_array2(self): def test_array_too_big(self): """Ticket #1080.""" - assert_raises(ValueError, np.zeros, [2**10]*10) + assert_raises(ValueError, np.zeros, [975]*7, np.int8) + assert_raises(ValueError, np.zeros, [26244]*5, np.int8) def test_dtype_keyerrors_(self): """Ticket #1106.""" @@ -1269,7 +1272,7 @@ def test_signed_integer_division_overflow(self): """Ticket #1317.""" def test_type(t): min = np.array([np.iinfo(t).min]) - min /= -1 + min //= -1 old_err = np.seterr(divide="ignore") try: @@ -1535,6 +1538,8 @@ def test_setting_rank0_string(self): a[()] = np.array(4) assert_equal(a, np.array(4)) + @dec.knownfailureif(sys.version_info[0] >= 3, + "a.dtype is U5 for Py 3.x. Knownfail for 1.6.x") def test_string_astype(self): "Ticket #1748" s1 = asbytes('black') @@ -1545,7 +1550,7 @@ def test_string_astype(self): b = a.astype('str') assert_equal(b.dtype, np.dtype('S5')) - def test_string_astype(self): + def test_ticket_1756(self): """Ticket #1756 """ s = asbytes('0123456789abcdef') a = np.array([s]*5) @@ -1574,5 +1579,37 @@ def test_ticket_1770(self): except: raise AssertionError + def test_structured_type_to_object(self): + a_rec = np.array([(0,1), (3,2)], dtype='i4,i8') + a_obj = np.empty((2,), dtype=object) + a_obj[0] = (0,1) + a_obj[1] = (3,2) + # astype records -> object + assert_equal(a_rec.astype(object), a_obj) + # '=' records -> object + b = np.empty_like(a_obj) + b[...] = a_rec + assert_equal(b, a_obj) + # '=' object -> records + b = np.empty_like(a_rec) + b[...] = a_obj + assert_equal(b, a_rec) + + def test_assign_obj_listoflists(self): + # Ticket # 1870 + # The inner list should get assigned to the object elements + a = np.zeros(4, dtype=object) + b = a.copy() + a[0] = [1] + a[1] = [2] + a[2] = [3] + a[3] = [4] + b[...] 
= [[1], [2], [3], [4]] + assert_equal(a, b) + # The first dimension should get broadcast + a = np.zeros((2,2), dtype=object) + a[...] = [[1,2]] + assert_equal(a, [[1,2], [1,2]]) + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index a2b3a232b500..1b872e23922c 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -81,7 +81,7 @@ def test_int_from_long(self): # assert_equal( val, val2 ) -class TestRepr(TestCase): +class TestRepr: def _test_type_repr(self, t): finfo=np.finfo(t) last_fraction_bit_idx = finfo.nexp + finfo.nmant @@ -111,7 +111,7 @@ def test_float_repr(self): # long double test cannot work, because eval goes through a python # float for t in [np.float32, np.float64]: - yield test_float_repr, t + yield self._test_type_repr, t if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index a7a41dfe2f28..2d952930dfc9 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -458,5 +458,19 @@ def broadcastable(s1,s2): assert_equal(ref, True, err_msg="reference check") + def test_casting_out_param(self): + # Test that it's possible to do casts on output + a = np.ones((200,100), np.int64) + b = np.ones((200,100), np.int64) + c = np.ones((200,100), np.float64) + np.add(a, b, out=c) + assert_equal(c, 2) + + a = np.zeros(65536) + b = np.zeros(65536, dtype=np.float32) + np.subtract(a, 0, out=b) + assert_equal(b, 0) + + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index b5f9d5745eb9..781be3cb7692 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -1,4 +1,5 @@ import sys +import platform from numpy.testing import * import numpy.core.umath as ncu @@ -1080,7 +1081,9 @@ def test_nextafter(): def test_nextafterf(): return _test_nextafter(np.float32) 
-@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32") +@dec.knownfailureif(sys.platform == 'win32' or + ("powerpc" in platform.processor()), + "Long double support buggy on win32 and PPC.") def test_nextafterl(): return _test_nextafter(np.longdouble) @@ -1105,7 +1108,9 @@ def test_spacing(): def test_spacingf(): return _test_spacing(np.float32) -@dec.knownfailureif(sys.platform == 'win32', "Long double support buggy on win32") +@dec.knownfailureif(sys.platform == 'win32' or + ("powerpc" in platform.processor()), + "Long double support buggy on win32 and PPC.") def test_spacingl(): return _test_spacing(np.longdouble) diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py index 5cc5d9566a2d..b962b7cd9680 100644 --- a/numpy/core/tests/test_umath_complex.py +++ b/numpy/core/tests/test_umath_complex.py @@ -12,7 +12,7 @@ # At least on Windows the results of many complex functions are not conforming # to the C99 standard. See ticket 1574. # Ditto for Solaris (ticket 1642) and OS X on PowerPC. 
-olderr = np.seterr(divide='ignore') +olderr = np.seterr(invalid='ignore', divide='ignore') try: functions_seem_flaky = ((np.exp(complex(np.inf, 0)).imag != 0) or (np.log(complex(np.NZERO, 0)).imag != np.pi)) @@ -434,7 +434,13 @@ class TestCabs(object): def test_simple(self): x = np.array([1+1j, 0+2j, 1+2j, np.inf, np.nan]) y_r = np.array([np.sqrt(2.), 2, np.sqrt(5), np.inf, np.nan]) - y = np.abs(x) + + olderr = np.seterr(invalid='ignore') + try: + y = np.abs(x) + finally: + np.seterr(**olderr) + for i in range(len(x)): assert_almost_equal(y[i], y_r[i]) @@ -446,8 +452,12 @@ def test_fabs(self): x = np.array([complex(1, np.NZERO)], dtype=np.complex) assert_array_equal(np.abs(x), np.real(x)) - x = np.array([complex(np.inf, np.NZERO)], dtype=np.complex) - assert_array_equal(np.abs(x), np.real(x)) + olderr = np.seterr(invalid='ignore') + try: + x = np.array([complex(np.inf, np.NZERO)], dtype=np.complex) + assert_array_equal(np.abs(x), np.real(x)) + finally: + np.seterr(**olderr) x = np.array([complex(np.nan, np.NZERO)], dtype=np.complex) assert_array_equal(np.abs(x), np.real(x)) diff --git a/numpy/ctypeslib.py b/numpy/ctypeslib.py index 1fdf3c396158..8dcfa635f44c 100644 --- a/numpy/ctypeslib.py +++ b/numpy/ctypeslib.py @@ -97,7 +97,16 @@ def load_library(libname, loader_path): # Try to load library with platform-specific name, otherwise # default to libname.[so|pyd]. Sometimes, these files are built # erroneously on non-linux platforms. - libname_ext = ['%s.so' % libname, '%s.pyd' % libname] + from numpy.distutils.misc_util import get_shared_lib_extension + so_ext = get_shared_lib_extension() + libname_ext = [libname + so_ext] + if sys.version[:3] >= '3.2': + # For Python >= 3.2 a tag may be added to lib extension + # (platform dependent). If we find such a tag, try both with + # and without it. 
+ so_ext2 = get_shared_lib_extension(is_python_ext=True) + if not so_ext2 == so_ext: + libname_ext.insert(0, libname + so_ext2) if sys.platform == 'win32': libname_ext.insert(0, '%s.dll' % libname) elif sys.platform == 'darwin': diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index 5d986570c9ac..0629dec52c5a 100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -35,3 +35,10 @@ def finalize_options(self): def run(self): old_build.run(self) + + #DISTNUMPY + #Building DistNumPy as the last part of the build command. + plat_specifier = ".%s-%s" % (get_platform(), sys.version[0:3]) + build_src = os.path.join(self.build_base, 'src'+plat_specifier) + import distnumpy + distnumpy.build(build_src) diff --git a/numpy/distutils/command/install_clib.py b/numpy/distutils/command/install_clib.py index 638d4beacbb5..8c96069178dc 100644 --- a/numpy/distutils/command/install_clib.py +++ b/numpy/distutils/command/install_clib.py @@ -19,6 +19,11 @@ def run (self): build_clib_cmd = get_cmd("build_clib") build_dir = build_clib_cmd.build_clib + #DISTNUMPY + #No need to 'install_clib' DistNumPy + if build_dir is None: + return + # We need the compiler to get the library name -> filename association if not build_clib_cmd.compiler: compiler = new_compiler(compiler=None) diff --git a/numpy/distutils/extension.py b/numpy/distutils/extension.py index 2db62969eed5..9f28263d8104 100644 --- a/numpy/distutils/extension.py +++ b/numpy/distutils/extension.py @@ -48,6 +48,13 @@ def __init__ (self, name, sources, # Python 2.4 distutils new features self.swig_opts = swig_opts or [] + # swig_opts is assumed to be a list. Here we handle the case where it + # is specified as a string instead. 
+ if isinstance(self.swig_opts, basestring): + import warnings + msg = "swig_opts is specified as a string instead of a list" + warnings.warn(msg, SyntaxWarning) + self.swig_opts = self.swig_opts.split() # Python 2.3 distutils new features self.depends = depends or [] diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py index d2400459fb5b..69d9d68d3105 100644 --- a/numpy/distutils/fcompiler/__init__.py +++ b/numpy/distutils/fcompiler/__init__.py @@ -27,7 +27,7 @@ from numpy.compat import open_latin1 -from distutils.sysconfig import get_config_var, get_config_vars, get_python_lib +from distutils.sysconfig import get_python_lib from distutils.fancy_getopt import FancyGetopt from distutils.errors import DistutilsModuleError, \ DistutilsExecError, CompileError, LinkError, DistutilsPlatformError @@ -35,7 +35,8 @@ from numpy.distutils.ccompiler import CCompiler, gen_lib_options from numpy.distutils import log -from numpy.distutils.misc_util import is_string, all_strings, is_sequence, make_temp_file +from numpy.distutils.misc_util import is_string, all_strings, is_sequence, \ + make_temp_file, get_shared_lib_extension from numpy.distutils.environment import EnvironmentConfig from numpy.distutils.exec_command import find_executable from numpy.distutils.compat import get_exception @@ -195,10 +196,8 @@ class FCompiler(CCompiler): src_extensions = ['.for','.ftn','.f77','.f','.f90','.f95','.F','.F90'] obj_extension = ".o" - shared_lib_extension = get_config_var('SO') # or .dll - # fix long extension for Python >=3.2, see PEP 3149. 
- if 'SOABI' in get_config_vars(): - shared_lib_extension = shared_lib_extension.replace('.'+get_config_var('SOABI'), '', 1) + + shared_lib_extension = get_shared_lib_extension() static_lib_extension = ".a" # or .lib static_lib_format = "lib%s%s" # or %s%s shared_lib_format = "%s%s" diff --git a/numpy/distutils/fcompiler/intel.py b/numpy/distutils/fcompiler/intel.py index b593a91c7cfd..190584829295 100644 --- a/numpy/distutils/fcompiler/intel.py +++ b/numpy/distutils/fcompiler/intel.py @@ -57,7 +57,7 @@ def get_flags_free(self): return ["-FR"] def get_flags_opt(self): - return ['-O3','-unroll'] + return ['-O1'] def get_flags_arch(self): v = self.get_version() @@ -201,7 +201,7 @@ def get_flags_debug(self): return ['/4Yb','/d2'] def get_flags_opt(self): - return ['/O3','/Qip'] + return ['/O1'] def get_flags_arch(self): opt = [] diff --git a/numpy/distutils/intelccompiler.py b/numpy/distutils/intelccompiler.py index b82445ab899b..9cff858cef39 100644 --- a/numpy/distutils/intelccompiler.py +++ b/numpy/distutils/intelccompiler.py @@ -1,12 +1,8 @@ - from distutils.unixccompiler import UnixCCompiler from numpy.distutils.exec_command import find_executable class IntelCCompiler(UnixCCompiler): - - """ A modified Intel compiler compatible with an gcc built Python. - """ - + """ A modified Intel compiler compatible with an gcc built Python.""" compiler_type = 'intel' cc_exe = 'icc' cc_args = 'fPIC' @@ -31,10 +27,8 @@ class IntelItaniumCCompiler(IntelCCompiler): break class IntelEM64TCCompiler(UnixCCompiler): - -""" A modified Intel x86_64 compiler compatible with a 64bit gcc built Python. + """ A modified Intel x86_64 compiler compatible with a 64bit gcc built Python. 
""" - compiler_type = 'intelem' cc_exe = 'icc -m64 -fPIC' cc_args = "-fPIC" diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py index 8af492894f27..1539c4176093 100644 --- a/numpy/distutils/misc_util.py +++ b/numpy/distutils/misc_util.py @@ -9,6 +9,7 @@ import subprocess import shutil +import distutils from distutils.errors import DistutilsError try: @@ -583,6 +584,35 @@ def get_lib_source_files(lib): filenames.append(d) return filenames +def get_shared_lib_extension(is_python_ext=False): + """Return the correct file extension for shared libraries. + + Parameters + ---------- + is_python_ext : bool, optional + Whether the shared library is a Python extension. Default is False. + + Returns + ------- + so_ext : str + The shared library extension. + + Notes + ----- + For Python shared libs, `so_ext` will typically be '.so' on Linux and OS X, + and '.pyd' on Windows. For Python >= 3.2 `so_ext` has a tag prepended on + POSIX systems according to PEP 3149. For Python 3.2 this is implemented on + Linux, but not on OS X. + + """ + so_ext = distutils.sysconfig.get_config_var('SO') or '' + # fix long extension for Python >=3.2, see PEP 3149. + if (not is_python_ext) and 'SOABI' in distutils.sysconfig.get_config_vars(): + # Does nothing unless SOABI config var exists + so_ext = so_ext.replace('.' + distutils.sysconfig.get_config_var('SOABI'), '', 1) + + return so_ext + def get_data_files(data): if is_string(data): return [data] @@ -772,7 +802,7 @@ def __init__(self, def todict(self): """ Return a dictionary compatible with the keyword arguments of distutils - setup function. + setup function. Examples -------- @@ -947,8 +977,8 @@ def get_subpackage(self,subpackage_name, def add_subpackage(self,subpackage_name, subpackage_path=None, standalone = False): - """Add a sub-package to the current Configuration instance. - + """Add a sub-package to the current Configuration instance. + This is useful in a setup.py script for adding sub-packages to a package. 
@@ -994,7 +1024,7 @@ def add_data_dir(self,data_path): installed (and distributed). The data_path can be either a relative path-name, or an absolute path-name, or a 2-tuple where the first argument shows where in the install directory the data directory - should be installed to. + should be installed to. Parameters ---------- @@ -1389,7 +1419,7 @@ def add_extension(self,name,sources,**kw): Notes ----- The self.paths(...) method is applied to all lists that may contain - paths. + paths. """ ext_args = copy.copy(kw) ext_args['name'] = dot_join(self.name,name) @@ -1863,7 +1893,7 @@ def _get_hg_revision(self,path): return revision branch_fn = njoin(path,'.hg','branch') branch_cache_fn = njoin(path,'.hg','branch.cache') - + if os.path.isfile(branch_fn): branch0 = None f = open(branch_fn) @@ -1889,8 +1919,8 @@ def get_version(self, version_file=None, version_variable=None): """Try to get version string of a package. Return a version string of the current package or None if the version - information could not be detected. - + information could not be detected. + Notes ----- This method scans files named @@ -1956,8 +1986,8 @@ def get_version(self, version_file=None, version_variable=None): def make_svn_version_py(self, delete=True): """Appends a data function to the data_files list that will generate - __svn_version__.py file to the current package directory. - + __svn_version__.py file to the current package directory. + Generate package __svn_version__.py file from SVN revision number, it will be removed after python exits but will be available when sdist, etc commands are executed. @@ -1999,8 +2029,8 @@ def rm_file(f=target,p=self.info): def make_hg_version_py(self, delete=True): """Appends a data function to the data_files list that will generate - __hg_version__.py file to the current package directory. - + __hg_version__.py file to the current package directory. 
+ Generate package __hg_version__.py file from Mercurial revision, it will be removed after python exits but will be available when sdist, etc commands are executed. diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py index 667ca82bc13b..6dedf59292e1 100644 --- a/numpy/distutils/system_info.py +++ b/numpy/distutils/system_info.py @@ -128,7 +128,8 @@ from numpy.distutils.exec_command import \ find_executable, exec_command, get_pythonexe -from numpy.distutils.misc_util import is_sequence, is_string +from numpy.distutils.misc_util import is_sequence, is_string, \ + get_shared_lib_extension from numpy.distutils.command.config import config as cmd_config from numpy.distutils.compat import get_exception @@ -210,11 +211,7 @@ def libpaths(paths,bits): default_include_dirs = filter(os.path.isdir, default_include_dirs) default_src_dirs = filter(os.path.isdir, default_src_dirs) -so_ext = distutils.sysconfig.get_config_vars('SO')[0] or '' -# fix long extension for Python >=3.2, see PEP 3149. -if 'SOABI' in distutils.sysconfig.get_config_vars(): - so_ext = so_ext.replace('.'+distutils.sysconfig.get_config_var('SOABI'), '', 1) - +so_ext = get_shared_lib_extension() def get_standard_file(fname): """Returns a list of files named 'fname' from diff --git a/numpy/doc/misc.py b/numpy/doc/misc.py index 81d7a54afe01..8fa3f8a31a7a 100644 --- a/numpy/doc/misc.py +++ b/numpy/doc/misc.py @@ -46,24 +46,28 @@ >>> np.nansum(x) 42.0 -How numpy handles numerical exceptions - -Default is to "warn" -But this can be changed, and it can be set individually for different kinds -of exceptions. 
The different behaviors are: :: - - 'ignore' : ignore completely - 'warn' : print a warning (once only) - 'raise' : raise an exception - 'call' : call a user-supplied function (set using seterrcall()) - -These behaviors can be set for all kinds of errors or specific ones: :: - - all: apply to all numeric exceptions - invalid: when NaNs are generated - divide: divide by zero (for integers as well!) - overflow: floating point overflows - underflow: floating point underflows +How numpy handles numerical exceptions: +------------------------------------------ + +The default is to ``'warn'`` for ``invalid``, ``divide``, and ``overflow`` +and ``'ignore'`` for ``underflow``. But this can be changed, and it can be +set individually for different kinds of exceptions. The different behaviors +are: + + - 'ignore' : Take no action when the exception occurs. + - 'warn' : Print a `RuntimeWarning` (via the Python `warnings` module). + - 'raise' : Raise a `FloatingPointError`. + - 'call' : Call a function specified using the `seterrcall` function. + - 'print' : Print a warning directly to ``stdout``. + - 'log' : Record error in a Log object specified by `seterrcall`. + +These behaviors can be set for all kinds of errors or specific ones: + + - all : apply to all numeric exceptions + - invalid : when NaNs are generated + - divide : divide by zero (for integers as well!) + - overflow : floating point overflows + - underflow : floating point underflows Note that integer divide-by-zero is handled by the same machinery. These behaviors are set on a per-thread basis. diff --git a/numpy/f2py/cfuncs.py b/numpy/f2py/cfuncs.py index 56a193963b29..9410a9f276d6 100644 --- a/numpy/f2py/cfuncs.py +++ b/numpy/f2py/cfuncs.py @@ -59,6 +59,7 @@ #include "arrayobject.h"''' includes['arrayobject.h']='#include "fortranobject.h"' +includes['stdarg.h']='#include ' ############# Type definitions ############### @@ -243,6 +244,7 @@ #define MIN(a,b) ((a < b) ? 
(a) : (b)) #endif """ +needs['len..']=['f2py_size'] cppmacros['len..']="""\ #define rank(var) var ## _Rank #define shape(var,dim) var ## _Dims[dim] @@ -251,9 +253,36 @@ #define fshape(var,dim) shape(var,rank(var)-dim-1) #define len(var) shape(var,0) #define flen(var) fshape(var,0) -#define size(var) PyArray_SIZE((PyArrayObject *)(capi_ ## var ## _tmp)) +#define old_size(var) PyArray_SIZE((PyArrayObject *)(capi_ ## var ## _tmp)) /* #define index(i) capi_i ## i */ #define slen(var) capi_ ## var ## _len +#define size(var, ...) f2py_size((PyArrayObject *)(capi_ ## var ## _tmp), ## __VA_ARGS__, -1) +""" +needs['f2py_size']=['stdarg.h'] +cfuncs['f2py_size']="""\ +int f2py_size(PyArrayObject* var, ...) +{ + npy_int sz = 0; + npy_int dim; + npy_int rank; + va_list argp; + va_start(argp, var); + dim = va_arg(argp, npy_int); + if (dim==-1) + { + sz = PyArray_SIZE(var); + } + else + { + rank = PyArray_NDIM(var); + if (dim>=1 && dim<=rank) + sz = PyArray_DIM(var, dim-1); + else + fprintf(stderr, \"f2py_size: 2nd argument value=%d fails to satisfy 1<=value<=%d. 
Result will be 0.\\n\", dim, rank); + } + va_end(argp); + return sz; +} """ cppmacros['pyobj_from_char1']='#define pyobj_from_char1(v) (PyInt_FromLong(v))' diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py index aa4c3bc0ab7f..eaeed50522fa 100755 --- a/numpy/f2py/crackfortran.py +++ b/numpy/f2py/crackfortran.py @@ -13,7 +13,7 @@ Pearu Peterson """ __version__ = "$Revision: 1.177 $"[10:-1] - +import platform import __version__ f2py_version = __version__.version @@ -283,7 +283,7 @@ def readfortrancode(ffile,dowithline=show,istop=1): cont=0 finalline='' ll='' - commentline=re.compile(r'(?P([^"]*"[^"]*"[^"!]*|[^\']*\'[^\']*\'[^\'!]*|[^!]*))!{1}(?P.*)') + commentline=re.compile(r'(?P([^"]*["][^"]*["][^"!]*|[^\']*\'[^\']*\'[^\'!]*|[^!\'"]*))!{1}(?P.*)') includeline=re.compile(r'\s*include\s*(\'|")(?P[^\'"]*)(\'|")',re.I) cont1=re.compile(r'(?P.*)&\s*\Z') cont2=re.compile(r'(\s*&|)(?P.*)') @@ -416,7 +416,7 @@ def readfortrancode(ffile,dowithline=show,istop=1): readfortrancode(fn1,dowithline=dowithline,istop=0) break if not foundfile: - outmess('readfortrancode: could not find include file %s. Ignoring.\n'%(`fn`)) + outmess('readfortrancode: could not find include file %s in %s. Ignoring.\n'%(`fn`, os.pathsep.join(include_dirs))) else: dowithline(finalline) l1=ll @@ -428,13 +428,19 @@ def readfortrancode(ffile,dowithline=show,istop=1): m=includeline.match(origfinalline) if m: fn=m.group('name') - fn1=os.path.join(os.path.dirname(currentfilename),fn) if os.path.isfile(fn): readfortrancode(fn,dowithline=dowithline,istop=0) - elif os.path.isfile(fn1): - readfortrancode(fn1,dowithline=dowithline,istop=0) else: - outmess('readfortrancode: could not find include file %s. 
Ignoring.\n'%(`fn`)) + include_dirs = [os.path.dirname(currentfilename)] + include_paths + foundfile = 0 + for inc_dir in include_dirs: + fn1 = os.path.join(inc_dir,fn) + if os.path.isfile(fn1): + foundfile = 1 + readfortrancode(fn1,dowithline=dowithline,istop=0) + break + if not foundfile: + outmess('readfortrancode: could not find include file %s in %s. Ignoring.\n'%(`fn`, os.pathsep.join(include_dirs))) else: dowithline(finalline) filepositiontext='' @@ -1149,7 +1155,6 @@ def analyzeline(m,case,line): groupcache[groupcounter]['use'][name]['map']=rl else: pass - else: print m.groupdict() outmess('analyzeline: Could not crack the use statement.\n') @@ -1248,6 +1253,8 @@ def updatevars(typespec,selector,attrspec,entitydecl): l = [] c = re.compile(r'(?P[a-zA-Z]+)') for a in attrspec: + if not a: + continue m = c.match(a) if m: s = m.group('start').lower() @@ -1490,6 +1497,7 @@ def get_useparameters(block, param_map=None): for usename,mapping in usedict.items(): usename = usename.lower() if usename not in f90modulevars: + outmess('get_useparameters: no module %s info used by %s\n' % (usename, block.get('name'))) continue mvars = f90modulevars[usename] params = get_parameters(mvars) @@ -1503,6 +1511,7 @@ def get_useparameters(block, param_map=None): outmess('get_useparameters: overriding parameter %s with'\ ' value from module %s' % (`k`,`usename`)) param_map[k] = v + return param_map def postcrack2(block,tab='',param_map=None): @@ -1972,7 +1981,12 @@ def _selected_real_kind_func(p,r=0,radix=0): #XXX: This should be processor dependent if p<7: return 4 if p<16: return 8 - if p<19: return 10 + if platform.machine().lower().startswith('power'): + if p<=20: + return 16 + else: + if p<19: + return 10 return -1 def get_parameters(vars, global_params={}): @@ -2015,7 +2029,6 @@ def get_parameters(vars, global_params={}): if iscomplex(vars[n]): if v[0]=='(' and v[-1]==')': l = markoutercomma(v[1:-1]).split('@,@') - print n,params try: params[n] = eval(v,g_params,params) 
except Exception,msg: @@ -2052,13 +2065,6 @@ def _eval_scalar(value,params): % (msg,value,params.keys())) return value -_size_call_sub = re.compile(r'size\s*\((?P\w+)\s*[,]').sub -def two_argument_size_hook(expr): - new_expr = _size_call_sub(r'shape(\g,-1+', expr) - if verbose > 1 and expr!=new_expr: - outmess('two_argument_size_hook: mapping %r to %r\n' % (expr, new_expr)) - return new_expr - def analyzevars(block): global f90modulevars setmesstext(block) @@ -2201,7 +2207,6 @@ def analyzevars(block): if d[:4] == '1 * ': d = d[4:] if di and di[-4:] == '/(1)': di = di[:-4] if v: savelindims[d] = v,di - d = two_argument_size_hook(d) vars[n]['dimension'].append(d) if 'dimension' in vars[n]: if isintent_c(vars[n]): @@ -2318,7 +2323,6 @@ def analyzevars(block): if not vars[n]['depend']: del vars[n]['depend'] if isscalar(vars[n]): vars[n]['='] = _eval_scalar(vars[n]['='],params) - vars[n]['='] = two_argument_size_hook(vars[n]['=']) for n in vars.keys(): if n==block['name']: # n is block name @@ -2382,33 +2386,49 @@ def analyzevars(block): return vars analyzeargs_re_1 = re.compile(r'\A[a-z]+[\w$]*\Z',re.I) +def expr2name(a, block, args=[]): + orig_a = a + a_is_expr = not analyzeargs_re_1.match(a) + if a_is_expr: # `a` is an expression + implicitrules,attrrules=buildimplicitrules(block) + at=determineexprtype(a,block['vars'],implicitrules) + na='e_' + for c in a: + c = c.lower() + if c not in string.lowercase+string.digits: c='_' + na=na+c + if na[-1]=='_': na=na+'e' + else: na=na+'_e' + a=na + while a in block['vars'] or a in block['args']: + a=a+'r' + if a in args: + k = 1 + while a + str(k) in args: + k = k + 1 + a = a + str(k) + if a_is_expr: + block['vars'][a]=at + else: + if a not in block['vars']: + if orig_a in block['vars']: + block['vars'][a] = block['vars'][orig_a] + else: + block['vars'][a]={} + if 'externals' in block and orig_a in block['externals']+block['interfaced']: + block['vars'][a]=setattrspec(block['vars'][a],'external') + return a + def 
analyzeargs(block): setmesstext(block) implicitrules,attrrules=buildimplicitrules(block) if 'args' not in block: block['args']=[] args=[] - re_1 = analyzeargs_re_1 for a in block['args']: - if not re_1.match(a): # `a` is an expression - at=determineexprtype(a,block['vars'],implicitrules) - na='e_' - for c in a: - if c not in string.lowercase+string.digits: c='_' - na=na+c - if na[-1]=='_': na=na+'e' - else: na=na+'_e' - a=na - while a in block['vars'] or a in block['args']: - a=a+'r' - block['vars'][a]=at + a = expr2name(a, block, args) args.append(a) - if a not in block['vars']: - block['vars'][a]={} - if 'externals' in block and a in block['externals']+block['interfaced']: - block['vars'][a]=setattrspec(block['vars'][a],'external') block['args']=args - if 'entry' in block: for k,args1 in block['entry'].items(): for a in args1: @@ -2498,14 +2518,17 @@ def crack2fortrangen(block,tab='\n', as_interface=False): args='' blocktype=block['block'] if blocktype=='program': return '' - al=[] + argsl = [] if 'name' in block: name=block['name'] if 'args' in block: vars = block['vars'] - al = [a for a in block['args'] if not isintent_callback(vars[a])] - if block['block']=='function' or al: - args='(%s)'%','.join(al) + for a in block['args']: + a = expr2name(a, block, argsl) + if not isintent_callback(vars[a]): + argsl.append(a) + if block['block']=='function' or argsl: + args='(%s)'%','.join(argsl) f2pyenhancements = '' if 'f2pyenhancements' in block: for k in block['f2pyenhancements'].keys(): @@ -2527,12 +2550,12 @@ def crack2fortrangen(block,tab='\n', as_interface=False): result='' if 'result' in block: result=' result (%s)'%block['result'] - if block['result'] not in al: - al.append(block['result']) + if block['result'] not in argsl: + argsl.append(block['result']) #if 'prefix' in block: # prefix=block['prefix']+' ' body=crack2fortrangen(block['body'],tab+tabchar) - vars=vars2fortran(block,block['vars'],al,tab+tabchar, as_interface=as_interface) + 
vars=vars2fortran(block,block['vars'],argsl,tab+tabchar, as_interface=as_interface) mess='' if 'from' in block and not as_interface: mess='! in %s'%block['from'] @@ -2674,7 +2697,7 @@ def vars2fortran(block,vars,args,tab='', as_interface=False): if l not in ['external']: attr.append(l) if attr: - vardef='%s %s'%(vardef,','.join(attr)) + vardef='%s, %s'%(vardef,','.join(attr)) c=',' if 'dimension' in vars[a]: # if not isintent_c(vars[a]): diff --git a/numpy/f2py/f2py2e.py b/numpy/f2py/f2py2e.py index 86c4b31a44d8..1d0631e8d0bc 100755 --- a/numpy/f2py/f2py2e.py +++ b/numpy/f2py/f2py2e.py @@ -4,7 +4,7 @@ f2py2e - Fortran to Python C/API generator. 2nd Edition. See __usage__ below. -Copyright 1999--2005 Pearu Peterson all rights reserved, +Copyright 1999--2011 Pearu Peterson all rights reserved, Pearu Peterson Permission to use, modify, and distribute this software is given under the terms of the NumPy License. @@ -13,7 +13,6 @@ $Date: 2005/05/06 08:31:19 $ Pearu Peterson """ -__version__ = "$Revision: 1.90 $"[10:-1] import __version__ f2py_version = __version__.version @@ -112,7 +111,7 @@ functions. --wrap-functions is default because it ensures maximum portability/compiler independence. - --include_paths ::... Search include files from the given + --include-paths ::... Search include files from the given directories. --help-link [..] List system resources found by system_info.py. See also @@ -170,7 +169,7 @@ numpy Version: %s Requires: Python 2.3 or higher. License: NumPy license (see LICENSE.txt in the NumPy source code) -Copyright 1999 - 2005 Pearu Peterson all rights reserved. +Copyright 1999 - 2011 Pearu Peterson all rights reserved. 
http://cens.ioc.ee/projects/f2py2e/"""%(f2py_version, numpy_version) def scaninputline(inputline): @@ -218,7 +217,10 @@ def scaninputline(inputline): elif l[:8]=='-include': cfuncs.outneeds['userincludes'].append(l[9:-1]) cfuncs.userincludes[l[9:-1]]='#include '+l[8:] - elif l[:15]=='--include_paths': + elif l[:15] in '--include_paths': + outmess('f2py option --include_paths is deprecated, use --include-paths instead.\n') + f7=1 + elif l[:15] in '--include-paths': f7=1 elif l[0]=='-': errmess('Unknown option %s\n'%`l`) @@ -486,6 +488,14 @@ def run_compile(): modulename = 'untitled' sources = sys.argv[1:] + + for optname in ['--include_paths', '--include-paths']: + if optname in sys.argv: + i = sys.argv.index (optname) + f2py_flags.extend (sys.argv[i:i+2]) + del sys.argv[i+1],sys.argv[i] + sources = sys.argv[1:] + if '-m' in sys.argv: i = sys.argv.index('-m') modulename = sys.argv[i+1] diff --git a/numpy/f2py/f90mod_rules.py b/numpy/f2py/f90mod_rules.py index 6ce9b15cac18..e4a4b0e96fc7 100644 --- a/numpy/f2py/f90mod_rules.py +++ b/numpy/f2py/f90mod_rules.py @@ -180,8 +180,13 @@ def iadd(line,s=ihooks): s[0] = '%s\n%s'%(s[0],line) #efargs.append(fargs[-1]) ifargs.append(func2subr.createfuncwrapper(b,signature=1)) else: - fargs.append(b['name']) - mfargs.append(fargs[-1]) + if wrap: + fhooks[0]=fhooks[0]+wrap + fargs.append('f2pywrap_%s_%s'%(m['name'],b['name'])) + ifargs.append(func2subr.createsubrwrapper(b,signature=1)) + else: + fargs.append(b['name']) + mfargs.append(fargs[-1]) #if '--external-modroutines' in options and options['--external-modroutines']: # outmess('\t\t\tapplying --external-modroutines for %s\n'%(b['name'])) # efargs.append(fargs[-1]) diff --git a/numpy/f2py/func2subr.py b/numpy/f2py/func2subr.py index 02401d504c6b..f746108ad13e 100644 --- a/numpy/f2py/func2subr.py +++ b/numpy/f2py/func2subr.py @@ -90,7 +90,6 @@ def createfuncwrapper(rout,signature=0): v['dimension'][i] = dn rout['args'].extend(extra_args) need_interface = bool(extra_args) - ret 
= [''] def add(line,ret=ret): @@ -124,9 +123,7 @@ def add(line,ret=ret): add('subroutine f2pywrap%s (%s)'%(name,sargs)) if not need_interface: add('external %s'%(fortranname)) - #if not return_char_star: - l = l + ', '+fortranname - + l = l + ', '+fortranname if need_interface: for line in rout['saved_interface'].split('\n'): if line.lstrip().startswith('use '): @@ -143,16 +140,25 @@ def add(line,ret=ret): if isscalar(vars[a]): add(var2fixfortran(vars,a,f90mode=f90mode)) dumped_args.append(a) + for a in args: + if a in dumped_args: continue + if isintent_in(vars[a]): + add(var2fixfortran(vars,a,f90mode=f90mode)) + dumped_args.append(a) for a in args: if a in dumped_args: continue add(var2fixfortran(vars,a,f90mode=f90mode)) - + add(l) if need_interface: - add('interface') - add(rout['saved_interface'].lstrip()) - add('end interface') + if f90mode: + # f90 module already defines needed interface + pass + else: + add('interface') + add(rout['saved_interface'].lstrip()) + add('end interface') sargs = ', '.join([a for a in args if a not in extra_args]) @@ -187,7 +193,7 @@ def createsubrwrapper(rout,signature=0): v['dimension'][i] = dn rout['args'].extend(extra_args) need_interface = bool(extra_args) - + ret = [''] def add(line,ret=ret): ret[0] = '%s\n %s'%(ret[0],line) @@ -227,9 +233,13 @@ def add(line,ret=ret): add(var2fixfortran(vars,a,f90mode=f90mode)) if need_interface: - add('interface') - add(rout['saved_interface'].lstrip()) - add('end interface') + if f90mode: + # f90 module already defines needed interface + pass + else: + add('interface') + add(rout['saved_interface'].lstrip()) + add('end interface') sargs = ', '.join([a for a in args if a not in extra_args]) @@ -268,8 +278,7 @@ def assubr(rout): break if flag: fvar['intent'].append('out=%s' % (rname)) - - rout['args'] = [fname] + rout['args'] + rout['args'][:] = [fname] + rout['args'] return rout,createfuncwrapper(rout) if issubroutine_wrap(rout): fortranname = getfortranname(rout) diff --git 
a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c index 71bee783da6a..73aa408629ad 100644 --- a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c +++ b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c @@ -53,8 +53,10 @@ static PyObject *f2py_rout_wrap_call(PyObject *capi_self, dims[i] = (npy_intp)PyInt_AsLong(PySequence_GetItem(dims_capi,i)); capi_arr_tmp = array_from_pyobj(type_num,dims,rank,intent|F2PY_INTENT_OUT,arr_capi); - if (capi_arr_tmp == NULL) + if (capi_arr_tmp == NULL) { + free(dims); return NULL; + } capi_buildvalue = Py_BuildValue("N",capi_arr_tmp); free(dims); return capi_buildvalue; diff --git a/numpy/f2py/tests/src/assumed_shape/foo_mod.f90 b/numpy/f2py/tests/src/assumed_shape/foo_mod.f90 new file mode 100644 index 000000000000..cbe6317ed8f3 --- /dev/null +++ b/numpy/f2py/tests/src/assumed_shape/foo_mod.f90 @@ -0,0 +1,41 @@ + +module mod + +contains + +subroutine sum(x, res) + implicit none + real, intent(in) :: x(:) + real, intent(out) :: res + + integer :: i + + !print *, "sum: size(x) = ", size(x) + + res = 0.0 + + do i = 1, size(x) + res = res + x(i) + enddo + +end subroutine sum + +function fsum(x) result (res) + implicit none + real, intent(in) :: x(:) + real :: res + + integer :: i + + !print *, "fsum: size(x) = ", size(x) + + res = 0.0 + + do i = 1, size(x) + res = res + x(i) + enddo + +end function fsum + + +end module mod diff --git a/numpy/f2py/tests/src/size/foo.f90 b/numpy/f2py/tests/src/size/foo.f90 index 9602837fe2fd..5b66f8c430d7 100644 --- a/numpy/f2py/tests/src/size/foo.f90 +++ b/numpy/f2py/tests/src/size/foo.f90 @@ -12,3 +12,33 @@ subroutine foo(a, n, m, b) b(i) = sum(a(i,:)) enddo end subroutine + +subroutine trans(x,y) + implicit none + real, intent(in), dimension(:,:) :: x + real, intent(out), dimension( size(x,2), size(x,1) ) :: y + integer :: N, M, i, j + N = size(x,1) + M = size(x,2) + DO i=1,N + do j=1,M + y(j,i) = x(i,j) + END DO + END DO +end subroutine trans + 
+subroutine flatten(x,y) + implicit none + real, intent(in), dimension(:,:) :: x + real, intent(out), dimension( size(x) ) :: y + integer :: N, M, i, j, k + N = size(x,1) + M = size(x,2) + k = 1 + DO i=1,N + do j=1,M + y(k) = x(i,j) + k = k + 1 + END DO + END DO +end subroutine flatten diff --git a/numpy/f2py/tests/test_assumed_shape.py b/numpy/f2py/tests/test_assumed_shape.py index da362b7603b1..e501b13c3d98 100644 --- a/numpy/f2py/tests/test_assumed_shape.py +++ b/numpy/f2py/tests/test_assumed_shape.py @@ -13,6 +13,7 @@ class TestAssumedShapeSumExample(util.F2PyTest): sources = [_path('src', 'assumed_shape', 'foo_free.f90'), _path('src', 'assumed_shape', 'foo_use.f90'), _path('src', 'assumed_shape', 'precision.f90'), + _path('src', 'assumed_shape', 'foo_mod.f90'), ] @dec.slow @@ -21,10 +22,14 @@ def test_all(self): assert_(r==3,`r`) r = self.module.sum([1,2]) assert_(r==3,`r`) - r = self.module.sum_with_use([1,2]) assert_(r==3,`r`) + r = self.module.mod.sum([1,2]) + assert_(r==3,`r`) + r = self.module.mod.fsum([1,2]) + assert_(r==3,`r`) + if __name__ == "__main__": import nose nose.runmodule() diff --git a/numpy/f2py/tests/test_size.py b/numpy/f2py/tests/test_size.py index c00dd9a31b17..a548e9885eb1 100644 --- a/numpy/f2py/tests/test_size.py +++ b/numpy/f2py/tests/test_size.py @@ -24,6 +24,22 @@ def test_all(self): r = self.module.foo([[1,2],[3,4],[5,6]]) assert_equal(r, [3,7,11],`r`) + @dec.slow + def test_transpose(self): + r = self.module.trans([[1,2]]) + assert_equal(r, [[1],[2]],`r`) + + r = self.module.trans([[1,2,3],[4,5,6]]) + assert_equal(r, [[1,4],[2,5],[3,6]],`r`) + + @dec.slow + def test_flatten(self): + r = self.module.flatten([[1,2]]) + assert_equal(r, [1,2],`r`) + + r = self.module.flatten([[1,2,3],[4,5,6]]) + assert_equal(r, [1,2,3,4,5,6],`r`) + if __name__ == "__main__": import nose nose.runmodule() diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index e24583ec1c34..a5c3c4b28c29 100644 --- a/numpy/lib/_iotools.py +++ 
b/numpy/lib/_iotools.py @@ -600,9 +600,15 @@ def __init__(self, dtype_or_func=None, default=None, missing_values=None, # If the input was a dtype, set the function to the last we saw if self.func is None: self.func = func - # If the status is 1 (int), change the function to smthg more robust + # If the status is 1 (int), change the function to + # something more robust. if self.func == self._mapper[1][1]: - self.func = lambda x : int(float(x)) + if issubclass(ttype, np.uint64): + self.func = np.uint64 + elif issubclass(ttype, np.int64): + self.func = np.int64 + else: + self.func = lambda x : int(float(x)) # Store the list of strings corresponding to missing values. if missing_values is None: self.missing_values = set([asbytes('')]) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index dd792c509a57..ca291f6bb869 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -58,7 +58,7 @@ def iterable(y): except: return 0 return 1 -def histogram(a, bins=10, range=None, normed=False, weights=None): +def histogram(a, bins=10, range=None, normed=False, weights=None, density=None): """ Compute the histogram of a set of data. @@ -76,17 +76,27 @@ def histogram(a, bins=10, range=None, normed=False, weights=None): is simply ``(a.min(), a.max())``. Values outside the range are ignored. normed : bool, optional + This keyword is deprecated in Numpy 1.6 due to confusing/buggy + behavior. It will be removed in Numpy 2.0. Use the density keyword + instead. If False, the result will contain the number of samples in each bin. If True, the result is the value of the probability *density* function at the bin, normalized such that - the *integral* over the range is 1. Note that the sum of the - histogram values will not be equal to 1 unless bins of unity - width are chosen; it is not a probability *mass* function. + the *integral* over the range is 1. Note that this latter behavior is + known to be buggy with unequal bin widths; use `density` instead. 
weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count (instead of 1). If `normed` is True, the weights are normalized, so that the integral of the density over the range remains 1 + density : bool, optional + If False, the result will contain the number of samples + in each bin. If True, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + Overrides the `normed` keyword if given. Returns ------- @@ -116,13 +126,13 @@ def histogram(a, bins=10, range=None, normed=False, weights=None): -------- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) (array([0, 2, 1]), array([0, 1, 2, 3])) - >>> np.histogram(np.arange(4), bins=np.arange(5), normed=True) + >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) (array([1, 4, 1]), array([0, 1, 2, 3])) >>> a = np.arange(5) - >>> hist, bin_edges = np.histogram(a, normed=True) + >>> hist, bin_edges = np.histogram(a, density=True) >>> hist array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) >>> hist.sum() @@ -148,17 +158,21 @@ def histogram(a, bins=10, range=None, normed=False, weights=None): 'max must be larger than min in range parameter.') if not iterable(bins): + if np.isscalar(bins) and bins < 1: + raise ValueError("`bins` should be a positive integer.") if range is None: - range = (a.min(), a.max()) + if a.size == 0: + # handle empty arrays. Can't determine range, so use 0-1. 
+ range = (0, 1) + else: + range = (a.min(), a.max()) mn, mx = [mi+0.0 for mi in range] if mn == mx: mn -= 0.5 mx += 0.5 bins = linspace(mn, mx, bins+1, endpoint=True) - uniform = True else: bins = asarray(bins) - uniform = False if (np.diff(bins) < 0).any(): raise AttributeError( 'bins must increase monotonically.') @@ -191,20 +205,19 @@ def histogram(a, bins=10, range=None, normed=False, weights=None): n = np.diff(n) - if normed: - db = array(np.diff(bins), float) - if not uniform: - warnings.warn(""" - This release of NumPy fixes a normalization bug in histogram - function occuring with non-uniform bin widths. The returned - value is now a density: n / (N * bin width), where n is the - bin count and N the total number of points. - """) - return n/db/n.sum(), bins - - + if density is not None: + if density: + db = array(np.diff(bins), float) + return n/db/n.sum(), bins + else: + return n, bins else: - return n, bins + # deprecated, buggy behavior. Remove for Numpy 2.0 + if normed: + db = array(np.diff(bins), float) + return n/(n*db).sum(), bins + else: + return n, bins def histogramdd(sample, bins=10, range=None, normed=False, weights=None): @@ -228,7 +241,7 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): A sequence of lower and upper bin edges to be used if the edges are not given explicitely in `bins`. Defaults to the minimum and maximum values along each dimension. - normed : boolean, optional + normed : bool, optional If False, returns the number of samples in each bin. If True, returns the bin density, ie, the bin count divided by the bin hypervolume. 
weights : array_like (N,), optional @@ -247,8 +260,8 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): See Also -------- - histogram: 1D histogram - histogram2d: 2D histogram + histogram: 1-D histogram + histogram2d: 2-D histogram Examples -------- @@ -280,13 +293,19 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): 'The dimension of bins must be equal'\ ' to the dimension of the sample x.') except TypeError: + # bins is an integer bins = D*[bins] # Select range for each dimension # Used only if number of bins is given. if range is None: - smin = atleast_1d(array(sample.min(0), float)) - smax = atleast_1d(array(sample.max(0), float)) + # Handle empty input. Range can't be determined in that case, use 0-1. + if N == 0: + smin = zeros(D) + smax = ones(D) + else: + smin = atleast_1d(array(sample.min(0), float)) + smax = atleast_1d(array(sample.max(0), float)) else: smin = zeros(D) smax = zeros(D) @@ -302,12 +321,23 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): # Create edge arrays for i in arange(D): if isscalar(bins[i]): + if bins[i] < 1: + raise ValueError("Element at index %s in `bins` should be " + "a positive integer." % i) nbin[i] = bins[i] + 2 # +2 for outlier bins edges[i] = linspace(smin[i], smax[i], nbin[i]-1) else: edges[i] = asarray(bins[i], float) nbin[i] = len(edges[i])+1 # +1 for outlier bins dedges[i] = diff(edges[i]) + if np.any(np.asarray(dedges[i]) <= 0): + raise ValueError(""" + Found bin edge of size <= 0. Did you specify `bins` with + non-monotonic sequence?""") + + # Handle empty input. + if N == 0: + return np.zeros(D), edges nbin = asarray(nbin) @@ -322,12 +352,14 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): outliers = zeros(N, int) for i in arange(D): # Rounding precision - decimal = int(-log10(dedges[i].min())) +6 - # Find which points are on the rightmost edge. 
- on_edge = where(around(sample[:,i], decimal) == around(edges[i][-1], - decimal))[0] - # Shift these points one bin to the left. - Ncount[i][on_edge] -= 1 + mindiff = dedges[i].min() + if not np.isinf(mindiff): + decimal = int(-log10(mindiff)) + 6 + # Find which points are on the rightmost edge. + on_edge = where(around(sample[:,i], decimal) == around(edges[i][-1], + decimal))[0] + # Shift these points one bin to the left. + Ncount[i][on_edge] -= 1 # Flattened histogram matrix (1D) # Reshape is used so that overlarge arrays @@ -1937,6 +1969,9 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None): raise ValueError("ddof must be integer") X = array(m, ndmin=2, dtype=float) + if X.size == 0: + # handle empty arrays + return np.array(m) if X.shape[0] == 1: rowvar = 1 if rowvar: @@ -1984,7 +2019,7 @@ def corrcoef(x, y=None, rowvar=1, bias=0, ddof=None): Parameters ---------- - m : array_like + x : array_like A 1-D or 2-D array containing multiple variables and observations. Each row of `m` represents a variable, and each column a single observation of all those variables. Also see `rowvar` below. 
@@ -2018,6 +2053,9 @@ def corrcoef(x, y=None, rowvar=1, bias=0, ddof=None): """ c = cov(x, y, rowvar, bias, ddof) + if c.size == 0: + # handle empty arrays + return c try: d = diag(c) except ValueError: # scalar covariance @@ -2971,28 +3009,27 @@ def percentile(a, q, axis=None, out=None, overwrite_input=False): >>> a array([[10, 7, 4], [ 3, 2, 1]]) - >>> np.percentile(a, 0.5) + >>> np.percentile(a, 50) 3.5 >>> np.percentile(a, 0.5, axis=0) array([ 6.5, 4.5, 2.5]) - >>> np.percentile(a, 0.5, axis=1) + >>> np.percentile(a, 50, axis=1) array([ 7., 2.]) - >>> m = np.percentile(a, 0.5, axis=0) + >>> m = np.percentile(a, 50, axis=0) >>> out = np.zeros_like(m) - >>> np.percentile(a, 0.5, axis=0, out=m) + >>> np.percentile(a, 50, axis=0, out=m) array([ 6.5, 4.5, 2.5]) >>> m array([ 6.5, 4.5, 2.5]) >>> b = a.copy() - >>> np.percentile(b, 0.5, axis=1, overwrite_input=True) + >>> np.percentile(b, 50, axis=1, overwrite_input=True) array([ 7., 2.]) >>> assert not np.all(a==b) >>> b = a.copy() - >>> np.percentile(b, 0.5, axis=None, overwrite_input=True) + >>> np.percentile(b, 50, axis=None, overwrite_input=True) 3.5 - >>> assert not np.all(a==b) """ a = np.asarray(a) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 9fbacaa226c6..9c385bb9dfca 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -565,6 +565,10 @@ def _getconv(dtype): typ = dtype.type if issubclass(typ, np.bool_): return lambda x: bool(int(x)) + if issubclass(typ, np.uint64): + return np.uint64 + if issubclass(typ, np.int64): + return np.int64 if issubclass(typ, np.integer): return lambda x: int(float(x)) elif issubclass(typ, np.floating): @@ -579,7 +583,8 @@ def _getconv(dtype): def loadtxt(fname, dtype=float, comments='#', delimiter=None, - converters=None, skiprows=0, usecols=None, unpack=False): + converters=None, skiprows=0, usecols=None, unpack=False, + ndmin=0): """ Load data from a text file. 
@@ -588,16 +593,18 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Parameters ---------- fname : file or str - File or filename to read. If the filename extension is ``.gz`` or - ``.bz2``, the file is first decompressed. + File, filename, or generator to read. If the filename extension is + ``.gz`` or ``.bz2``, the file is first decompressed. Note that + generators should return byte strings for Python 3k. dtype : data-type, optional - Data-type of the resulting array; default: float. If this is a record - data-type, the resulting array will be 1-dimensional, and each row - will be interpreted as an element of the array. In this case, the - number of columns used must match the number of fields in the - data-type. + Data-type of the resulting array; default: float. If this is a + record data-type, the resulting array will be 1-dimensional, and + each row will be interpreted as an element of the array. In this + case, the number of columns used must match the number of fields in + the data-type. comments : str, optional - The character used to indicate the start of a comment; default: '#'. + The character used to indicate the start of a comment; + default: '#'. delimiter : str, optional The string used to separate values. By default, this is any whitespace. @@ -605,8 +612,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, A dictionary mapping column number to a function that will convert that column to a float. E.g., if column 0 is a date string: ``converters = {0: datestr2num}``. Converters can also be used to - provide a default value for missing data: - ``converters = {3: lambda s: float(s or 0)}``. Default: None. + provide a default value for missing data (but see also `genfromtxt`): + ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. skiprows : int, optional Skip the first `skiprows` lines; default: 0. 
usecols : sequence, optional @@ -615,7 +622,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, The default, None, results in all columns being read. unpack : bool, optional If True, the returned array is transposed, so that arguments may be - unpacked using ``x, y, z = loadtxt(...)``. The default is False. + unpacked using ``x, y, z = loadtxt(...)``. When used with a record + data-type, arrays are returned for each field. Default is False. + ndmin : int, optional + The returned array will have at least `ndmin` dimensions. + Otherwise mono-dimensional axes will be squeezed. + Legal values: 0 (default), 1 or 2. + .. versionadded:: 1.6.0 Returns ------- @@ -658,28 +671,27 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, """ # Type conversions for Py3 convenience comments = asbytes(comments) + user_converters = converters if delimiter is not None: delimiter = asbytes(delimiter) - - user_converters = converters - if usecols is not None: usecols = list(usecols) - own_fh = False - if _is_string_like(fname): - own_fh = True - if fname.endswith('.gz'): - fh = seek_gzip_factory(fname) - elif fname.endswith('.bz2'): - import bz2 - fh = bz2.BZ2File(fname) + fown = False + try: + if _is_string_like(fname): + fown = True + if fname.endswith('.gz'): + fh = iter(seek_gzip_factory(fname)) + elif fname.endswith('.bz2'): + import bz2 + fh = iter(bz2.BZ2File(fname)) + else: + fh = iter(open(fname, 'U')) else: - fh = open(fname, 'U') - elif hasattr(fname, 'readline'): - fh = fname - else: - raise ValueError('fname must be a string or file handle') + fh = iter(fname) + except TypeError: + raise ValueError('fname must be a string, file handle, or generator') X = [] def flatten_dtype(dt): @@ -724,7 +736,7 @@ def pack_items(items, packing): def split_line(line): """Chop off comments, strip, and split at delimiter.""" - line = asbytes(line).split(comments)[0].strip() + line = asbytes(line).split(comments)[0].strip(asbytes('\r\n')) if line: return 
line.split(delimiter) else: @@ -737,16 +749,19 @@ def split_line(line): # Skip the first `skiprows` lines for i in xrange(skiprows): - fh.readline() + fh.next() # Read until we find a line with some values, and use # it to estimate the number of columns, N. first_vals = None - while not first_vals: - first_line = fh.readline() - if not first_line: # EOF reached - raise IOError('End-of-file reached before encountering data.') - first_vals = split_line(first_line) + try: + while not first_vals: + first_line = fh.next() + first_vals = split_line(first_line) + except StopIteration: + # End of lines reached + first_line = '' + first_vals = [] N = len(usecols or first_vals) dtype_types, packing = flatten_dtype(dtype) @@ -775,25 +790,44 @@ def split_line(line): vals = split_line(line) if len(vals) == 0: continue - if usecols: vals = [vals[i] for i in usecols] - # Convert each value according to its column and store items = [conv(val) for (conv, val) in zip(converters, vals)] # Then pack it according to the dtype's nesting items = pack_items(items, packing) - X.append(items) finally: - if own_fh: + if fown: fh.close() X = np.array(X, dtype) + # Multicolumn data are returned with shape (1, N, M), i.e. + # (1, 1, M) for a single row - remove the singleton dimension there + if X.ndim == 3 and X.shape[:2] == (1, 1): + X.shape = (1, -1) + + # Verify that the array has at least dimensions `ndmin`. 
+ # Check correctness of the values of `ndmin` + if not ndmin in [0, 1, 2]: + raise ValueError('Illegal value of ndmin keyword: %s' % ndmin) + # Tweak the size and shape of the arrays - remove extraneous dimensions + if X.ndim > ndmin: + X = np.squeeze(X) + # and ensure we have the minimum number of dimensions asked for + # - has to be in this order for the odd case ndmin=1, X.squeeze().ndim=0 + if X.ndim < ndmin: + if ndmin == 1: + X = np.atleast_1d(X) + elif ndmin == 2: + X = np.atleast_2d(X).T - X = np.squeeze(X) if unpack: - return X.T + if len(dtype_types) > 1: + # For structured arrays, return an array for each field. + return [X[field] for field in dtype.names] + else: + return X.T else: return X @@ -1055,8 +1089,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, Parameters ---------- fname : file or str - File or filename to read. If the filename extension is `.gz` or - `.bz2`, the file is first decompressed. + File, filename, or generator to read. If the filename extension is + `.gz` or `.bz2`, the file is first decompressed. Note that + generators must return byte strings in Python 3k. dtype : dtype, optional Data type of the resulting array. If None, the dtypes will be determined by the contents of each @@ -1201,14 +1236,16 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Initialize the filehandle, the LineSplitter and the NameValidator own_fhd = False - if isinstance(fname, basestring): - fhd = np.lib._datasource.open(fname, 'U') - own_fhd = True - elif not hasattr(fname, 'read'): - raise TypeError("The input should be a string or a filehandle. "\ + try: + if isinstance(fname, basestring): + fhd = iter(np.lib._datasource.open(fname, 'rbU')) + own_fhd = True + else: + fhd = iter(fname) + except TypeError: + raise TypeError("fname must be a string, filehandle, or generator. 
"\ "(got %s instead)" % type(fname)) - else: - fhd = fname + split_line = LineSplitter(delimiter=delimiter, comments=comments, autostrip=autostrip)._handyman validate_names = NameValidator(excludelist=excludelist, @@ -1225,17 +1262,21 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skip_header = skiprows # Skip the first `skip_header` rows for i in xrange(skip_header): - fhd.readline() + fhd.next() + # Keep on until we find the first valid values first_values = None - while not first_values: - first_line = fhd.readline() - if not first_line: - raise IOError('End-of-file reached before encountering data.') - if names is True: - if comments in first_line: - first_line = asbytes('').join(first_line.split(comments)[1:]) - first_values = split_line(first_line) + try: + while not first_values: + first_line = fhd.next() + if names is True: + if comments in first_line: + first_line = asbytes('').join(first_line.split(comments)[1:]) + first_values = split_line(first_line) + except StopIteration: + # might want to return empty array instead of raising error. + raise IOError('End-of-file reached before encountering data.') + # Should we take the first values as names ? 
if names is True: fval = first_values[0].strip() diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py index 544057e3a600..853d060877b9 100644 --- a/numpy/lib/tests/test__iotools.py +++ b/numpy/lib/tests/test__iotools.py @@ -218,6 +218,20 @@ def test_keep_missing_values(self): missing_values=asbytes("N/A")) assert_equal(converter.missing_values, set(asbytes_nested(['', 'N/A']))) + def test_int64_dtype(self): + "Check that int64 integer types can be specified" + converter = StringConverter(np.int64, default=0) + val = asbytes("-9223372036854775807") + assert_(converter(val) == -9223372036854775807) + val = asbytes("9223372036854775807") + assert_(converter(val) == 9223372036854775807) + + def test_uint64_dtype(self): + "Check that uint64 integer types can be specified" + converter = StringConverter(np.uint64, default=0) + val = asbytes("9223372043271415339") + assert_(converter(val) == 9223372043271415339) + #------------------------------------------------------------------------------- class TestMiscFunctions(TestCase): @@ -309,3 +323,5 @@ def test_flatten_dtype(self): dt_flat = flatten_dtype(dt) assert_equal(dt_flat, [float, float]) +if __name__ == "__main__": + run_module_suite() diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 13930c79e407..05675230ac83 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -556,6 +556,10 @@ def test_one_bin(self): hist, edges = histogram([1, 2, 3, 4], [1, 2]) assert_array_equal(hist, [2, ]) assert_array_equal(edges, [1, 2]) + assert_raises(ValueError, histogram, [1, 2], bins=0) + h, e = histogram([1,2], bins=1) + assert_equal(h, array([2])) + assert_allclose(e, array([1., 2.])) def test_normed(self): # Check that the integral of the density equals 1. 
@@ -565,12 +569,25 @@ def test_normed(self): area = sum(a * diff(b)) assert_almost_equal(area, 1) - warnings.filterwarnings('ignore', - message="\s*This release of NumPy fixes a normalization bug") + # Check with non-constant bin widths (buggy but backwards compatible) + v = np.arange(10) + bins = [0, 1, 5, 9, 10] + a, b = histogram(v, bins, normed=True) + area = sum(a * diff(b)) + assert_almost_equal(area, 1) + + def test_density(self): + # Check that the integral of the density equals 1. + n = 100 + v = rand(n) + a, b = histogram(v, density=True) + area = sum(a * diff(b)) + assert_almost_equal(area, 1) + # Check with non-constant bin widths v = np.arange(10) bins = [0,1,3,6,10] - a, b = histogram(v, bins, normed=True) + a, b = histogram(v, bins, density=True) assert_array_equal(a, .1) assert_equal(sum(a*diff(b)), 1) @@ -578,14 +595,13 @@ def test_normed(self): # infinities. v = np.arange(10) bins = [0,1,3,6,np.inf] - a, b = histogram(v, bins, normed=True) + a, b = histogram(v, bins, density=True) assert_array_equal(a, [.1,.1,.1,0.]) # Taken from a bug report from N. Becker on the numpy-discussion # mailing list Aug. 6, 2010. - counts, dmy = np.histogram([1,2,3,4], [0.5,1.5,np.inf], normed=True) + counts, dmy = np.histogram([1,2,3,4], [0.5,1.5,np.inf], density=True) assert_equal(counts, [.25, 0]) - warnings.filters.pop(0) def test_outliers(self): # Check that outliers are not tallied @@ -648,13 +664,15 @@ def test_weights(self): wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True) assert_array_almost_equal(wa, array([4, 5, 0, 1]) / 10. / 3. 
* 4) - warnings.filterwarnings('ignore', \ - message="\s*This release of NumPy fixes a normalization bug") # Check weights with non-uniform bin widths a,b = histogram(np.arange(9), [0,1,3,6,10], \ - weights=[2,1,1,1,1,1,1,1,1], normed=True) + weights=[2,1,1,1,1,1,1,1,1], density=True) assert_almost_equal(a, [.2, .1, .1, .075]) - warnings.filters.pop(0) + + def test_empty(self): + a, b = histogram([], bins=([0,1])) + assert_array_equal(a, array([0])) + assert_array_equal(b, array([0, 1])) class TestHistogramdd(TestCase): @@ -729,6 +747,31 @@ def test_identical_samples(self): hist, edges = histogramdd(x, bins=2) assert_array_equal(edges[0], array([-0.5, 0. , 0.5])) + def test_empty(self): + a, b = histogramdd([[], []], bins=([0,1], [0,1])) + assert_array_max_ulp(a, array([ 0., 0.])) + + def test_bins_errors(self): + """There are two ways to specify bins. Check for the right errors when + mixing those.""" + x = np.arange(8).reshape(2, 4) + assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5]) + assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1]) + assert_raises(ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]]) + assert_raises(ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]]) + assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]])) + + def test_inf_edges(self): + """Test using +/-inf bin edges works. 
See #1788.""" + x = np.arange(6).reshape(3, 2) + expected = np.array([[1, 0], [0, 1], [0, 1]]) + h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]]) + assert_allclose(h, expected) + h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])]) + assert_allclose(h, expected) + h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]]) + assert_allclose(h, expected) + class TestUnique(TestCase): def test_simple(self): @@ -901,6 +944,20 @@ def test_ddof(self): assert_almost_equal(corrcoef(self.A, ddof=-1), self.res1) assert_almost_equal(corrcoef(self.A, self.B, ddof=-1), self.res2) + def test_empty(self): + assert_equal(corrcoef(np.array([])).size, 0) + assert_equal(corrcoef(np.array([]).reshape(0, 2)).shape, (0, 2)) + + +class TestCov(TestCase): + def test_basic(self): + x = np.array([[0, 2], [1, 1], [2, 0]]).T + assert_allclose(np.cov(x), np.array([[ 1.,-1.], [-1.,1.]])) + + def test_empty(self): + assert_equal(cov(np.array([])).size, 0) + assert_equal(cov(np.array([]).reshape(0, 2)).shape, (0, 2)) + class Test_i0(TestCase): def test_simple(self): diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 5001f6bac3f8..cb42b66a7919 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -1,7 +1,8 @@ import numpy as np import numpy.ma as ma -from numpy.ma.testutils import * -from numpy.testing import assert_warns +from numpy.ma.testutils import (TestCase, assert_equal, assert_array_equal, + assert_raises, run_module_suite) +from numpy.testing import assert_warns, assert_ import sys @@ -392,7 +393,12 @@ def test_3d_shaped_dtype(self): def test_empty_file(self): c = StringIO() - assert_raises(IOError, np.loadtxt, c) + x = np.loadtxt(c) + assert_equal(x.shape, (0,)) + x = np.loadtxt(c, dtype=np.int64) + assert_equal(x.shape, (0,)) + assert_(x.dtype == np.int64) + def test_unused_converter(self): c = StringIO() @@ -411,10 +417,8 @@ def test_dtype_with_object(self): "Test using an explicit dtype with an object" from datetime import date 
import time - data = """ - 1; 2001-01-01 - 2; 2002-01-31 - """ + data = asbytes(""" 1; 2001-01-01 + 2; 2002-01-31 """) ndtype = [('idx', int), ('code', np.object)] func = lambda s: strptime(s.strip(), "%Y-%m-%d") converters = {1: func} @@ -424,6 +428,22 @@ def test_dtype_with_object(self): dtype=ndtype) assert_equal(test, control) + def test_uint64_type(self): + tgt = (9223372043271415339, 9223372043271415853) + c = StringIO() + c.write(asbytes("%s %s" % tgt)) + c.seek(0) + res = np.loadtxt(c, dtype=np.uint64) + assert_equal(res, tgt) + + def test_int64_type(self): + tgt = (-9223372036854775807, 9223372036854775807) + c = StringIO() + c.write(asbytes("%s %s" % tgt)) + c.seek(0) + res = np.loadtxt(c, dtype=np.int64) + assert_equal(res, tgt) + def test_universal_newline(self): f, name = mkstemp() os.write(f, asbytes('1 21\r3 42\r')) @@ -435,6 +455,71 @@ def test_universal_newline(self): finally: os.unlink(name) + def test_empty_field_after_tab(self): + c = StringIO() + c.write(asbytes('1 \t2 \t3\tstart \n4\t5\t6\t \n7\t8\t9.5\t')) + c.seek(0) + dt = { 'names': ('x', 'y', 'z', 'comment'), + 'formats': ('=3.2, see PEP 3149. 
- if 'SOABI' in sysconfig.get_config_vars(): - so = so.replace('.'+sysconfig.get_config_var('SOABI'), '', 1) - + so = get_shared_lib_extension(is_python_ext=True) cdll = load_library('multiarray%s' % so, np.core.multiarray.__file__) except ImportError: diff --git a/pavement.py b/pavement.py index ff0eabc74b53..5e49bad1876d 100644 --- a/pavement.py +++ b/pavement.py @@ -77,7 +77,18 @@ sys.path.insert(0, os.path.dirname(__file__)) try: setup_py = __import__("setup") - FULLVERSION = setup_py.FULLVERSION + FULLVERSION = setup_py.VERSION + # This is duplicated from setup.py + if os.path.exists('.git'): + GIT_REVISION = setup_py.git_version() + elif os.path.exists('numpy/version.py'): + # must be a source distribution, use existing version file + from numpy.version import git_revision as GIT_REVISION + else: + GIT_REVISION = "Unknown" + + if not setup_py.ISRELEASED: + FULLVERSION += '.dev-' + GIT_REVISION[:7] finally: sys.path.pop(0) @@ -87,11 +98,11 @@ #----------------------------------- # Source of the release notes -RELEASE_NOTES = 'doc/release/2.0.0-notes.rst' +RELEASE_NOTES = 'doc/release/1.6.0-notes.rst' # Start/end of the log (from git) -LOG_START = 'svn/tags/1.5.0' -LOG_END = 'master' +LOG_START = 'v1.5.0' +LOG_END = 'v1.6.1' #------------------------------------------------------- @@ -387,8 +398,22 @@ def build_pdf(): #------------------ # Mac OS X targets #------------------ -def dmg_name(fullversion, pyver): - return "numpy-%s-py%s-python.org.dmg" % (fullversion, pyver) +def dmg_name(fullversion, pyver, osxver=None): + """Return name for dmg installer. + + Notes + ----- + Python 2.7 has two binaries, one for 10.3 (ppc, i386) and one for 10.6 + (i386, x86_64). All other Python versions at python.org at the moment + have binaries for 10.3 only. The "macosx%s" part of the dmg name should + correspond to the python.org naming scheme. + """ + # assume that for the py2.7/osx10.6 build the deployment target is set + # (should be done in the release script). 
+ if not osxver: + osxver = os.environ.get('MACOSX_DEPLOYMENT_TARGET', '10.3') + return "numpy-%s-py%s-python.org-macosx%s.dmg" % (fullversion, pyver, + osxver) def macosx_version(): if not sys.platform == 'darwin': @@ -474,6 +499,7 @@ def dmg(options): ref = os.path.join(options.doc.destdir_pdf, "reference.pdf") user = os.path.join(options.doc.destdir_pdf, "userguide.pdf") if (not os.path.exists(ref)) or (not os.path.exists(user)): + import warnings warnings.warn("Docs need to be built first! Can't find them.") # Build the mpkg package @@ -568,7 +594,7 @@ def write_release_task(options, filename='NOTES.txt'): def write_log_task(options, filename='Changelog'): st = subprocess.Popen( - ['git', 'svn', 'log', '%s..%s' % (LOG_START, LOG_END)], + ['git', 'log', '%s..%s' % (LOG_START, LOG_END)], stdout=subprocess.PIPE) out = st.communicate()[0] diff --git a/release.sh b/release.sh index bb5a375db985..5f1f31ebbef9 100644 --- a/release.sh +++ b/release.sh @@ -5,6 +5,17 @@ # downloads, i.e. two versions for Python 2.7. The Intel 32/64-bit version is # for OS X 10.6+, the other dmg installers are for 10.3+ and are built on 10.5 +# Check we're using the correct g++/c++ for the 32-bit 2.6 version we build for +# the docs and the 64-bit 2.7 dmg installer. +# We do this because for Python 2.6 we use a symlink on the PATH to select +# /usr/bin/g++-4.0, while for Python 2.7 we need the default 4.2 version. 
+export PATH=~/Code/tmp/gpp40temp/:$PATH +gpp="$(g++ --version | grep "4.0")" +if [ -z "$gpp" ]; then + echo "Wrong g++ version, we need 4.0 to compile scipy with Python 2.6" + exit 1 +fi + # bootstrap needed to ensure we build the docs from the right scipy version paver bootstrap source bootstrap/bin/activate @@ -19,6 +30,14 @@ paver pdf paver sdist export MACOSX_DEPLOYMENT_TARGET=10.6 +# Use GCC 4.2 for 64-bit OS X installer for Python 2.7 +export PATH=~/Code/tmp/gpp42temp/:$PATH +gpp="$(g++ --version | grep "4.2")" +if [ -z "$gpp" ]; then + echo "Wrong g++ version, we need 4.2 for 64-bit binary for Python 2.7" + exit 1 +fi + paver dmg -p 2.7 # 32/64-bit version paver bdist_superpack -p 3.2 diff --git a/setup.py b/setup.py index 318633bebc61..1efb2fed9ba6 100755 --- a/setup.py +++ b/setup.py @@ -26,6 +26,21 @@ else: import builtins +#DISTNUMPY +#If CC is not set we use mpicc for compiling. +if 'CC' not in os.environ or len(os.environ['CC']) > 0: + os.environ['CC'] = 'mpicc' +#If LDSHARED is not set we use mpicc for linking. +if 'LDSHARED' not in os.environ or len(os.environ['LDSHARED']) > 0: + os.environ['LDSHARED'] = 'mpicc' + flags = '' + try: + flags = os.environ['LDFLAGS'] + except KeyError: + pass + os.environ['LDFLAGS'] = ' -shared ' + flags + + CLASSIFIERS = """\ Development Status :: 5 - Production/Stable Intended Audience :: Science/Research @@ -56,8 +71,8 @@ PLATFORMS = ["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"] MAJOR = 1 MINOR = 6 -MICRO = 0 -ISRELEASED = False +MICRO = 1 +ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) # Return the git revision as a string @@ -94,19 +109,6 @@ def _minimal_ext_cmd(cmd): # a lot more robust than what was previously being used. builtins.__NUMPY_SETUP__ = True -# Construct full version info. Needs to be in setup.py namespace, otherwise it -# can't be accessed from pavement.py at build time. 
-FULLVERSION = VERSION -if not ISRELEASED: - if os.path.exists('.git'): - GIT_REVISION = git_version() - elif os.path.exists('numpy/version.py'): - # must be a source distribution, use existing version file - from numpy.version import git_revision as GIT_REVISION - else: - GIT_REVISION = "Unknown" - - FULLVERSION += '.dev-' + GIT_REVISION[:7] def write_version_py(filename='numpy/version.py'): cnt = """ @@ -120,6 +122,20 @@ def write_version_py(filename='numpy/version.py'): if not release: version = full_version """ + # Adding the git rev number needs to be done inside write_version_py(), + # otherwise the import of numpy.version messes up the build under Python 3. + FULLVERSION = VERSION + if os.path.exists('.git'): + GIT_REVISION = git_version() + elif os.path.exists('numpy/version.py'): + # must be a source distribution, use existing version file + from numpy.version import git_revision as GIT_REVISION + else: + GIT_REVISION = "Unknown" + + if not ISRELEASED: + FULLVERSION += '.dev-' + GIT_REVISION[:7] + a = open(filename, 'w') try: a.write(cnt % {'version': VERSION, @@ -140,12 +156,6 @@ def configuration(parent_package='',top_path=None): config.add_subpackage('numpy') - # we want these files also in binaries/installed files, so it belongs here - # instead of in Manifest.in - config.add_data_files(('doc/cython/'), - ('doc/pyrex/'), - ('doc/swig/')) - config.get_version('numpy/version.py') # sets config.version return config diff --git a/tools/py3tool.py b/tools/py3tool.py index bb0a66b30b3b..5be7d6c8e29d 100755 --- a/tools/py3tool.py +++ b/tools/py3tool.py @@ -268,7 +268,7 @@ def sync_2to3(src, dst, patchfile=None, clean=False): _old_stdout = sys.stdout try: sys.stdout = StringIO() - lib2to3.main.main("lib2to3.fixes", ['-w'] + flags.split()+filenames) + lib2to3.main.main("lib2to3.fixes", ['-w', '-n'] + flags.split()+filenames) finally: sys.stdout = _old_stdout