From 08734b1ab72f67c4e5ee5df8f3c8e41edd10e2be Mon Sep 17 00:00:00 2001
From: Allan Haldane <allan.haldane@gmail.com>
Date: Sat, 20 Feb 2021 21:11:14 -0500
Subject: [PATCH 1/2] ENH: output trailing padding in PEP3118 format strings

Fixes #7797
---
 numpy/core/_internal.py             | 13 ++++++++--
 numpy/core/src/multiarray/buffer.c  | 40 ++++++++++++++++++++++++-----
 numpy/core/tests/test_multiarray.py | 37 ++++++++++++++------------
 3 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 449926f586ac..261b2fe0f10b 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -689,8 +689,17 @@ def __dtype_from_pep3118(stream, is_subdtype):
 
         field_spec['itemsize'] = offset
 
-    # extra final padding for aligned types
-    if stream.byteorder == '@':
+    # extra final padding for aligned types:
+    # Inside of T{}, if in aligned mode, we add trailing padding like in a
+    # C struct so the end of the struct is aligned.
+    # Note that this behavior is *not* described by the PEP3118 spec, which
+    # does not say anything about T{} trailing padding. Note also that the Py
+    # struct docs say that trailing padding should *not* be implicitly added
+    # when outside of T{}, and the user should explicitly add a 0-sized
+    # trailing field to add padding, however struct does not implement T{}. So,
+    # here numpy is taking the initiative to specify how trailing padding works
+    # inside T{}, while we mimic struct outside of T{}.
+    if is_subdtype and stream.byteorder == '@':
         field_spec['itemsize'] += (-offset) % common_alignment
 
     # Check if this was a simple 1-item type, and unwrap it
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 813850224714..e99f0580b0a3 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -88,6 +88,19 @@ _append_str(_tmp_string_t *s, char const *p)
     return 0;
 }
 
+static int
+_append_int(_tmp_string_t *s, unsigned int n)
+{
+    /* Append n, formatted in decimal, to s. Returns -1 if formatting
+     * fails, otherwise the result of _append_str. buf is automatic, not
+     * static, so calls stay reentrant; 32 bytes is ample for "%u". */
+    char buf[32];
+    int nw = snprintf(buf, sizeof(buf), "%u", n);
+    if (nw < 0 || (size_t)nw >= sizeof(buf)) return -1;
+    return _append_str(s, buf);
+}
+
+
 /*
  * Append a PEP3118-formatted field name, ":name:", to str
  */
@@ -276,15 +289,20 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
             /* Insert padding manually */
             if (*offset > new_offset) {
                 PyErr_SetString(
-                    PyExc_ValueError,
-                    "dtypes with overlapping or out-of-order fields are not "
-                    "representable as buffers. Consider reordering the fields."
-                );
+                    PyExc_ValueError, "The buffer interface does not support "
+                                      "overlapping fields or out-of-order "
+                                      "fields");
                 return -1;
             }
-            while (*offset < new_offset) {
+            /* add padding bytes: repeat-count plus 'x' */
+            if (*offset < new_offset) {
+                if (new_offset - (*offset) > 1) {
+                    if (_append_int(str, new_offset - (*offset)) < 0) {
+                        return -1;
+                    }
+                }
                 if (_append_char(str, 'x') < 0) return -1;
-                ++*offset;
+                *offset = new_offset;
             }
 
             /* Insert child item */
@@ -297,6 +315,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
             /* Insert field name */
             if (_append_field_name(str, name) < 0) return -1;
         }
+
+        /* Add any trailing padding */
+        if (*offset < descr->elsize) {
+            if (descr->elsize - (*offset) > 1) {
+                if (_append_int(str, descr->elsize - (*offset)) < 0) return -1;
+            }
+            if (_append_char(str, 'x') < 0) return -1;
+            *offset = descr->elsize;
+        }
+
         if (_append_char(str, '}') < 0) return -1;
     }
     else {
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b30fcb812aa5..fe49299227b1 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -7023,8 +7023,6 @@ def test_native_padding_2(self):
         self._check('^x3T{xi}', {'f0': (({'f0': ('i', 1)}, (3,)), 1)})
 
     def test_trailing_padding(self):
-        # Trailing padding should be included, *and*, the item size
-        # should match the alignment if in aligned mode
         align = np.dtype('i').alignment
         size = np.dtype('i').itemsize
 
@@ -7033,11 +7031,17 @@ def aligned(n):
 
         base = dict(formats=['i'], names=['f0'])
 
-        self._check('ix',    dict(itemsize=aligned(size + 1), **base))
-        self._check('ixx',   dict(itemsize=aligned(size + 2), **base))
-        self._check('ixxx',  dict(itemsize=aligned(size + 3), **base))
-        self._check('ixxxx', dict(itemsize=aligned(size + 4), **base))
-        self._check('i7x',   dict(itemsize=aligned(size + 7), **base))
+        self._check('ix',    dict(itemsize=size + 1, **base))
+        self._check('ixx',   dict(itemsize=size + 2, **base))
+        self._check('ixxx',  dict(itemsize=size + 3, **base))
+        self._check('ixxxx', dict(itemsize=size + 4, **base))
+        self._check('i7x',   dict(itemsize=size + 7, **base))
+
+        self._check('T{i:f0:x}',    dict(itemsize=aligned(size + 1), **base))
+        self._check('T{i:f0:xx}',   dict(itemsize=aligned(size + 2), **base))
+        self._check('T{i:f0:xxx}',  dict(itemsize=aligned(size + 3), **base))
+        self._check('T{i:f0:xxxx}', dict(itemsize=aligned(size + 4), **base))
+        self._check('T{i:f0:7x}',   dict(itemsize=aligned(size + 7), **base))
 
         self._check('^ix',    dict(itemsize=size + 1, **base))
         self._check('^ixx',   dict(itemsize=size + 2, **base))
@@ -7045,6 +7049,13 @@ def aligned(n):
         self._check('^ixxxx', dict(itemsize=size + 4, **base))
         self._check('^i7x',   dict(itemsize=size + 7, **base))
 
+        # check we can convert to memoryview and back, aligned and unaligned
+        arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=True))
+        assert_equal(arr.dtype, np.array(memoryview(arr)).dtype)
+
+        arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=False))
+        assert_equal(arr.dtype, np.array(memoryview(arr)).dtype)
+
     def test_native_padding_3(self):
         dt = np.dtype(
                 [('a', 'b'), ('b', 'i'),
@@ -7075,15 +7086,9 @@ def test_intra_padding(self):
         align = np.dtype('i').alignment
         size = np.dtype('i').itemsize
 
-        def aligned(n):
-            return (align*(1 + (n-1)//align))
-
-        self._check('(3)T{ix}', (dict(
-            names=['f0'],
-            formats=['i'],
-            offsets=[0],
-            itemsize=aligned(size + 1)
-        ), (3,)))
+        expected_dtype = {'names': ['f0'], 'formats': ['i'],
+                          'itemsize': np.dtype('i,V1', align=True).itemsize}
+        self._check('(3)T{ix}', (expected_dtype, (3,)))
 
     def test_char_vs_string(self):
         dt = np.dtype('c')

From dfc8ec7789ff5115c12f95cc235f9b209eadc47b Mon Sep 17 00:00:00 2001
From: Allan Haldane <allan.haldane@gmail.com>
Date: Sun, 21 Feb 2021 09:55:44 -0500
Subject: [PATCH 2/2] ENH: allow 0-sized elements in PEP3118 format strings to
 align

---
 .../upcoming_changes/7798.improvement.rst     | 16 +++++++
 numpy/core/_internal.py                       | 27 +++++++----
 numpy/core/src/multiarray/buffer.c            |  7 +--
 numpy/core/tests/test_multiarray.py           | 46 ++++++++++++++++---
 4 files changed, 77 insertions(+), 19 deletions(-)
 create mode 100644 doc/release/upcoming_changes/7798.improvement.rst

diff --git a/doc/release/upcoming_changes/7798.improvement.rst b/doc/release/upcoming_changes/7798.improvement.rst
new file mode 100644
index 000000000000..46caca10d5e2
--- /dev/null
+++ b/doc/release/upcoming_changes/7798.improvement.rst
@@ -0,0 +1,16 @@
+Trailing Padding now supported in PEP3118 buffer interface
+----------------------------------------------------------
+Previously, structured types with trailing padding such as
+``np.dtype({'formats': ['i1'], 'names': ['a'], 'itemsize': 4})`` could not
+roundtrip through the PEP3118 interface using a memoryview, as in
+``a == np.array(memoryview(a))``.  Now, such trailing padding is preserved.
+
+More technically, the PEP3118 interface now supports PEP3118 format strings as
+follows: Within "T{}", in aligned @ mode, trailing padding is automatically
+assumed in the same way as C structs and numpy aligned dtypes. Outside of T{}
+trailing padding is not automatically added or assumed in inputs, following
+python's struct module, but is explicitly added by padding with "x" or unnamed
+zero-sized trailing elements. 0-sized unnamed elements, like "0i", can now be
+added anywhere in the format string, and in @ mode this will add padding bytes
+up to that type's alignment offset, and otherwise is ignored, as described in
+the python struct docs.
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 261b2fe0f10b..4196bc02203e 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -563,6 +563,9 @@ def _dtype_from_pep3118(spec):
     return dtype
 
 def __dtype_from_pep3118(stream, is_subdtype):
+    # numpy interprets pep3118 formats which include named fields as
+    # structured dtypes, even if not enclosed by "T{}"
+
     field_spec = dict(
         names=[],
         formats=[],
@@ -613,8 +616,7 @@ def __dtype_from_pep3118(stream, is_subdtype):
         is_padding = False
 
         if stream.consume('T{'):
-            value, align = __dtype_from_pep3118(
-                stream, is_subdtype=True)
+            value, align = __dtype_from_pep3118(stream, is_subdtype=True)
         elif stream.next in type_map_chars:
             if stream.next == 'Z':
                 typechar = stream.advance(2)
@@ -638,12 +640,10 @@ def __dtype_from_pep3118(stream, is_subdtype):
         else:
             raise ValueError("Unknown PEP 3118 data type specifier %r" % stream.s)
 
-        #
         # Native alignment may require padding
         #
         # Here we assume that the presence of a '@' character implicitly implies
         # that the start of the array is *already* aligned.
-        #
         extra_offset = 0
         if stream.byteorder == '@':
             start_padding = (-offset) % align
@@ -663,6 +663,19 @@ def __dtype_from_pep3118(stream, is_subdtype):
             # Update common alignment
             common_alignment = _lcm(align, common_alignment)
 
+        # Field name
+        if stream.consume(':'):
+            name = stream.consume_until(':')
+        else:
+            name = None
+
+        # struct docs explicitly say that repeat-0 elements are for padding or
+        # alignment. We further interpret this applies only to unnamed fields
+        if name is None and itemsize == 0:
+            offset += extra_offset
+            field_spec['itemsize'] = offset
+            continue
+
         # Convert itemsize to sub-array
         if itemsize != 1:
             value = dtype((value, (itemsize,)))
@@ -671,12 +684,6 @@ def __dtype_from_pep3118(stream, is_subdtype):
         if shape is not None:
             value = dtype((value, shape))
 
-        # Field name
-        if stream.consume(':'):
-            name = stream.consume_until(':')
-        else:
-            name = None
-
         if not (is_padding and name is None):
             if name is not None and name in field_spec['names']:
                 raise RuntimeError(f"Duplicate field name '{name}' in PEP3118 format")
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index e99f0580b0a3..bb6410656990 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -289,9 +289,10 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
             /* Insert padding manually */
             if (*offset > new_offset) {
                 PyErr_SetString(
-                    PyExc_ValueError, "The buffer interface does not support "
-                                      "overlapping fields or out-of-order "
-                                      "fields");
+                    PyExc_ValueError,
+                    "dtypes with overlapping or out-of-order fields are not "
+                    "representable as buffers. Consider reordering the fields."
+                );
                 return -1;
             }
             /* add padding bytes: repeat-count plus 'x' */
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index fe49299227b1..df0db2371941 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -7030,24 +7030,36 @@ def aligned(n):
             return align*(1 + (n-1)//align)
 
         base = dict(formats=['i'], names=['f0'])
+        bbase = dict(formats=['b'], names=['f0'])
 
         self._check('ix',    dict(itemsize=size + 1, **base))
         self._check('ixx',   dict(itemsize=size + 2, **base))
         self._check('ixxx',  dict(itemsize=size + 3, **base))
         self._check('ixxxx', dict(itemsize=size + 4, **base))
         self._check('i7x',   dict(itemsize=size + 7, **base))
-
-        self._check('T{i:f0:x}',    dict(itemsize=aligned(size + 1), **base))
-        self._check('T{i:f0:xx}',   dict(itemsize=aligned(size + 2), **base))
-        self._check('T{i:f0:xxx}',  dict(itemsize=aligned(size + 3), **base))
-        self._check('T{i:f0:xxxx}', dict(itemsize=aligned(size + 4), **base))
-        self._check('T{i:f0:7x}',   dict(itemsize=aligned(size + 7), **base))
+        self._check('ix0i',  dict(itemsize=2*size, **base))
+        self._check('b0i',   dict(itemsize=size, **bbase))
+
+        # Our interpretation of the PEP3118/struct spec is that trailing
+        # padding for alignment is assumed only inside of T{}.
+        self._check('T{ix}',    dict(itemsize=aligned(size + 1), **base))
+        self._check('T{ixx}',   dict(itemsize=aligned(size + 2), **base))
+        self._check('T{ixxx}',  dict(itemsize=aligned(size + 3), **base))
+        self._check('T{ixxxx}', dict(itemsize=aligned(size + 4), **base))
+        self._check('T{i7x}',   dict(itemsize=aligned(size + 7), **base))
+        self._check('T{ix0i}',  dict(itemsize=2*size, **base))
+        self._check('T{b0i}',   dict(itemsize=size, **bbase))
+
+        # check that alignment mode affects assumed trailing padding in T{}
+        self._check('T{=ix}',   dict(itemsize=size + 1, **base))
 
         self._check('^ix',    dict(itemsize=size + 1, **base))
         self._check('^ixx',   dict(itemsize=size + 2, **base))
         self._check('^ixxx',  dict(itemsize=size + 3, **base))
         self._check('^ixxxx', dict(itemsize=size + 4, **base))
         self._check('^i7x',   dict(itemsize=size + 7, **base))
+        self._check('^ixx0i', dict(itemsize=size + 2, **base))
+        self._check('^b0i', np.dtype('b'))
 
         # check we can convert to memoryview and back, aligned and unaligned
         arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=True))
@@ -7056,6 +7068,28 @@ def aligned(n):
         arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=False))
         assert_equal(arr.dtype, np.array(memoryview(arr)).dtype)
 
+        a = np.empty(0,  np.dtype({'formats': ['u1'], 'offsets': [0],
+                                   'names': ['x'], 'itemsize': 4}))
+        assert_equal(a, np.array(memoryview(a)))
+
+        # check that 0-sized elements act as padding in @ alignment and not =
+        # outside of T{} (see python struct docs, example at very end)
+        self._check('B:f0:B:f1:', [('f0', 'u1'), ('f1', 'u1')])
+        self._check('B:f0:0iB:f1:0i', {'names': ['f0','f1'],
+                                       'formats': ['u1','u1'],
+                                       'offsets': [0,4],
+                                       'itemsize': 8})
+        self._check('=B:f0:0iB:f1:0i', [('f0', 'u1'), ('f1', 'u1')])
+
+        # PEP3118 cannot support overlapping/out-of-order fields
+        # (update these tests if it is improved to allow this)
+        a = np.empty(3, dtype={'names': ['a', 'b'],
+                               'formats': ['i4', 'i2'],
+                               'offsets': [0, 2]})
+        assert_raises(ValueError, memoryview, a)
+        a = np.empty(3, dtype='i4,i4')[['f1', 'f0']]
+        assert_raises(ValueError, memoryview, a)
+
     def test_native_padding_3(self):
         dt = np.dtype(
                 [('a', 'b'), ('b', 'i'),