numpy · ahaldane · Feb 21, 2021 · Feb 21, 2021 · eric-wieser · May 9, 2017
diff --git a/doc/release/upcoming_changes/7798.improvement.rst b/doc/release/upcoming_changes/7798.improvement.rst
@@ -0,0 +1,16 @@
+Trailing Padding now supported in PEP3118 buffer interface
+----------------------------------------------------------
+Previously, structured types with trailing padding such as
+``np.dtype({'formats': ['i1'], 'names': ['a'], 'itemsize': 4})`` could not
+roundtrip through the PEP3118 interface using a memoryview, as in
+``a == np.array(memoryview(a))``.  Now, such trailing padding is preserved.
+
+More technically, the PEP3118 interface now supports PEP3118 format strings as
+follows: Within "T{}", in aligned @ mode, trailing padding is automatically
+assumed in the same way as C structs and numpy aligned dtypes. Outside of T{},
+trailing padding is not automatically added or assumed in inputs, following
+Python's struct module, but can be added explicitly by padding with "x" or
+unnamed zero-sized trailing elements. 0-sized unnamed elements, like "0i", can
+now be added anywhere in the format string; in @ mode this adds padding bytes
+up to that type's alignment offset, and otherwise they are ignored, as
+described in the Python struct docs.
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
@@ -563,6 +563,9 @@ def _dtype_from_pep3118(spec):
     return dtype
 
 def __dtype_from_pep3118(stream, is_subdtype):
+    # numpy interprets pep3118 formats which include named fields as
+    # structured dtypes, even if not enclosed by "T{}"
+
     field_spec = dict(
         names=[],
         formats=[],
@@ -613,8 +616,7 @@ def __dtype_from_pep3118(stream, is_subdtype):
         is_padding = False
 
         if stream.consume('T{'):
-            value, align = __dtype_from_pep3118(
-                stream, is_subdtype=True)
+            value, align = __dtype_from_pep3118(stream, is_subdtype=True)
         elif stream.next in type_map_chars:
             if stream.next == 'Z':
                 typechar = stream.advance(2)
@@ -638,12 +640,10 @@ def __dtype_from_pep3118(stream, is_subdtype):
         else:
             raise ValueError("Unknown PEP 3118 data type specifier %r" % stream.s)
 
-        #
         # Native alignment may require padding
         #
         # Here we assume that the presence of a '@' character implicitly implies
         # that the start of the array is *already* aligned.
-        #
         extra_offset = 0
         if stream.byteorder == '@':
             start_padding = (-offset) % align
@@ -663,6 +663,19 @@ def __dtype_from_pep3118(stream, is_subdtype):
             # Update common alignment
             common_alignment = _lcm(align, common_alignment)
 
+        # Field name
+        if stream.consume(':'):
+            name = stream.consume_until(':')
+        else:
+            name = None
+
+        # struct docs explicitly say that repeat-0 elements are for padding or
+        # alignment. We further take this to apply only to unnamed fields.
+        if name is None and itemsize == 0:
+            offset += extra_offset
+            field_spec['itemsize'] = offset
+            continue
+
         # Convert itemsize to sub-array
         if itemsize != 1:
             value = dtype((value, (itemsize,)))
@@ -671,12 +684,6 @@ def __dtype_from_pep3118(stream, is_subdtype):
         if shape is not None:
             value = dtype((value, shape))
 
-        # Field name
-        if stream.consume(':'):
-            name = stream.consume_until(':')
-        else:
-            name = None
-
         if not (is_padding and name is None):
             if name is not None and name in field_spec['names']:
                 raise RuntimeError(f"Duplicate field name '{name}' in PEP3118 format")
@@ -689,8 +696,17 @@ def __dtype_from_pep3118(stream, is_subdtype):
 
         field_spec['itemsize'] = offset
 
-    # extra final padding for aligned types
-    if stream.byteorder == '@':
+    # extra final padding for aligned types:
+    # Inside of T{}, if in aligned mode, we add trailing padding like in a
+    # C struct so the end of the struct is aligned.
+    # Note that this behavior is *not* described by the PEP3118 spec, which
+    # does not say anything about T{} trailing padding. Note also that the Py
+    # struct docs say that trailing padding should *not* be implicitly added
+    # when outside of T{}, and the user should explicitly add a 0-sized
+    # trailing field to add padding, however struct does not implement T{}. So,
+    # here numpy is taking the initiative to specify how trailing padding works
+    # inside T{}, while we mimic struct outside of T{}.
+    if is_subdtype and stream.byteorder == '@':
         field_spec['itemsize'] += (-offset) % common_alignment
 
     # Check if this was a simple 1-item type, and unwrap it

diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
@@ -88,6 +88,19 @@ _append_str(_tmp_string_t *s, char const *p)
     return 0;
 }
 
+static int
+_append_int(_tmp_string_t *s, unsigned int n)
+{
+    /* Append n in decimal to s; even a 64-bit value needs at most 20 digits */
+    char buf[32];
+    int nw = snprintf(buf, sizeof(buf), "%u", n);
+    if (nw < 0 || nw >= (int)sizeof(buf)) {
+        return -1;
+    }
+    return _append_str(s, buf);
+}
+
+
 /*
  * Append a PEP3118-formatted field name, ":name:", to str
  */
@@ -282,9 +295,15 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
                 );
                 return -1;
             }
-            while (*offset < new_offset) {
+            /* add padding bytes: repeat-count plus 'x' */
+            if (*offset < new_offset) {
+                if (new_offset - (*offset) > 1) {
+                    if (_append_int(str, new_offset - (*offset)) < 0) {
+                        return -1;
+                    }
+                }
                 if (_append_char(str, 'x') < 0) return -1;
-                ++*offset;
+                *offset = new_offset;
             }
 
             /* Insert child item */
@@ -297,6 +316,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
             /* Insert field name */
             if (_append_field_name(str, name) < 0) return -1;
         }
+
+        /* Add any trailing padding */
+        if (*offset < descr->elsize) {
+            if (descr->elsize - (*offset) > 1) {
+                if (_append_int(str, descr->elsize - (*offset)) < 0) return -1;
+            }
+            if (_append_char(str, 'x') < 0) return -1;
+            *offset = descr->elsize;
+        }
+
         if (_append_char(str, '}') < 0) return -1;
     }
     else {

diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
@@ -7023,27 +7023,72 @@ def test_native_padding_2(self):
         self._check('^x3T{xi}', {'f0': (({'f0': ('i', 1)}, (3,)), 1)})
 
     def test_trailing_padding(self):
-        # Trailing padding should be included, *and*, the item size
-        # should match the alignment if in aligned mode
         align = np.dtype('i').alignment
         size = np.dtype('i').itemsize
 
         def aligned(n):
             return align*(1 + (n-1)//align)
 
         base = dict(formats=['i'], names=['f0'])
-
-        self._check('ix',    dict(itemsize=aligned(size + 1), **base))
-        self._check('ixx',   dict(itemsize=aligned(size + 2), **base))
-        self._check('ixxx',  dict(itemsize=aligned(size + 3), **base))
-        self._check('ixxxx', dict(itemsize=aligned(size + 4), **base))
-        self._check('i7x',   dict(itemsize=aligned(size + 7), **base))
+        bbase = dict(formats=['b'], names=['f0'])
+
+        self._check('ix',    dict(itemsize=size + 1, **base))
+        self._check('ixx',   dict(itemsize=size + 2, **base))
+        self._check('ixxx',  dict(itemsize=size + 3, **base))
+        self._check('ixxxx', dict(itemsize=size + 4, **base))
+        self._check('i7x',   dict(itemsize=size + 7, **base))
+        self._check('ix0i',  dict(itemsize=2*size, **base))
+        self._check('b0i',   dict(itemsize=size, **bbase))
+
+        # Our interpretation of the PEP3118/struct spec is that trailing
+        # padding for alignment is assumed only inside of T{}.
+        self._check('T{ix}',    dict(itemsize=aligned(size + 1), **base))
+        self._check('T{ixx}',   dict(itemsize=aligned(size + 2), **base))
+        self._check('T{ixxx}',  dict(itemsize=aligned(size + 3), **base))
+        self._check('T{ixxxx}', dict(itemsize=aligned(size + 4), **base))
+        self._check('T{i7x}',   dict(itemsize=aligned(size + 7), **base))
+        self._check('T{ix0i}',  dict(itemsize=2*size, **base))
+        self._check('T{b0i}',   dict(itemsize=size, **bbase))
+
+        # check that alignment mode affects assumed trailing padding in T{}
+        self._check('T{=ix}',   dict(itemsize=size + 1, **base))
 
         self._check('^ix',    dict(itemsize=size + 1, **base))
         self._check('^ixx',   dict(itemsize=size + 2, **base))
         self._check('^ixxx',  dict(itemsize=size + 3, **base))
         self._check('^ixxxx', dict(itemsize=size + 4, **base))
         self._check('^i7x',   dict(itemsize=size + 7, **base))
+        self._check('^ixx0i', dict(itemsize=size + 2, **base))
+        self._check('^b0i', np.dtype('b'))
+
+        # check we can convert to memoryview and back, aligned and unaligned
+        arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=True))
+        assert_equal(arr.dtype, np.array(memoryview(arr)).dtype)
+
+        arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=False))
+        assert_equal(arr.dtype, np.array(memoryview(arr)).dtype)
+
+        a = np.empty(0,  np.dtype({'formats': ['u1'], 'offsets': [0],
+                                   'names': ['x'], 'itemsize': 4}))
+        assert_equal(a, np.array(memoryview(a)))
+
+        # check that 0-sized elements act as padding in @ alignment and not =
+        # outside of T{} (see python struct docs, example at very end)
+        self._check('B:f0:B:f1:', [('f0', 'u1'), ('f1', 'u1')])
+        self._check('B:f0:0iB:f1:0i', {'names': ['f0','f1'],
+                                       'formats': ['u1','u1'],
+                                       'offsets': [0,4],
+                                       'itemsize': 8})
+        self._check('=B:f0:0iB:f1:0i', [('f0', 'u1'), ('f1', 'u1')])
+
+        # PEP3118 cannot support overlapping/out-of-order fields
+        # (update these tests if it is improved to allow this)
+        a = np.empty(3, dtype={'names': ['a', 'b'],
+                               'formats': ['i4', 'i2'],
+                               'offsets': [0, 2]})
+        assert_raises(ValueError, memoryview, a)
+        a = np.empty(3, dtype='i4,i4')[['f1', 'f0']]
+        assert_raises(ValueError, memoryview, a)
 
     def test_native_padding_3(self):
         dt = np.dtype(
@@ -7075,15 +7120,9 @@ def test_intra_padding(self):
         align = np.dtype('i').alignment
         size = np.dtype('i').itemsize
 
-        def aligned(n):
-            return (align*(1 + (n-1)//align))
-
-        self._check('(3)T{ix}', (dict(
-            names=['f0'],
-            formats=['i'],
-            offsets=[0],
-            itemsize=aligned(size + 1)
-        ), (3,)))
+        expected_dtype = {'names': ['f0'], 'formats': ['i'],
+                          'itemsize': np.dtype('i,V1', align=True).itemsize}
+        self._check('(3)T{ix}', (expected_dtype, (3,)))
 
     def test_char_vs_string(self):
         dt = np.dtype('c')