From 08734b1ab72f67c4e5ee5df8f3c8e41edd10e2be Mon Sep 17 00:00:00 2001 From: Allan Haldane Date: Sat, 20 Feb 2021 21:11:14 -0500 Subject: [PATCH 1/2] ENH: output trailing padding in PEP3118 format strings Fixes #7797 --- numpy/core/_internal.py | 13 ++++++++-- numpy/core/src/multiarray/buffer.c | 40 ++++++++++++++++++++++++----- numpy/core/tests/test_multiarray.py | 37 ++++++++++++++------------ 3 files changed, 66 insertions(+), 24 deletions(-) diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py index 449926f586ac..261b2fe0f10b 100644 --- a/numpy/core/_internal.py +++ b/numpy/core/_internal.py @@ -689,8 +689,17 @@ def __dtype_from_pep3118(stream, is_subdtype): field_spec['itemsize'] = offset - # extra final padding for aligned types - if stream.byteorder == '@': + # extra final padding for aligned types: + # Inside of T{}, if in aligned mode, we add trailing padding like in a + # C struct so the end of the struct is aligned. + # Note that this behavior is *not* described by the PEP3118 spec, which + # does not say anything about T{} trailing padding. Note also that the Py + # struct docs say that trailing padding should *not* be implicitly added + # when outside of T{}, and the user should explicitly add a 0-sized + # trailing field to add padding, however struct does not implement T{}. So, + # here numpy is taking the initiative to specify how trailing padding works + # inside T{}, while we mimic struct outside of T{}. 
+ if is_subdtype and stream.byteorder == '@': field_spec['itemsize'] += (-offset) % common_alignment # Check if this was a simple 1-item type, and unwrap it diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 813850224714..e99f0580b0a3 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -88,6 +88,19 @@ _append_str(_tmp_string_t *s, char const *p) return 0; } +static int +_append_int(_tmp_string_t *s, unsigned int n) +{ + /* even on 64bit, int strings are at most 20 bytes; 256 is overkill */ + static char buf[256]; + int nw = snprintf(buf, 256, "%u", n); + if (nw < 0 || nw >= 256) { + return -1; + } + return _append_str(s, buf); +} + + /* * Append a PEP3118-formatted field name, ":name:", to str */ @@ -276,15 +289,20 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, /* Insert padding manually */ if (*offset > new_offset) { PyErr_SetString( - PyExc_ValueError, - "dtypes with overlapping or out-of-order fields are not " - "representable as buffers. Consider reordering the fields." 
- ); + PyExc_ValueError, "The buffer interface does not support " + "overlapping fields or out-of-order " + "fields"); return -1; } - while (*offset < new_offset) { + /* add padding bytes: repeat-count plus 'x' */ + if (*offset < new_offset) { + if (new_offset - (*offset) > 1) { + if (_append_int(str, new_offset - (*offset)) < 0) { + return -1; + } + } if (_append_char(str, 'x') < 0) return -1; - ++*offset; + *offset = new_offset; } /* Insert child item */ @@ -297,6 +315,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, /* Insert field name */ if (_append_field_name(str, name) < 0) return -1; } + + /* Add any trailing padding */ + if (*offset < descr->elsize) { + if (descr->elsize - (*offset) > 1) { + if (_append_int(str, descr->elsize - (*offset)) < 0) return -1; + } + if (_append_char(str, 'x') < 0) return -1; + *offset = descr->elsize; + } + if (_append_char(str, '}') < 0) return -1; } else { diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index b30fcb812aa5..fe49299227b1 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -7023,8 +7023,6 @@ def test_native_padding_2(self): self._check('^x3T{xi}', {'f0': (({'f0': ('i', 1)}, (3,)), 1)}) def test_trailing_padding(self): - # Trailing padding should be included, *and*, the item size - # should match the alignment if in aligned mode align = np.dtype('i').alignment size = np.dtype('i').itemsize @@ -7033,11 +7031,17 @@ def aligned(n): base = dict(formats=['i'], names=['f0']) - self._check('ix', dict(itemsize=aligned(size + 1), **base)) - self._check('ixx', dict(itemsize=aligned(size + 2), **base)) - self._check('ixxx', dict(itemsize=aligned(size + 3), **base)) - self._check('ixxxx', dict(itemsize=aligned(size + 4), **base)) - self._check('i7x', dict(itemsize=aligned(size + 7), **base)) + self._check('ix', dict(itemsize=size + 1, **base)) + self._check('ixx', dict(itemsize=size + 2, **base)) + self._check('ixxx', 
dict(itemsize=size + 3, **base)) + self._check('ixxxx', dict(itemsize=size + 4, **base)) + self._check('i7x', dict(itemsize=size + 7, **base)) + + self._check('T{i:f0:x}', dict(itemsize=aligned(size + 1), **base)) + self._check('T{i:f0:xx}', dict(itemsize=aligned(size + 2), **base)) + self._check('T{i:f0:xxx}', dict(itemsize=aligned(size + 3), **base)) + self._check('T{i:f0:xxxx}', dict(itemsize=aligned(size + 4), **base)) + self._check('T{i:f0:7x}', dict(itemsize=aligned(size + 7), **base)) self._check('^ix', dict(itemsize=size + 1, **base)) self._check('^ixx', dict(itemsize=size + 2, **base)) @@ -7045,6 +7049,13 @@ def aligned(n): self._check('^ixxxx', dict(itemsize=size + 4, **base)) self._check('^i7x', dict(itemsize=size + 7, **base)) + # check we can convert to memoryview and back, aligned and unaligned + arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=True)) + assert_equal(arr.dtype, np.array(memoryview(arr)).dtype) + + arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=False)) + assert_equal(arr.dtype, np.array(memoryview(arr)).dtype) + def test_native_padding_3(self): dt = np.dtype( [('a', 'b'), ('b', 'i'), @@ -7075,15 +7086,9 @@ def test_intra_padding(self): align = np.dtype('i').alignment size = np.dtype('i').itemsize - def aligned(n): - return (align*(1 + (n-1)//align)) - - self._check('(3)T{ix}', (dict( - names=['f0'], - formats=['i'], - offsets=[0], - itemsize=aligned(size + 1) - ), (3,))) + expected_dtype = {'names': ['f0'], 'formats': ['i'], + 'itemsize': np.dtype('i,V1', align=True).itemsize} + self._check('(3)T{ix}', (expected_dtype, (3,))) def test_char_vs_string(self): dt = np.dtype('c') From dfc8ec7789ff5115c12f95cc235f9b209eadc47b Mon Sep 17 00:00:00 2001 From: Allan Haldane Date: Sun, 21 Feb 2021 09:55:44 -0500 Subject: [PATCH 2/2] ENH: allow 0-sized elements in PEP3118 format strings to align --- .../upcoming_changes/7798.improvement.rst | 16 +++++++ numpy/core/_internal.py | 27 +++++++---- numpy/core/src/multiarray/buffer.c | 7 +-- 
numpy/core/tests/test_multiarray.py | 46 ++++++++++++++++--- 4 files changed, 77 insertions(+), 19 deletions(-) create mode 100644 doc/release/upcoming_changes/7798.improvement.rst diff --git a/doc/release/upcoming_changes/7798.improvement.rst b/doc/release/upcoming_changes/7798.improvement.rst new file mode 100644 index 000000000000..46caca10d5e2 --- /dev/null +++ b/doc/release/upcoming_changes/7798.improvement.rst @@ -0,0 +1,16 @@ +Trailing Padding now supported in PEP3118 buffer interface +---------------------------------------------------------- +Previously, structured types with trailing padding such as +`np.dtype({'formats': ['i1'], 'names': ['a'], 'itemsize': 4})` could not +roundtrip through the PEP3118 interface using a memoryview, as in +`a == np.array(memoryview(a))`. Now, such trailing padding is preserved. + +More technically, the PEP3118 interface now supports PEP3118 format strings as +follows: Within "T{}", in aligned @ mode, trailing padding is automatically +assumed in the same way as C structs and numpy aligned dtypes. Outside of T{} +trailing padding is not automatically added or assumed in inputs, following +python's struct module, but is explicitly added by padding with "x" or unnamed +zero-sized trailing elements. 0-sized unnamed elements, like "0i", can now be +added anywhere in the format string, and in @ mode this will add padding bytes +up to that type's alignment offset, and otherwise is ignored, as described in +the python struct docs. 
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py index 261b2fe0f10b..4196bc02203e 100644 --- a/numpy/core/_internal.py +++ b/numpy/core/_internal.py @@ -563,6 +563,9 @@ def _dtype_from_pep3118(spec): return dtype def __dtype_from_pep3118(stream, is_subdtype): + # numpy interprets pep3118 formats which includes named fields as + # structured dtypes, even if not enclosed by "T{}" + field_spec = dict( names=[], formats=[], @@ -613,8 +616,7 @@ def __dtype_from_pep3118(stream, is_subdtype): is_padding = False if stream.consume('T{'): - value, align = __dtype_from_pep3118( - stream, is_subdtype=True) + value, align = __dtype_from_pep3118(stream, is_subdtype=True) elif stream.next in type_map_chars: if stream.next == 'Z': typechar = stream.advance(2) @@ -638,12 +640,10 @@ def __dtype_from_pep3118(stream, is_subdtype): else: raise ValueError("Unknown PEP 3118 data type specifier %r" % stream.s) - # # Native alignment may require padding # # Here we assume that the presence of a '@' character implicitly implies # that the start of the array is *already* aligned. - # extra_offset = 0 if stream.byteorder == '@': start_padding = (-offset) % align @@ -663,6 +663,19 @@ def __dtype_from_pep3118(stream, is_subdtype): # Update common alignment common_alignment = _lcm(align, common_alignment) + # Field name + if stream.consume(':'): + name = stream.consume_until(':') + else: + name = None + + # struct docs explicitly say that repeat-0 elements are for padding or + # alignment. 
We further interpret this applies only to unnamed fields + if name is None and itemsize == 0: + offset += extra_offset + field_spec['itemsize'] = offset + continue + # Convert itemsize to sub-array if itemsize != 1: value = dtype((value, (itemsize,))) @@ -671,12 +684,6 @@ def __dtype_from_pep3118(stream, is_subdtype): if shape is not None: value = dtype((value, shape)) - # Field name - if stream.consume(':'): - name = stream.consume_until(':') - else: - name = None - if not (is_padding and name is None): if name is not None and name in field_spec['names']: raise RuntimeError(f"Duplicate field name '{name}' in PEP3118 format") diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index e99f0580b0a3..bb6410656990 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -289,9 +289,10 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, /* Insert padding manually */ if (*offset > new_offset) { PyErr_SetString( - PyExc_ValueError, "The buffer interface does not support " - "overlapping fields or out-of-order " - "fields"); + PyExc_ValueError, + "dtypes with overlapping or out-of-order fields are not " + "representable as buffers. Consider reordering the fields." 
+ ); return -1; } /* add padding bytes: repeat-count plus 'x' */ diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index fe49299227b1..df0db2371941 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -7030,24 +7030,36 @@ def aligned(n): return align*(1 + (n-1)//align) base = dict(formats=['i'], names=['f0']) + bbase = dict(formats=['b'], names=['f0']) self._check('ix', dict(itemsize=size + 1, **base)) self._check('ixx', dict(itemsize=size + 2, **base)) self._check('ixxx', dict(itemsize=size + 3, **base)) self._check('ixxxx', dict(itemsize=size + 4, **base)) self._check('i7x', dict(itemsize=size + 7, **base)) - - self._check('T{i:f0:x}', dict(itemsize=aligned(size + 1), **base)) - self._check('T{i:f0:xx}', dict(itemsize=aligned(size + 2), **base)) - self._check('T{i:f0:xxx}', dict(itemsize=aligned(size + 3), **base)) - self._check('T{i:f0:xxxx}', dict(itemsize=aligned(size + 4), **base)) - self._check('T{i:f0:7x}', dict(itemsize=aligned(size + 7), **base)) + self._check('ix0i', dict(itemsize=2*size, **base)) + self._check('b0i', dict(itemsize=size, **bbase)) + + # Our interpretation of the PEP3118/struct spec is that trailing + # padding for alignment is assumed only inside of T{}. 
+ self._check('T{ix}', dict(itemsize=aligned(size + 1), **base)) + self._check('T{ixx}', dict(itemsize=aligned(size + 2), **base)) + self._check('T{ixxx}', dict(itemsize=aligned(size + 3), **base)) + self._check('T{ixxxx}', dict(itemsize=aligned(size + 4), **base)) + self._check('T{i7x}', dict(itemsize=aligned(size + 7), **base)) + self._check('T{ix0i}', dict(itemsize=2*size, **base)) + self._check('T{b0i}', dict(itemsize=size, **bbase)) + + # check that alignment mode affects assumed trailing padding in T{} + self._check('T{=ix}', dict(itemsize=size + 1, **base)) self._check('^ix', dict(itemsize=size + 1, **base)) self._check('^ixx', dict(itemsize=size + 2, **base)) self._check('^ixxx', dict(itemsize=size + 3, **base)) self._check('^ixxxx', dict(itemsize=size + 4, **base)) self._check('^i7x', dict(itemsize=size + 7, **base)) + self._check('^ixx0i', dict(itemsize=size + 2, **base)) + self._check('^b0i', np.dtype('b')) # check we can convert to memoryview and back, aligned and unaligned arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=True)) @@ -7056,6 +7068,28 @@ def aligned(n): arr = np.zeros(3, dtype=np.dtype('u1,i4,u1', align=False)) assert_equal(arr.dtype, np.array(memoryview(arr)).dtype) + a = np.empty(0, np.dtype({'formats': ['u1'], 'offsets': [0], + 'names': ['x'], 'itemsize': 4})) + assert_equal(a, np.array(memoryview(a))) + + # check that 0-sized elements act as padding in @ alignment and not = + # outside of T{} (see python struct docs, example at very end) + self._check('B:f0:B:f1:', [('f0', 'u1'), ('f1', 'u1')]) + self._check('B:f0:0iB:f1:0i', {'names': ['f0','f1'], + 'formats': ['u1','u1'], + 'offsets': [0,4], + 'itemsize': 8}) + self._check('=B:f0:0iB:f1:0i', [('f0', 'u1'), ('f1', 'u1')]) + + # PEP3118 cannot support overlapping/out-of-order fields + # (update these tests if it is improved to allow this) + a = np.empty(3, dtype={'names': ['a', 'b'], + 'formats': ['i4', 'i2'], + 'offsets': [0, 2]}) + assert_raises(ValueError, memoryview, a) + a = 
np.empty(3, dtype='i4,i4')[['f1', 'f0']] + assert_raises(ValueError, memoryview, a) + def test_native_padding_3(self): dt = np.dtype( [('a', 'b'), ('b', 'i'),