From f1fba70edd1829c64e3290fa6b1a20d01e9d9674 Mon Sep 17 00:00:00 2001 From: Allan Haldane Date: Tue, 30 Jan 2018 19:15:54 -0500 Subject: [PATCH 1/3] ENH: add multi-field assignment helpers in np.lib.recfunctions Adds helper functions for the copy->view transition for multi-field indexes. Adds `structured_to_unstructured`, `apply_along_fields`, `assign_fields_by_name`, `require_fields`. --- numpy/doc/structured_arrays.py | 9 + numpy/lib/recfunctions.py | 281 ++++++++++++++++++++++++++- numpy/lib/tests/test_recfunctions.py | 53 ++++- 3 files changed, 341 insertions(+), 2 deletions(-) diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py index ab97c5df6141..42711a7c0d9d 100644 --- a/numpy/doc/structured_arrays.py +++ b/numpy/doc/structured_arrays.py @@ -443,6 +443,15 @@ >>> repack_fields(a[['a','c']]).view('i8') # supported 1.15 and 1.16 array([0, 0, 0]) + The :mod:`numpy.lib.recfunctions` module has other new methods + introduced in numpy 1.16 to help users account for this change. These are + :func:`numpy.lib.recfunctions.structured_to_unstructured`, + :func:`numpy.lib.recfunctions.unstructured_to_structured`, + :func:`numpy.lib.recfunctions.apply_along_fields`, + :func:`numpy.lib.recfunctions.assign_fields_by_name`, and + :func:`numpy.lib.recfunctions.require_fields`. + + Assigning to an array with a multi-field index will behave the same in Numpy 1.15 and Numpy 1.16. 
In both versions the assignment will modify the original array:: diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py index 53a586f56941..11c04f03d862 100644 --- a/numpy/lib/recfunctions.py +++ b/numpy/lib/recfunctions.py @@ -102,7 +102,7 @@ def get_fieldspec(dtype): fields = ((name, dtype.fields[name]) for name in dtype.names) # keep any titles, if present return [ - (name if len(f) == 2 else (f[2], name), f[0]) + (name if len(f) == 2 else (f[2], name), f[0]) for name, f in fields ] @@ -870,6 +870,285 @@ def repack_fields(a, align=False, recurse=False): dt = np.dtype(fieldinfo, align=align) return np.dtype((a.type, dt)) +def _get_fields_and_offsets(dt, offset=0): + """ + Returns a flat list of (name, dtype, count, offset) tuples of all the + scalar fields in the dtype "dt", including nested fields, in left + to right order. + """ + fields = [] + for name in dt.names: + field = dt.fields[name] + if field[0].names is None: + count = 1 + for size in field[0].shape: + count *= size + fields.append((name, field[0], count, field[1] + offset)) + else: + fields.extend(_get_fields_and_offsets(field[0], field[1] + offset)) + return fields + +def structured_to_unstructured(arr, dtype=None): + """ + Converts and n-D structured array into an (n+1)-D unstructured array. + + The new array will have a new last dimension equal in size to the + number of field-elements of the input array. If not supplied, the output + datatype is determined from the numpy type promotion rules applied to all + the field datatypes. + + Nested fields, as well as each element of any subarray fields, all count + as a single field-elements. + + Parameters + ---------- + arr : ndarray + Structured array or dtype to convert. + dtype : dtype, optional + The dtype of the output unstructured array + + Returns + ------- + unstructured : ndarray + Unstructured array with one more dimension. 
+ + Examples + -------- + + >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)]) + >>> a + array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]), + (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])], + dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))]) + >>> structured_to_unstructured(arr) + array([[0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) + + >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)], + ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')]) + >>> np.mean(structured_to_unstructured(b[['x', 'z']]), axis=-1) + array([ 3. , 5.5, 9. , 11. ]) + + """ + if not arr.dtype.names: + raise ValueError('arr must be a structured array') + + fields = _get_fields_and_offsets(arr.dtype) + n_elem = sum(f[2] for f in fields) + + if dtype is None: + out_dtype = np.result_type(*[f[1].base for f in fields]) + else: + out_dtype = dtype + + out = np.empty(arr.shape + (n_elem,), dtype=out_dtype) + + index = 0 + for name, dt, count, offset in fields: + if count == 1: + out[...,index] = arr.getfield(dt, offset) + index += 1 + else: + out[...,index:index+count] = arr.getfield(dt, offset) + index += count + + return out + +def unstructured_to_structured(arr, dtype=None, names=None, align=False): + """ + Converts and n-D unstructured array into an (n-1)-D structured array. + + The last dimension of the array is converted into a structure, with + number of field-elements equal to the size of the last dimension of the + input array. By default all fields will have the same dtype as the + original array, but you may supply a custom dtype with the right + number of fields-elements. + + Nested fields, as well as each element of any subarray fields, all count + towards the number of field-elements. + + Parameters + ---------- + arr : ndarray + Unstructured array or dtype to convert. 
+ dtype : dtype, optional + The structured dtype of the output array + names : list of strings, optional + If dtype is not supplied, this specifies the field names for the output + dtype, in order. The field dtypes will be the same as the input array. + align : boolean, optional + If dtype is not supplied, whether to create an aligned memory layout. + + Returns + ------- + structured : ndarray + Structured array with fewer dimensions. + + Examples + -------- + + >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)]) + >>> a = np.arange(20).reshape((4,5)) + >>> a + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19]]) + >>> unstructured_to_structured(a, dt) + array([( 0, ( 1., 2), [ 3., 4.]), ( 5, ( 6., 7), [ 8., 9.]), + (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])], + dtype=[('a', '>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)], + ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')]) + >>> apply_along_fields(np.mean, b) + array([ 2.66666667, 5.33333333, 8.66666667, 11. ]) + >>> apply_along_fields(np.mean, b[['x', 'z']]) + array([ 3. , 5.5, 9. , 11. ]) + + """ + if not arr.dtype.names: + raise ValueError('arr must be a structured array') + + uarr = structured_to_unstructured(arr) + return func(uarr, axis=-1) + # works and avoids axis requirement, but very, very slow: + #return np.apply_along_axis(func, -1, uarr) + +def assign_fields_by_name(dst, src, zero_unassigned=True): + """ + Assigns values from one structured array to another by field name. + + Normally in numpy >= 1.14, assignment of one structured array to another + copies fields "by position", meaning that the first field from the src is + copied to the first field of the dst, and so on, regardless of field name. + + This function instead copies "by field name", such that fields in the dst + are assigned from the identically named field in the src. This applies + recursively for nested structures. 
This is how structure assignment worked + in numpy >= 1.6 to <= 1.13. + + Parameters + ---------- + dst : ndarray + src : ndarray + The source and destination arrays during assignment. + zero_unassigned : bool, optional + If True, fields in the dst for which there was no matching + field in the src are filled with the value 0 (zero). This + was the behavior of numpy <= 1.13. If False, those fields + are not modified. + """ + + if dst.dtype.names is None: + dst[:] = src + return + + for name in dst.dtype.names: + if name not in src.dtype.names: + if zero_unassigned: + dst[name] = 0 + else: + assign_fields_by_name(dst[name], src[name], + zero_unassigned) + +def require_fields(array, required_dtype): + """ + Casts the array to the required dtype using assignment by field-name. + + Normal structured array casting/assignment works "by position" in numpy + 1.14+, meaning that the first field from the source's dtype is copied to + the first field of the destination's dtype, and so on. + + This function assigns by name instead, so the value of a field in the + output array is the value of the field with the same name in the source + array. + + If a field name in the required_dtype does not exist in the + input array, that field is set to 0 in the output array. 
+ + Parameters + ---------- + array : ndarray + array to cast + required_dtype : dtype + datatype for output array + + Returns + ------- + out : ndarray + array with the new dtype, with field values copied from the fields in + the input array with the same name + + Examples + -------- + + >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')]) + >>> require_fields(a, [('b', 'f4'), ('c', 'u1')]) + """ + out = np.empty(array.shape, dtype=required_dtype) + assign_fields_by_name(out, array) + return out + def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None, asrecarray=None, autoconvert=None): diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py index 5585a95f9162..8b61ba0bff23 100644 --- a/numpy/lib/tests/test_recfunctions.py +++ b/numpy/lib/tests/test_recfunctions.py @@ -10,7 +10,8 @@ from numpy.lib.recfunctions import ( drop_fields, rename_fields, get_fieldstructure, recursive_fill_fields, find_duplicates, merge_arrays, append_fields, stack_arrays, join_by, - repack_fields) + repack_fields, unstructured_to_structured, structured_to_unstructured, + apply_along_fields, require_fields, assign_fields_by_name) get_names = np.lib.recfunctions.get_names get_names_flat = np.lib.recfunctions.get_names_flat zip_descr = np.lib.recfunctions.zip_descr @@ -204,6 +205,56 @@ def test_repack_fields(self): dt = np.dtype((np.record, dt)) assert_(repack_fields(dt).type is np.record) + def test_structured_to_unstructured(self): + a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)]) + out = structured_to_unstructured(a) + assert_equal(out, np.zeros((4,5), dtype='f8')) + + b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)], + dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')]) + out = np.mean(structured_to_unstructured(b[['x', 'z']]), axis=-1) + assert_equal(out, np.array([ 3. , 5.5, 9. , 11. 
])) + + c = np.arange(20).reshape((4,5)) + out = unstructured_to_structured(c, a.dtype) + want = np.array([( 0, ( 1., 2), [ 3., 4.]), + ( 5, ( 6., 7), [ 8., 9.]), + (10, (11., 12), [13., 14.]), + (15, (16., 17), [18., 19.])], + dtype=[('a', ' Date: Wed, 31 Oct 2018 17:23:27 -0400 Subject: [PATCH 2/3] ENH: Fixups to multi-field assignment helpers --- doc/release/1.16.0-notes.rst | 5 ++ numpy/lib/recfunctions.py | 106 ++++++++++++++++----------- numpy/lib/tests/test_recfunctions.py | 8 ++ 3 files changed, 77 insertions(+), 42 deletions(-) diff --git a/doc/release/1.16.0-notes.rst b/doc/release/1.16.0-notes.rst index 89a4623e19f5..5710cebe9386 100644 --- a/doc/release/1.16.0-notes.rst +++ b/doc/release/1.16.0-notes.rst @@ -19,6 +19,11 @@ Highlights New functions ============= + * New functions in the `numpy.lib.recfunctions` module to ease the structured + assignment changes: `assign_fields_by_name`, `structured_to_unstructured`, + `unstructured_to_structured`, `apply_along_fields`, and `require_fields`. + See the user guide at + for more info. Deprecations ============ diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py index 11c04f03d862..461bc2bfeae5 100644 --- a/numpy/lib/recfunctions.py +++ b/numpy/lib/recfunctions.py @@ -888,7 +888,7 @@ def _get_fields_and_offsets(dt, offset=0): fields.extend(_get_fields_and_offsets(field[0], field[1] + offset)) return fields -def structured_to_unstructured(arr, dtype=None): +def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'): """ Converts and n-D structured array into an (n+1)-D unstructured array. @@ -903,9 +903,15 @@ def structured_to_unstructured(arr, dtype=None): Parameters ---------- arr : ndarray - Structured array or dtype to convert. + Structured array or dtype to convert. Cannot contain object datatype. dtype : dtype, optional The dtype of the output unstructured array + copy : bool, optional + See copy argument to `ndarray.astype`. If true, always return a copy. 
+ If false, and `dtype` requirements are satisfied, a view is returned. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + See casting argument of `ndarray.astype`. Controls what kind of data + casting may occur. Returns ------- @@ -932,39 +938,46 @@ def structured_to_unstructured(arr, dtype=None): array([ 3. , 5.5, 9. , 11. ]) """ - if not arr.dtype.names: + if arr.dtype.names is None: raise ValueError('arr must be a structured array') fields = _get_fields_and_offsets(arr.dtype) - n_elem = sum(f[2] for f in fields) + names, dts, counts, offsets = zip(*fields) + n_fields = len(names) if dtype is None: - out_dtype = np.result_type(*[f[1].base for f in fields]) + out_dtype = np.result_type(*[dt.base for dt in dts]) else: out_dtype = dtype - out = np.empty(arr.shape + (n_elem,), dtype=out_dtype) + # Use a series of views and casts to convert to an unstructured array: - index = 0 - for name, dt, count, offset in fields: - if count == 1: - out[...,index] = arr.getfield(dt, offset) - index += 1 - else: - out[...,index:index+count] = arr.getfield(dt, offset) - index += count + # first view using flattened fields (doesn't work for object arrays) + # Note: dts may include a shape for subarrays + flattened_fields = np.dtype({'names': names, + 'formats': dts, + 'offsets': offsets, + 'itemsize': arr.dtype.itemsize}) + arr = arr.view(flattened_fields) - return out + # next cast to a packed format with all fields converted to new dtype + packed_fields = np.dtype({'names': names, + 'formats': [(out_dtype, c) for c in counts]}) + arr = arr.astype(packed_fields, copy=copy, casting=casting) -def unstructured_to_structured(arr, dtype=None, names=None, align=False): + # finally is it safe to view the packed fields as the unstructured type + return arr.view((out_dtype, sum(counts))) + +def unstructured_to_structured(arr, dtype=None, names=None, align=False, + copy=False, casting='unsafe'): """ Converts and n-D unstructured array into an (n-1)-D structured array. 
- The last dimension of the array is converted into a structure, with + The last dimension of the input array is converted into a structure, with number of field-elements equal to the size of the last dimension of the - input array. By default all fields will have the same dtype as the - original array, but you may supply a custom dtype with the right - number of fields-elements. + input array. By default all output fields have the input array's dtype, but + an output structured dtype with an equal number of field-elements can be + supplied instead. Nested fields, as well as each element of any subarray fields, all count towards the number of field-elements. @@ -979,7 +992,13 @@ def unstructured_to_structured(arr, dtype=None, names=None, align=False): If dtype is not supplied, this specifies the field names for the output dtype, in order. The field dtypes will be the same as the input array. align : boolean, optional - If dtype is not supplied, whether to create an aligned memory layout. + Whether to create an aligned memory layout. + copy : bool, optional + See copy argument to `ndarray.astype`. If true, always return a copy. + If false, and `dtype` requirements are satisfied, a view is returned. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + See casting argument of `ndarray.astype`. Controls what kind of data + casting may occur. 
Returns ------- @@ -1011,29 +1030,36 @@ def unstructured_to_structured(arr, dtype=None, names=None, align=False): names = ['f{}'.format(n) for n in range(n_elem)] out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align) fields = _get_fields_and_offsets(out_dtype) + names, dts, counts, offsets = zip(*fields) else: if names is not None: raise ValueError("don't supply both dtype and names") # sanity check of the input dtype fields = _get_fields_and_offsets(dtype) - n_fields = sum(f[2] for f in fields) - if n_fields != n_elem: + names, dts, counts, offsets = zip(*fields) + if n_elem != sum(counts): raise ValueError('The length of the last dimension of arr must ' 'be equal to the number of fields in dtype') out_dtype = dtype + if align and not out_dtype.isalignedstruct: + raise ValueError("align was True but dtype is not aligned") - out = np.empty(arr.shape[:-1], dtype=out_dtype) + # Use a series of views and casts to convert to a structured array: - n = 0 - for name, dt, count, offset in fields: - if count == 1: - out.setfield(arr[...,n], dt, offset) - n += 1 - else: - out.setfield(arr[...,n:n+count], dt, offset) - n += count + # first view as a packed structured array of one dtype + packed_fields = np.dtype({'names': names, + 'formats': [(arr.dtype, c) for c in counts]}) + arr = np.ascontiguousarray(arr).view(packed_fields) - return out + # next cast to an unpacked but flattened format with varied dtypes + flattened_fields = np.dtype({'names': names, + 'formats': dts, + 'offsets': offsets, + 'itemsize': out_dtype.itemsize}) + arr = arr.astype(flattened_fields, copy=copy, casting=casting) + + # finally view as the final nested dtype and remove the last axis + return arr.view(out_dtype)[..., 0] def apply_along_fields(func, arr): """ @@ -1066,7 +1092,7 @@ def apply_along_fields(func, arr): array([ 3. , 5.5, 9. , 11. 
]) """ - if not arr.dtype.names: + if arr.dtype.names is None: raise ValueError('arr must be a structured array') uarr = structured_to_unstructured(arr) @@ -1113,15 +1139,11 @@ def assign_fields_by_name(dst, src, zero_unassigned=True): def require_fields(array, required_dtype): """ - Casts the array to the required dtype using assignment by field-name. - - Normal structured array casting/assignment works "by position" in numpy - 1.14+, meaning that the first field from the source's dtype is copied to - the first field of the destination's dtype, and so on. + Casts a structured array to a new dtype using assignment by field-name. - This function assigns by name instead, so the value of a field in the - output array is the value of the field with the same name in the source - array. + This function assigns from the old to the new array by name, so the + value of a field in the output array is the value of the field with the + same name in the source array. If a field name in the required_dtype does not exist in the input array, that field is set to 0 in the output array. diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py index 8b61ba0bff23..7ec33d92a345 100644 --- a/numpy/lib/tests/test_recfunctions.py +++ b/numpy/lib/tests/test_recfunctions.py @@ -233,6 +233,14 @@ def test_structured_to_unstructured(self): assert_equal(apply_along_fields(np.mean, d[['x', 'z']]), np.array([ 3. , 5.5, 9. , 11. 
])) + # check that for uniform field dtypes we get a view, not a copy: + d = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)], + dtype=[('x', 'i4'), ('y', 'i4'), ('z', 'i4')]) + dd = structured_to_unstructured(d) + ddd = unstructured_to_structured(dd, d.dtype) + assert_(dd.base is d) + assert_(ddd.base is d) + def test_field_assignment_by_name(self): a = np.ones(2, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')]) newdt = [('b', 'f4'), ('c', 'u1')] From 61371de744b363eacdb2ae277c33d365164380f3 Mon Sep 17 00:00:00 2001 From: Allan Haldane Date: Thu, 22 Nov 2018 18:58:32 -0500 Subject: [PATCH 3/3] MAINT: Add new recfunctions to numpy function API --- numpy/lib/recfunctions.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py index 461bc2bfeae5..20e91af5f0e6 100644 --- a/numpy/lib/recfunctions.py +++ b/numpy/lib/recfunctions.py @@ -888,6 +888,12 @@ def _get_fields_and_offsets(dt, offset=0): fields.extend(_get_fields_and_offsets(field[0], field[1] + offset)) return fields + +def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None, + casting=None): + return (arr,) + +@array_function_dispatch(_structured_to_unstructured_dispatcher) def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'): """ Converts and n-D structured array into an (n+1)-D unstructured array. 
@@ -968,6 +974,11 @@ def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'): # finally is it safe to view the packed fields as the unstructured type return arr.view((out_dtype, sum(counts))) +def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None, + align=None, copy=None, casting=None): + return (arr,) + +@array_function_dispatch(_unstructured_to_structured_dispatcher) def unstructured_to_structured(arr, dtype=None, names=None, align=False, copy=False, casting='unsafe'): """ @@ -1061,6 +1072,10 @@ def unstructured_to_structured(arr, dtype=None, names=None, align=False, # finally view as the final nested dtype and remove the last axis return arr.view(out_dtype)[..., 0] +def _apply_along_fields_dispatcher(func, arr): + return (arr,) + +@array_function_dispatch(_apply_along_fields_dispatcher) def apply_along_fields(func, arr): """ Apply function 'func' as a reduction across fields of a structured array. @@ -1100,6 +1115,10 @@ def apply_along_fields(func, arr): # works and avoids axis requirement, but very, very slow: #return np.apply_along_axis(func, -1, uarr) +def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None): + return dst, src + +@array_function_dispatch(_assign_fields_by_name_dispatcher) def assign_fields_by_name(dst, src, zero_unassigned=True): """ Assigns values from one structured array to another by field name. @@ -1137,6 +1156,10 @@ def assign_fields_by_name(dst, src, zero_unassigned=True): assign_fields_by_name(dst[name], src[name], zero_unassigned) +def _require_fields_dispatcher(array, required_dtype): + return (array,) + +@array_function_dispatch(_require_fields_dispatcher) def require_fields(array, required_dtype): """ Casts a structured array to a new dtype using assignment by field-name.