From 198df77c93d9813ceaa39db66c2d9393f96c7acd Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 2 Oct 2018 17:11:48 -0400 Subject: [PATCH 1/5] MAINT: provide an algorithm that blocks matrices with a single memory copy. --- numpy/core/shape_base.py | 203 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 195 insertions(+), 8 deletions(-) diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index fde23076ba4e..a928dc72be8f 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -3,6 +3,8 @@ __all__ = ['atleast_1d', 'atleast_2d', 'atleast_3d', 'block', 'hstack', 'stack', 'vstack'] +import functools +import operator from . import numeric as _nx from .numeric import array, asanyarray, newaxis @@ -432,6 +434,10 @@ def _block_check_depths_match(arrays, parent_index=[]): refer to it, and the last index along the empty axis will be `None`. max_arr_ndim : int The maximum of the ndims of the arrays nested in `arrays`. + final_size: int + The number of elements in the final array. This is used to motivate + the choice of algorithm, based on benchmarking wisdom. 
+ """ if type(arrays) is tuple: # not strictly necessary, but saves us from: @@ -450,8 +456,9 @@ def _block_check_depths_match(arrays, parent_index=[]): idxs_ndims = (_block_check_depths_match(arr, parent_index + [i]) for i, arr in enumerate(arrays)) - first_index, max_arr_ndim = next(idxs_ndims) - for index, ndim in idxs_ndims: + first_index, max_arr_ndim, final_size = next(idxs_ndims) + for index, ndim, size in idxs_ndims: + final_size += size if ndim > max_arr_ndim: max_arr_ndim = ndim if len(index) != len(first_index): @@ -466,13 +473,15 @@ def _block_check_depths_match(arrays, parent_index=[]): # propagate our flag that indicates an empty list at the bottom if index[-1] is None: first_index = index - return first_index, max_arr_ndim + + return first_index, max_arr_ndim, final_size elif type(arrays) is list and len(arrays) == 0: # We've 'bottomed out' on an empty list - return parent_index + [None], 0 + return parent_index + [None], 0, 0 else: # We've 'bottomed out' - arrays is either a scalar or an array - return parent_index, _nx.ndim(arrays) + size = _nx.size(arrays) + return parent_index, _nx.ndim(arrays), size def _atleast_nd(a, ndim): @@ -481,9 +490,132 @@ def _atleast_nd(a, ndim): return array(a, ndmin=ndim, copy=False, subok=True) +def _accumulate(values): + # Helper function because Python 2.7 doesn't have + # itertools.accumulate + value = 0 + accumulated = [] + for v in values: + value += v + accumulated.append(value) + return accumulated + + +def _concatenate_shapes(shapes, axis): + """Given array shapes, return the resulting shape and slice prefixes. + + These help in nested concatenation. + Returns + ------- + shape: tuple of int + This tuple satisfies: + ``` + shape, _ = _concatenate_shapes([arr.shape for arr in arrs], axis) + shape == concatenate(arrs, axis).shape + ``` + + slice_prefixes: tuple of (slice(start, end), ) + For a list of arrays being concatenated, this returns the slice + in the larger array at axis that needs to be sliced into. 
+ + For example, the following holds: + ``` + ret = concatenate([a, b, c], axis) + _, (sl_a, sl_b, sl_c) = _concatenate_shapes([a.shape, b.shape, c.shape], axis) + + ret[(slice(None),) * axis + sl_a] == a + ret[(slice(None),) * axis + sl_b] == b + ret[(slice(None),) * axis + sl_c] == c + ``` + + These are called slice prefixes since they are used in the recursive + blocking algorithm to compute the left-most slices during the + recursion. Therefore, they must be prepended to the rest of the slice + that was computed deeper in the recursion. + + These are returned as tuples to ensure that they can quickly be added + to an existing slice tuple without creating a new tuple every time. + + """ + # Cache a result that will be reused. + shape_at_axis = [shape[axis] for shape in shapes] + + # Take a shape, any shape + first_shape = shapes[0] + first_shape_pre = first_shape[:axis] + first_shape_post = first_shape[axis+1:] + + if any(shape[:axis] != first_shape_pre or + shape[axis+1:] != first_shape_post for shape in shapes): + raise ValueError( + 'Mismatched array shapes in block along axis {}.'.format(axis)) + + shape = (first_shape_pre + (sum(shape_at_axis),) + first_shape[axis+1:]) + + offsets_at_axis = _accumulate(shape_at_axis) + slice_prefixes = [(slice(start, end),) + for start, end in zip([0] + offsets_at_axis, + offsets_at_axis)] + return shape, slice_prefixes + + +def _block_info_recursion(arrays, max_depth, result_ndim, depth=0): + """ + Returns the shape of the final array, along with a list + of slices and a list of arrays that can be used for assignment inside the + new array + + Parameters + ---------- + arrays : nested list of arrays + The arrays to check + max_depth : int + The number of nested lists + result_ndim: int + The number of dimensions in the final array. + + Returns + ------- + shape : tuple of int + The shape that the final array will take on. + slices: list of tuple of slices + The slices into the full array required for assignment. 
These are + required to be prepended with ``(Ellipsis, )`` to obtain the correct + final index. + arrays: list of ndarray + The data to assign to each slice of the full array + + """ + if depth < max_depth: + shapes, slices, arrays = zip( + *[_block_info_recursion(arr, max_depth, result_ndim, depth+1) + for arr in arrays]) + + axis = result_ndim - max_depth + depth + shape, slice_prefixes = _concatenate_shapes(shapes, axis) + + # Prepend the slice prefix and flatten the slices + slices = [slice_prefix + the_slice + for slice_prefix, inner_slices in zip(slice_prefixes, slices) + for the_slice in inner_slices] + + # Flatten the array list + arrays = functools.reduce(operator.add, arrays) + + return shape, slices, arrays + else: + # We've 'bottomed out' - arrays is either a scalar or an array + # type(arrays) is not list + # Return the slice and the array inside a list to be consistent with + # the recursive case. + arr = _atleast_nd(arrays, result_ndim) + return arr.shape, [()], [arr] + + def _block(arrays, max_depth, result_ndim, depth=0): """ - Internal implementation of block. `arrays` is the argument passed to + Internal implementation of block based on repeated concatenation. + `arrays` is the argument passed to block. `max_depth` is the depth of nested lists within `arrays` and `result_ndim` is the greatest of the dimensions of the arrays in `arrays` and the depth of the lists in `arrays` (see block docstring @@ -648,7 +780,38 @@ def block(arrays): """ - bottom_index, arr_ndim = _block_check_depths_match(arrays) + arrays, list_ndim, result_ndim, final_size = _block_setup(arrays) + + # It was found through benchmarking that making an array of final size + # around 256x256 was faster by straight concatenation on an + # i7-7700HQ processor and dual channel ram 2400MHz. + # It didn't seem to depend heavily on the dtype used. + # + # A 2D array using repeated concatenation requires 2 copies of the array. 
+ # + # The fastest algorithm will depend on the ratio of CPU power to memory + # speed. + # One can monitor the results of the benchmark + # https://pv.github.io/numpy-bench/#bench_shape_base.Block2D.time_block2d + # to tune this parameter until a C version of the `_block_info_recursion` + # algorithm is implemented which would likely be faster than the python + # version. + if list_ndim * final_size > (2 * 512 * 512): + return _block_slicing(arrays, list_ndim, result_ndim) + else: + return _block_concatenate(arrays, list_ndim, result_ndim) + + +# These helper functions are mostly used for testing. +# They allow us to write tests that directly call `_block_slicing` +# or `_block_concatenate` without blocking large arrays to force the wisdom +# to trigger the desired path. +def _block_setup(arrays): + """ + Returns + (`arrays`, list_ndim, result_ndim, final_size) + """ + bottom_index, arr_ndim, final_size = _block_check_depths_match(arrays) list_ndim = len(bottom_index) if bottom_index and bottom_index[-1] is None: raise ValueError( @@ -656,7 +819,31 @@ def block(arrays): _block_format_index(bottom_index) ) ) - result = _block(arrays, list_ndim, max(arr_ndim, list_ndim)) + result_ndim = max(arr_ndim, list_ndim) + return arrays, list_ndim, result_ndim, final_size + + +def _block_slicing(arrays, list_ndim, result_ndim): + shape, slices, arrays = _block_info_recursion( + arrays, list_ndim, result_ndim) + dtype = _nx.result_type(*[arr.dtype for arr in arrays]) + + # Test preferring F only in the case that all input arrays are F + F_order = all(arr.flags['F_CONTIGUOUS'] for arr in arrays) + C_order = all(arr.flags['C_CONTIGUOUS'] for arr in arrays) + order = 'F' if F_order and not C_order else 'C' + result = _nx.empty(shape=shape, dtype=dtype, order=order) + # Note: In a C implementation, the function + # PyArray_CreateMultiSortedStridePerm could be used for more advanced + # guessing of the desired order. 
+ + for the_slice, arr in zip(slices, arrays): + result[(Ellipsis,) + the_slice] = arr + return result + + +def _block_concatenate(arrays, list_ndim, result_ndim): + result = _block(arrays, list_ndim, result_ndim) if list_ndim == 0: # Catch an edge case where _block returns a view because # `arrays` is a single numpy array and not a list of numpy arrays. From 6d4715e5358bc639faa8ddff9be19ff05823a0c3 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 2 Oct 2018 20:43:14 -0400 Subject: [PATCH 2/5] TST: Block test: Trigger both code paths. --- numpy/core/tests/test_shape_base.py | 126 ++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 37 deletions(-) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index df819b73f6ca..f396b25af76f 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -6,6 +6,9 @@ array, arange, atleast_1d, atleast_2d, atleast_3d, block, vstack, hstack, newaxis, concatenate, stack ) + +from numpy.core.shape_base import (_block_setup, + _block_concatenate, _block_slicing) from numpy.testing import ( assert_, assert_raises, assert_array_equal, assert_equal, assert_raises_regex, assert_almost_equal @@ -372,14 +375,63 @@ def test_stack(): stack, [np.arange(2), np.arange(3)]) +# See for more information on how to parametrize a whole class +# https://docs.pytest.org/en/latest/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration +def pytest_generate_tests(metafunc): + # called once per each test function + if hasattr(metafunc.cls, 'params'): + arglist = metafunc.cls.params + argnames = sorted(arglist[0]) + metafunc.parametrize(argnames, + [[funcargs[name] for name in argnames] + for funcargs in arglist]) + + +# blocking small arrays and large arrays go through different paths. +# the algorithm is triggered depending on the number of element +# copies required. 
+# We define a test fixture that forces most tests to go through +# both code paths. +# Ultimately, this should be removed if a single algorithm is found +# to be faster for both small and large arrays. +def _block_force_concatenate(arrays): + arrays, list_ndim, result_ndim, _ = _block_setup(arrays) + return _block_concatenate(arrays, list_ndim, result_ndim) + + +def _block_force_slicing(arrays): + arrays, list_ndim, result_ndim, _ = _block_setup(arrays) + return _block_slicing(arrays, list_ndim, result_ndim) + + class TestBlock(object): - def test_returns_copy(self): + params = [dict(block=block), + dict(block=_block_force_concatenate), + dict(block=_block_force_slicing)] + + def test_returns_copy(self, block): a = np.eye(3) - b = np.block(a) + b = block(a) b[0, 0] = 2 assert b[0, 0] != a[0, 0] - def test_block_simple_row_wise(self): + def test_block_total_size_estimate(self, block): + _, _, _, total_size = _block_setup([1]) + assert total_size == 1 + + _, _, _, total_size = _block_setup([[1]]) + assert total_size == 1 + + _, _, _, total_size = _block_setup([[1, 1]]) + assert total_size == 2 + + _, _, _, total_size = _block_setup([[1], [1]]) + assert total_size == 2 + + _, _, _, total_size = _block_setup([[1, 2], [3, 4]]) + assert total_size == 4 + + def test_block_simple_row_wise(self, block): a_2d = np.ones((2, 2)) b_2d = 2 * a_2d desired = np.array([[1, 1, 2, 2], @@ -387,7 +439,7 @@ def test_block_simple_row_wise(self): result = block([a_2d, b_2d]) assert_equal(desired, result) - def test_block_simple_column_wise(self): + def test_block_simple_column_wise(self, block): a_2d = np.ones((2, 2)) b_2d = 2 * a_2d expected = np.array([[1, 1], @@ -397,7 +449,7 @@ def test_block_simple_column_wise(self): result = block([[a_2d], [b_2d]]) assert_equal(expected, result) - def test_block_with_1d_arrays_row_wise(self): + def test_block_with_1d_arrays_row_wise(self, block): # # # 1-D vectors are treated as row arrays a = np.array([1, 2, 3]) b = np.array([2, 3, 4]) @@ -405,7 
+457,7 @@ def test_block_with_1d_arrays_row_wise(self): result = block([a, b]) assert_equal(expected, result) - def test_block_with_1d_arrays_multiple_rows(self): + def test_block_with_1d_arrays_multiple_rows(self, block): a = np.array([1, 2, 3]) b = np.array([2, 3, 4]) expected = np.array([[1, 2, 3, 2, 3, 4], @@ -413,7 +465,7 @@ def test_block_with_1d_arrays_multiple_rows(self): result = block([[a, b], [a, b]]) assert_equal(expected, result) - def test_block_with_1d_arrays_column_wise(self): + def test_block_with_1d_arrays_column_wise(self, block): # # # 1-D vectors are treated as row arrays a_1d = np.array([1, 2, 3]) b_1d = np.array([2, 3, 4]) @@ -422,7 +474,7 @@ def test_block_with_1d_arrays_column_wise(self): result = block([[a_1d], [b_1d]]) assert_equal(expected, result) - def test_block_mixed_1d_and_2d(self): + def test_block_mixed_1d_and_2d(self, block): a_2d = np.ones((2, 2)) b_1d = np.array([2, 2]) result = block([[a_2d], [b_1d]]) @@ -431,7 +483,7 @@ def test_block_mixed_1d_and_2d(self): [2, 2]]) assert_equal(expected, result) - def test_block_complicated(self): + def test_block_complicated(self, block): # a bit more complicated one_2d = np.array([[1, 1, 1]]) two_2d = np.array([[2, 2, 2]]) @@ -455,7 +507,7 @@ def test_block_complicated(self): [zero_2d]]) assert_equal(result, expected) - def test_nested(self): + def test_nested(self, block): one = np.array([1, 1, 1]) two = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]) three = np.array([3, 3, 3]) @@ -464,9 +516,9 @@ def test_nested(self): six = np.array([6, 6, 6, 6, 6]) zero = np.zeros((2, 6)) - result = np.block([ + result = block([ [ - np.block([ + block([ [one], [three], [four] @@ -485,7 +537,7 @@ def test_nested(self): assert_equal(result, expected) - def test_3d(self): + def test_3d(self, block): a000 = np.ones((2, 2, 2), int) * 1 a100 = np.ones((3, 2, 2), int) * 2 @@ -498,7 +550,7 @@ def test_3d(self): a111 = np.ones((3, 3, 3), int) * 8 - result = np.block([ + result = block([ [ [a000, a001], [a010, 
a011], @@ -540,53 +592,53 @@ def test_3d(self): assert_array_equal(result, expected) - def test_block_with_mismatched_shape(self): + def test_block_with_mismatched_shape(self, block): a = np.array([0, 0]) b = np.eye(2) - assert_raises(ValueError, np.block, [a, b]) - assert_raises(ValueError, np.block, [b, a]) + assert_raises(ValueError, block, [a, b]) + assert_raises(ValueError, block, [b, a]) - def test_no_lists(self): - assert_equal(np.block(1), np.array(1)) - assert_equal(np.block(np.eye(3)), np.eye(3)) + def test_no_lists(self, block): + assert_equal(block(1), np.array(1)) + assert_equal(block(np.eye(3)), np.eye(3)) - def test_invalid_nesting(self): + def test_invalid_nesting(self, block): msg = 'depths are mismatched' - assert_raises_regex(ValueError, msg, np.block, [1, [2]]) - assert_raises_regex(ValueError, msg, np.block, [1, []]) - assert_raises_regex(ValueError, msg, np.block, [[1], 2]) - assert_raises_regex(ValueError, msg, np.block, [[], 2]) - assert_raises_regex(ValueError, msg, np.block, [ + assert_raises_regex(ValueError, msg, block, [1, [2]]) + assert_raises_regex(ValueError, msg, block, [1, []]) + assert_raises_regex(ValueError, msg, block, [[1], 2]) + assert_raises_regex(ValueError, msg, block, [[], 2]) + assert_raises_regex(ValueError, msg, block, [ [[1], [2]], [[3, 4]], [5] # missing brackets ]) - def test_empty_lists(self): - assert_raises_regex(ValueError, 'empty', np.block, []) - assert_raises_regex(ValueError, 'empty', np.block, [[]]) - assert_raises_regex(ValueError, 'empty', np.block, [[1], []]) + def test_empty_lists(self, block): + assert_raises_regex(ValueError, 'empty', block, []) + assert_raises_regex(ValueError, 'empty', block, [[]]) + assert_raises_regex(ValueError, 'empty', block, [[1], []]) - def test_tuple(self): - assert_raises_regex(TypeError, 'tuple', np.block, ([1, 2], [3, 4])) - assert_raises_regex(TypeError, 'tuple', np.block, [(1, 2), (3, 4)]) + def test_tuple(self, block): + assert_raises_regex(TypeError, 'tuple', block, 
([1, 2], [3, 4])) + assert_raises_regex(TypeError, 'tuple', block, [(1, 2), (3, 4)]) - def test_different_ndims(self): + def test_different_ndims(self, block): a = 1. b = 2 * np.ones((1, 2)) c = 3 * np.ones((1, 1, 3)) - result = np.block([a, b, c]) + result = block([a, b, c]) expected = np.array([[[1., 2., 2., 3., 3., 3.]]]) assert_equal(result, expected) - def test_different_ndims_depths(self): + def test_different_ndims_depths(self, block): a = 1. b = 2 * np.ones((1, 2)) c = 3 * np.ones((1, 2, 3)) - result = np.block([[a, b], [c]]) + result = block([[a, b], [c]]) expected = np.array([[[1., 2., 2.], [3., 3., 3.], [3., 3., 3.]]]) From d9824ad5723522b4802e3fc2593d001c6e4580a7 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Thu, 4 Oct 2018 20:18:17 -0700 Subject: [PATCH 3/5] TST: Add a test to block that checks for mismatched shapes in 2D --- numpy/core/tests/test_shape_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index f396b25af76f..7a6cb7c7480d 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -598,6 +598,9 @@ def test_block_with_mismatched_shape(self, block): assert_raises(ValueError, block, [a, b]) assert_raises(ValueError, block, [b, a]) + to_block = [[np.ones((2,3)), np.ones((2,2))], + [np.ones((2,2)), np.ones((2,2))]] + assert_raises(ValueError, block, to_block) def test_no_lists(self, block): assert_equal(block(1), np.array(1)) assert_equal(block(np.eye(3)), np.eye(3)) From c5f21f6a2279ca7f207f3298c30316f8b886a5d4 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Fri, 5 Oct 2018 20:35:13 -0400 Subject: [PATCH 4/5] DOC: Add a release note about the slice based blocking algorithm --- doc/release/1.16.0-notes.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/release/1.16.0-notes.rst b/doc/release/1.16.0-notes.rst index 599123f97c6a..60980b122914 100644 --- a/doc/release/1.16.0-notes.rst +++ 
b/doc/release/1.16.0-notes.rst @@ -246,6 +246,14 @@ Previously we had a broken default that sometimes would not report underflow, overflow, and invalid floating point operations. Now we can support non-glibc distrubutions like Alpine Linux as long as they ship `fenv.h`. +Speedup ``np.block`` for large arrays +------------------------------------- +Large arrays (greater than ``512 * 512``) now use a blocking algorithm based on +copying the data directly into the appropriate slice of the resulting array. +This results in significant speedups for these large arrays, particularly for +arrays being blocked along more than 2 dimensions. + + Changes ======= From f164d2e90cce62d901c1cce881684863fefde91f Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sat, 20 Oct 2018 20:33:51 -0400 Subject: [PATCH 5/5] TST: Add a test to ensure the memory order is respected when after a call to ``np.block``. --- numpy/core/tests/test_shape_base.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index 7a6cb7c7480d..2c74627befd5 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -647,3 +647,33 @@ def test_different_ndims_depths(self, block): [3., 3., 3.]]]) assert_equal(result, expected) + + def test_block_memory_order(self, block): + # 3D + arr_c = np.zeros((3,)*3, order='C') + arr_f = np.zeros((3,)*3, order='F') + + b_c = [[[arr_c, arr_c], + [arr_c, arr_c]], + [[arr_c, arr_c], + [arr_c, arr_c]]] + + b_f = [[[arr_f, arr_f], + [arr_f, arr_f]], + [[arr_f, arr_f], + [arr_f, arr_f]]] + + assert block(b_c).flags['C_CONTIGUOUS'] + assert block(b_f).flags['F_CONTIGUOUS'] + + arr_c = np.zeros((3, 3), order='C') + arr_f = np.zeros((3, 3), order='F') + # 2D + b_c = [[arr_c, arr_c], + [arr_c, arr_c]] + + b_f = [[arr_f, arr_f], + [arr_f, arr_f]] + + assert block(b_c).flags['C_CONTIGUOUS'] + assert block(b_f).flags['F_CONTIGUOUS']