From 6a1210104cbcba7748650894929c9ad04d7ab67a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 18:02:18 +0100 Subject: [PATCH 01/17] TST use contiguous memoryview --- sklearn/utils/_readonly_array_wrapper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index 55ac82f9d80fd..03074661a622b 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -48,7 +48,7 @@ cdef class ReadonlyArrayWrapper: PyBuffer_Release(buffer) -def _test_sum(NUM_TYPES[:] x): +def _test_sum(NUM_TYPES[::1] x): """This function is for testing only. As this function does not modify x, we would like to define it as From d9f351ebcc8e3b23caa67e9de72db6c38b22d1b4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 18:52:39 +0100 Subject: [PATCH 02/17] DEBUG set compiler_directives boundscheck to False --- sklearn/_build_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 67b5f2c662eb0..05aacd704794a 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -78,7 +78,7 @@ def cythonize_extensions(top_path, config): }, compiler_directives={ "language_level": 3, - "boundscheck": False, + "boundscheck": True, "wraparound": False, "initializedcheck": False, "nonecheck": False, From e70770f7e0a56bc37beac8ae04f7f10418de36d1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 12:03:39 +0100 Subject: [PATCH 03/17] FIX __releasebuffer__ in ReadonlyArrayWrapper --- sklearn/utils/_readonly_array_wrapper.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index 03074661a622b..dc3a1375f046f 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ 
b/sklearn/utils/_readonly_array_wrapper.pyx @@ -13,6 +13,7 @@ This way, we can use it on arrays that we don't touch. from cpython cimport Py_buffer from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE +from cpython.ref cimport Py_INCREF import numpy as np cimport numpy as np @@ -43,8 +44,15 @@ cdef class ReadonlyArrayWrapper: if request_for_writeable: # The following is a lie when self.wraps is readonly! buffer.readonly = False + buffer.obj = self def __releasebuffer__(self, Py_buffer *buffer): + # restore the state when the buffer was created + # because reassigning buffer.obj decrefs self, and the specification of + # __releasebuffer__ ways we shouldn't do that + Py_INCREF(self) + buffer.obj = self.wraps + buffer.readonly = True PyBuffer_Release(buffer) From bb31851ca613778debb2114db4274446b5ad9112 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 12:37:03 +0100 Subject: [PATCH 04/17] Revert "DEBUG set compiler_directives boundscheck to False" This reverts commit d9f351ebcc8e3b23caa67e9de72db6c38b22d1b4. --- sklearn/_build_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 05aacd704794a..67b5f2c662eb0 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -78,7 +78,7 @@ def cythonize_extensions(top_path, config): }, compiler_directives={ "language_level": 3, - "boundscheck": True, + "boundscheck": False, "wraparound": False, "initializedcheck": False, "nonecheck": False, From 4be72025eff1a4da6766eb3a6e6f920126446218 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 16:38:49 +0100 Subject: [PATCH 05/17] Revert "FIX __releasebuffer__ in ReadonlyArrayWrapper" This reverts commit e70770f7e0a56bc37beac8ae04f7f10418de36d1. 
--- sklearn/utils/_readonly_array_wrapper.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index dc3a1375f046f..03074661a622b 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -13,7 +13,6 @@ This way, we can use it on arrays that we don't touch. from cpython cimport Py_buffer from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE -from cpython.ref cimport Py_INCREF import numpy as np cimport numpy as np @@ -44,15 +43,8 @@ cdef class ReadonlyArrayWrapper: if request_for_writeable: # The following is a lie when self.wraps is readonly! buffer.readonly = False - buffer.obj = self def __releasebuffer__(self, Py_buffer *buffer): - # restore the state when the buffer was created - # because reassigning buffer.obj decrefs self, and the specification of - # __releasebuffer__ ways we shouldn't do that - Py_INCREF(self) - buffer.obj = self.wraps - buffer.readonly = True PyBuffer_Release(buffer) From 69b9bf2b7ba0c845bc308fe825f6e0adad1b1111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 16:46:59 +0100 Subject: [PATCH 06/17] TST add test_contig_mmapped --- sklearn/utils/tests/test_readonly_wrapper.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index c385e1db51cd3..d22e7e1777e8e 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -33,3 +33,12 @@ def test_readonly_array_wrapper(readonly, dtype): x_readonly = ReadonlyArrayWrapper(x_readonly) sum_readonly = _test_sum(x_readonly) assert sum_readonly == pytest.approx(sum_origin, rel=1e-11) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +def test_contig_mmapped(dtype): + x = np.arange(10).astype(dtype) + sum_origin = _test_sum(x) 
+ x_mmap = create_memmap_backed_data(x, mmap_mode="w+") + sum_mmap = _test_sum(x_mmap) + assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) From 6ea910648f3a41cc735294811acf21eb0c0a6f42 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 16:47:57 +0100 Subject: [PATCH 07/17] DEBUG skip test_readonly_array_wrapper --- sklearn/utils/tests/test_readonly_wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index d22e7e1777e8e..fcc8b10607d48 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,6 +13,7 @@ def _readonly_array_copy(x): return y +@pytest.mark.skip @pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): From eb3d274f9d82e2328295151a201e262e522cf07c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Nov 2021 08:45:06 +0100 Subject: [PATCH 08/17] Revert "DEBUG skip test_readonly_array_wrapper" This reverts commit 6ea910648f3a41cc735294811acf21eb0c0a6f42. 
--- sklearn/utils/tests/test_readonly_wrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index fcc8b10607d48..d22e7e1777e8e 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,7 +13,6 @@ def _readonly_array_copy(x): return y -@pytest.mark.skip @pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): From 5d884df87b856bff07acd6101b37d5fbf4f35f8b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Nov 2021 10:46:45 +0100 Subject: [PATCH 09/17] FIX aligned arg for create_memmap_backed_data --- sklearn/utils/_testing.py | 25 +++++++++-- sklearn/utils/tests/test_readonly_wrapper.py | 17 +++----- sklearn/utils/tests/test_testing.py | 46 ++++++++++++++++---- 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1e4ecdd53e136..1724063be2f43 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -def create_memmap_backed_data(data, mmap_mode="r", return_folder=False): +def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False): """ Parameters ---------- data mmap_mode : str, default='r' return_folder : bool, default=False + aligned : bool, default=False + If True, if input is a single numpy array and if the input array is aligned, + the memory mapped array will also be aligned. This is a workaround for + https://github.com/joblib/joblib/issues/563. 
""" temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_") atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) - filename = op.join(temp_folder, "data.pkl") - joblib.dump(data, filename) - memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) + if aligned: + if isinstance(data, np.ndarray) and data.flags.aligned: + # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + filename = op.join(temp_folder, "data.dat") + fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape) + fp[:] = data[:] # write data to memmap array + fp.flush() + memmap_backed_data = np.memmap( + filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape + ) + else: + raise ValueError("If aligned=True, input must be a single numpy array.") + else: + filename = op.join(temp_folder, "data.pkl") + joblib.dump(data, filename) + memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) result = ( memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder) ) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index d22e7e1777e8e..38163cc2461ce 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,7 +13,13 @@ def _readonly_array_copy(x): return y -@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) +def _create_memmap_backed_data(data): + return create_memmap_backed_data( + data, mmap_mode="r", return_folder=False, aligned=True + ) + + +@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): """Test that ReadonlyWrapper allows working with fused-typed.""" @@ -33,12 +39,3 @@ def test_readonly_array_wrapper(readonly, dtype): x_readonly = ReadonlyArrayWrapper(x_readonly) sum_readonly = _test_sum(x_readonly) 
assert sum_readonly == pytest.approx(sum_origin, rel=1e-11) - - -@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) -def test_contig_mmapped(dtype): - x = np.arange(10).astype(dtype) - sum_origin = _test_sum(x) - x_mmap = create_memmap_backed_data(x, mmap_mode="w+") - sum_mmap = _test_sum(x_mmap) - assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index e9e9252bd5f0f..a3a42aeb4c83f 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -12,6 +12,7 @@ from sklearn.utils.deprecation import deprecated from sklearn.utils.metaestimators import available_if, if_delegate_has_method +from sklearn.utils._readonly_array_wrapper import _test_sum from sklearn.utils._testing import ( assert_raises, assert_warns, @@ -680,30 +681,59 @@ def test_tempmemmap(monkeypatch): assert registration_counter.nb_calls == 2 -def test_create_memmap_backed_data(monkeypatch): +@pytest.mark.parametrize("aligned", [False, True]) +def test_create_memmap_backed_data(monkeypatch, aligned): registration_counter = RegistrationCounter() monkeypatch.setattr(atexit, "register", registration_counter) input_array = np.ones(3) - data = create_memmap_backed_data(input_array) + data = create_memmap_backed_data(input_array, aligned=aligned) check_memmap(input_array, data) assert registration_counter.nb_calls == 1 - data, folder = create_memmap_backed_data(input_array, return_folder=True) + data, folder = create_memmap_backed_data( + input_array, return_folder=True, aligned=aligned + ) check_memmap(input_array, data) assert folder == os.path.dirname(data.filename) assert registration_counter.nb_calls == 2 mmap_mode = "r+" - data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode) + data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned) check_memmap(input_array, data, mmap_mode) assert registration_counter.nb_calls == 
3 input_list = [input_array, input_array + 1, input_array + 2] - mmap_data_list = create_memmap_backed_data(input_list) - for input_array, data in zip(input_list, mmap_data_list): - check_memmap(input_array, data) - assert registration_counter.nb_calls == 4 + if aligned: + with pytest.raises( + ValueError, match="If aligned=True, input must be a single numpy array." + ): + create_memmap_backed_data(input_list, aligned=True) + else: + mmap_data_list = create_memmap_backed_data(input_list, aligned=False) + for input_array, data in zip(input_list, mmap_data_list): + check_memmap(input_array, data) + assert registration_counter.nb_calls == 4 + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +def test_memmap_on_contiguous_data(dtype): + """Test memory mapped array on contiguous memoryview.""" + x = np.arange(10).astype(dtype) + assert x.flags["C_CONTIGUOUS"] + assert x.flags["ALIGNED"] + + # _test_sum consumes contiguous arrays + # def _test_sum(NUM_TYPES[::1] x): + sum_origin = _test_sum(x) + + # now on memory mapped data + # aligned=True so avoid https://github.com/joblib/joblib/issues/563 + # without alignment, this can produce segmentation faults, see + # https://github.com/scikit-learn/scikit-learn/pull/21654 + x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True) + sum_mmap = _test_sum(x_mmap) + assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) @pytest.mark.parametrize( From 3ad49c10edb231cce7ea84c9186e2dc0c18b2893 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 17:48:41 +0100 Subject: [PATCH 10/17] DEBUG minimal reproducer for Cython segfault on non-aligned data --- sklearn/utils/tests/test_testing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index a3a42aeb4c83f..7ef5c44f53a85 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -731,7 +731,8 @@ def
test_memmap_on_contiguous_data(dtype): # aligned=True so avoid https://github.com/joblib/joblib/issues/563 # without alignment, this can produce segmentation faults, see # https://github.com/scikit-learn/scikit-learn/pull/21654 - x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True) + x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=False) + print(x_mmap.flags) sum_mmap = _test_sum(x_mmap) assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) From 890d6c3b97094845ac7a88d9f1670ad84ac28aa4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 18:11:30 +0100 Subject: [PATCH 11/17] Print gcc version in azure build log --- build_tools/azure/install.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5bd845147b0f3..0cbbe255795bf 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -160,6 +160,11 @@ try: except ImportError: print('pandas not installed') " + +if [[ $(type -P "gcc") ]]; then + gcc --version +fi + # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. 
export SKLEARN_BUILD_PARALLEL=3 From b8f1e782de34f71abb6003c21868e00cb31719f3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 18:11:45 +0100 Subject: [PATCH 12/17] Faster debug iteration --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 44b06db6621c9..32d01ad3f63bb 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -55,5 +55,5 @@ if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then fi set -x -eval "$TEST_CMD --pyargs sklearn" +eval "$TEST_CMD -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing" set +x From b280de4198cbb4ed39481ea8a6dacb923484bc04 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 18:12:37 +0100 Subject: [PATCH 13/17] disable xdist --- build_tools/azure/test_script.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 32d01ad3f63bb..f649f53f5f3ba 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -46,9 +46,9 @@ if [[ -n "$CHECK_WARNINGS" ]]; then TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" fi -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - TEST_CMD="$TEST_CMD -n2" -fi +# if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then +# TEST_CMD="$TEST_CMD -n2" +# fi if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then TEST_CMD="$TEST_CMD -ra" From eebaa638ce50e0ae1b88b4fbf80ea9f3d1ed007c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 18:35:02 +0100 Subject: [PATCH 14/17] Do not capture stdout --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index f649f53f5f3ba..50eb24aab5098 100755 --- a/build_tools/azure/test_script.sh +++ 
b/build_tools/azure/test_script.sh @@ -55,5 +55,5 @@ if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then fi set -x -eval "$TEST_CMD -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing" +eval "$TEST_CMD -s -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing" set +x From 65180f7f6daf78c993e37434f74c59ee1302f43d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Nov 2021 19:12:04 +0100 Subject: [PATCH 15/17] [cd build] From 61ca684f1349d422f762d75a923f2a7c47299cb8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 16 Nov 2021 11:09:12 +0100 Subject: [PATCH 16/17] Try to force upload wheel artifacts in case of test failure --- .github/workflows/wheels.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 1f133d701ca53..2e04b65493f5f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -110,6 +110,7 @@ jobs: - name: Store artifacts uses: actions/upload-artifact@v2 + if: failure() with: path: wheelhouse/*.whl From f90670fd3cc50bde4a5d10b88345390c46548811 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 16 Nov 2021 11:09:23 +0100 Subject: [PATCH 17/17] [cd build]