diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 1f133d701ca53..2e04b65493f5f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -110,6 +110,7 @@ jobs: - name: Store artifacts uses: actions/upload-artifact@v2 + if: failure() with: path: wheelhouse/*.whl diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5bd845147b0f3..0cbbe255795bf 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -160,6 +160,11 @@ try: except ImportError: print('pandas not installed') " + +if [[ $(type -P "gcc") ]]; then + gcc --version +fi + # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 44b06db6621c9..50eb24aab5098 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -46,14 +46,14 @@ if [[ -n "$CHECK_WARNINGS" ]]; then TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" fi -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - TEST_CMD="$TEST_CMD -n2" -fi +# if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then +# TEST_CMD="$TEST_CMD -n2" +# fi if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then TEST_CMD="$TEST_CMD -ra" fi set -x -eval "$TEST_CMD --pyargs sklearn" +eval "$TEST_CMD -s -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing" set +x diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index 55ac82f9d80fd..03074661a622b 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -48,7 +48,7 @@ cdef class ReadonlyArrayWrapper: PyBuffer_Release(buffer) -def _test_sum(NUM_TYPES[:] x): +def _test_sum(NUM_TYPES[::1] x): """This function is for testing only. 
As this function does not modify x, we would like to define it as diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1e4ecdd53e136..1724063be2f43 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -def create_memmap_backed_data(data, mmap_mode="r", return_folder=False): +def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False): """ Parameters ---------- data mmap_mode : str, default='r' return_folder : bool, default=False + aligned : bool, default=False + If True, if input is a single numpy array and if the input array is aligned, + the memory mapped array will also be aligned. This is a workaround for + https://github.com/joblib/joblib/issues/563. """ temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_") atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) - filename = op.join(temp_folder, "data.pkl") - joblib.dump(data, filename) - memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) + if aligned: + if isinstance(data, np.ndarray) and data.flags.aligned: + # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + filename = op.join(temp_folder, "data.dat") + fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape) + fp[:] = data[:] # write data to memmap array + fp.flush() + memmap_backed_data = np.memmap( + filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape + ) + else: + raise ValueError("If aligned=True, input must be a single numpy array.") + else: + filename = op.join(temp_folder, "data.pkl") + joblib.dump(data, filename) + memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) result = ( memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder) ) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index c385e1db51cd3..38163cc2461ce 
100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,7 +13,13 @@ def _readonly_array_copy(x): return y -@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) +def _create_memmap_backed_data(data): + return create_memmap_backed_data( + data, mmap_mode="r", return_folder=False, aligned=True + ) + + +@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): """Test that ReadonlyWrapper allows working with fused-typed.""" diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index e9e9252bd5f0f..7ef5c44f53a85 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -12,6 +12,7 @@ from sklearn.utils.deprecation import deprecated from sklearn.utils.metaestimators import available_if, if_delegate_has_method +from sklearn.utils._readonly_array_wrapper import _test_sum from sklearn.utils._testing import ( assert_raises, assert_warns, @@ -680,30 +681,60 @@ def test_tempmemmap(monkeypatch): assert registration_counter.nb_calls == 2 -def test_create_memmap_backed_data(monkeypatch): +@pytest.mark.parametrize("aligned", [False, True]) +def test_create_memmap_backed_data(monkeypatch, aligned): registration_counter = RegistrationCounter() monkeypatch.setattr(atexit, "register", registration_counter) input_array = np.ones(3) - data = create_memmap_backed_data(input_array) + data = create_memmap_backed_data(input_array, aligned=aligned) check_memmap(input_array, data) assert registration_counter.nb_calls == 1 - data, folder = create_memmap_backed_data(input_array, return_folder=True) + data, folder = create_memmap_backed_data( + input_array, return_folder=True, aligned=aligned + ) check_memmap(input_array, data) assert folder == os.path.dirname(data.filename) 
assert registration_counter.nb_calls == 2 mmap_mode = "r+" - data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode) + data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned) check_memmap(input_array, data, mmap_mode) assert registration_counter.nb_calls == 3 input_list = [input_array, input_array + 1, input_array + 2] - mmap_data_list = create_memmap_backed_data(input_list) - for input_array, data in zip(input_list, mmap_data_list): - check_memmap(input_array, data) - assert registration_counter.nb_calls == 4 + if aligned: + with pytest.raises( + ValueError, match="If aligned=True, input must be a single numpy array." + ): + create_memmap_backed_data(input_list, aligned=True) + else: + mmap_data_list = create_memmap_backed_data(input_list, aligned=False) + for input_array, data in zip(input_list, mmap_data_list): + check_memmap(input_array, data) + assert registration_counter.nb_calls == 4 + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +def test_memmap_on_contiguous_data(dtype): + """Test memory mapped array on contiguous memoryview.""" + x = np.arange(10).astype(dtype) + assert x.flags["C_CONTIGUOUS"] + assert x.flags["ALIGNED"] + + # _test_sum consumes contiguous arrays + # def _test_sum(NUM_TYPES[::1] x): + sum_origin = _test_sum(x) + + # now on memory mapped data + # aligned=True to avoid https://github.com/joblib/joblib/issues/563 + # without alignment, this can produce segmentation faults, see + # https://github.com/scikit-learn/scikit-learn/pull/21654 + x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=False) + print(x_mmap.flags) + sum_mmap = _test_sum(x_mmap) + assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) @pytest.mark.parametrize(