Thanks to visit codestin.com
Credit goes to github.com

Skip to content

DEBUG try to reproduce Cython segfault on non-aligned data #21677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ jobs:

- name: Store artifacts
uses: actions/upload-artifact@v2
if: failure()
with:
path: wheelhouse/*.whl

Expand Down
5 changes: 5 additions & 0 deletions build_tools/azure/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,11 @@ try:
except ImportError:
print('pandas not installed')
"

if [[ $(type -P "gcc") ]]; then
gcc --version
fi

# Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI
# workers with 2 cores when building the compiled extensions of scikit-learn.
export SKLEARN_BUILD_PARALLEL=3
Expand Down
8 changes: 4 additions & 4 deletions build_tools/azure/test_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ if [[ -n "$CHECK_WARNINGS" ]]; then
TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning"
fi

if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then
TEST_CMD="$TEST_CMD -n2"
fi
# if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then
# TEST_CMD="$TEST_CMD -n2"
# fi

if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then
TEST_CMD="$TEST_CMD -ra"
fi

set -x
eval "$TEST_CMD --pyargs sklearn"
eval "$TEST_CMD -s -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing"
set +x
2 changes: 1 addition & 1 deletion sklearn/utils/_readonly_array_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ cdef class ReadonlyArrayWrapper:
PyBuffer_Release(buffer)


def _test_sum(NUM_TYPES[:] x):
def _test_sum(NUM_TYPES[::1] x):
"""This function is for testing only.

As this function does not modify x, we would like to define it as
Expand Down
25 changes: 21 additions & 4 deletions sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)


def create_memmap_backed_data(data, mmap_mode="r", return_folder=False):
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
"""
Parameters
----------
data
mmap_mode : str, default='r'
return_folder : bool, default=False
aligned : bool, default=False
If True, if input is a single numpy array and if the input array is aligned,
the memory mapped array will also be aligned. This is a workaround for
https://github.com/joblib/joblib/issues/563.
"""
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
else:
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
result = (
memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)
)
Expand Down
8 changes: 7 additions & 1 deletion sklearn/utils/tests/test_readonly_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ def _readonly_array_copy(x):
return y


@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data])
def _create_memmap_backed_data(data):
return create_memmap_backed_data(
data, mmap_mode="r", return_folder=False, aligned=True
)


@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_readonly_array_wrapper(readonly, dtype):
"""Test that ReadonlyWrapper allows working with fused-typed."""
Expand Down
47 changes: 39 additions & 8 deletions sklearn/utils/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from sklearn.utils.deprecation import deprecated
from sklearn.utils.metaestimators import available_if, if_delegate_has_method
from sklearn.utils._readonly_array_wrapper import _test_sum
from sklearn.utils._testing import (
assert_raises,
assert_warns,
Expand Down Expand Up @@ -680,30 +681,60 @@ def test_tempmemmap(monkeypatch):
assert registration_counter.nb_calls == 2


def test_create_memmap_backed_data(monkeypatch):
@pytest.mark.parametrize("aligned", [False, True])
def test_create_memmap_backed_data(monkeypatch, aligned):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, "register", registration_counter)

input_array = np.ones(3)
data = create_memmap_backed_data(input_array)
data = create_memmap_backed_data(input_array, aligned=aligned)
check_memmap(input_array, data)
assert registration_counter.nb_calls == 1

data, folder = create_memmap_backed_data(input_array, return_folder=True)
data, folder = create_memmap_backed_data(
input_array, return_folder=True, aligned=aligned
)
check_memmap(input_array, data)
assert folder == os.path.dirname(data.filename)
assert registration_counter.nb_calls == 2

mmap_mode = "r+"
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned)
check_memmap(input_array, data, mmap_mode)
assert registration_counter.nb_calls == 3

input_list = [input_array, input_array + 1, input_array + 2]
mmap_data_list = create_memmap_backed_data(input_list)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4
if aligned:
with pytest.raises(
ValueError, match="If aligned=True, input must be a single numpy array."
):
create_memmap_backed_data(input_list, aligned=True)
else:
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4


@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
"""Test memory mapped array on contigous memoryview."""
x = np.arange(10).astype(dtype)
assert x.flags["C_CONTIGUOUS"]
assert x.flags["ALIGNED"]

# _test_sum consumes contiguous arrays
# def _test_sum(NUM_TYPES[::1] x):
sum_origin = _test_sum(x)

# now on memory mapped data
# aligned=True so avoid https://github.com/joblib/joblib/issues/563
# without alignment, this can produce segmentation faults, see
# https://github.com/scikit-learn/scikit-learn/pull/21654
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=False)
print(x_mmap.flags)
sum_mmap = _test_sum(x_mmap)
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)


@pytest.mark.parametrize(
Expand Down