diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 1f133d701ca53..2e04b65493f5f 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -110,6 +110,7 @@ jobs:
 
       - name: Store artifacts
         uses: actions/upload-artifact@v2
+        if: failure()
         with:
           path: wheelhouse/*.whl
 
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 5bd845147b0f3..0cbbe255795bf 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -160,6 +160,11 @@ try:
 except ImportError:
     print('pandas not installed')
 "
+
+if [[ $(type -P "gcc") ]]; then
+    gcc --version
+fi
+
 # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI
 # workers with 2 cores when building the compiled extensions of scikit-learn.
 export SKLEARN_BUILD_PARALLEL=3
diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
index 44b06db6621c9..50eb24aab5098 100755
--- a/build_tools/azure/test_script.sh
+++ b/build_tools/azure/test_script.sh
@@ -46,14 +46,14 @@ if [[ -n "$CHECK_WARNINGS" ]]; then
     TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning"
 fi
 
-if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then
-    TEST_CMD="$TEST_CMD -n2"
-fi
+# if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then
+#     TEST_CMD="$TEST_CMD -n2"
+# fi
 
 if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then
     TEST_CMD="$TEST_CMD -ra"
 fi
 
 set -x
-eval "$TEST_CMD --pyargs sklearn"
+eval "$TEST_CMD -s -v -k test_memmap_on_contiguous_data --pyargs sklearn.utils.tests.test_testing"
 set +x
diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx
index 55ac82f9d80fd..03074661a622b 100644
--- a/sklearn/utils/_readonly_array_wrapper.pyx
+++ b/sklearn/utils/_readonly_array_wrapper.pyx
@@ -48,7 +48,7 @@ cdef class ReadonlyArrayWrapper:
         PyBuffer_Release(buffer)
 
 
-def _test_sum(NUM_TYPES[:] x):
+def _test_sum(NUM_TYPES[::1] x):
     """This function is for testing only.
 
     As this function does not modify x, we would like to define it as
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index 1e4ecdd53e136..1724063be2f43 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         _delete_folder(self.temp_folder)
 
 
-def create_memmap_backed_data(data, mmap_mode="r", return_folder=False):
+def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
     """
     Parameters
     ----------
     data
     mmap_mode : str, default='r'
     return_folder :  bool, default=False
+    aligned : bool, default=False
+        If True, the input must be a single aligned numpy array (a ValueError is
+        raised otherwise) and the memory mapped array will also be aligned.
+        This is a workaround for https://github.com/joblib/joblib/issues/563.
     """
     temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
     atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
-    filename = op.join(temp_folder, "data.pkl")
-    joblib.dump(data, filename)
-    memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
+    if aligned:
+        if isinstance(data, np.ndarray) and data.flags.aligned:
+            # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
+            filename = op.join(temp_folder, "data.dat")
+            fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
+            fp[:] = data[:]  # write data to memmap array
+            fp.flush()
+            memmap_backed_data = np.memmap(
+                filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
+            )
+        else:
+            raise ValueError("If aligned=True, input must be a single numpy array.")
+    else:
+        filename = op.join(temp_folder, "data.pkl")
+        joblib.dump(data, filename)
+        memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
     result = (
         memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)
     )
diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py
index c385e1db51cd3..38163cc2461ce 100644
--- a/sklearn/utils/tests/test_readonly_wrapper.py
+++ b/sklearn/utils/tests/test_readonly_wrapper.py
@@ -13,7 +13,13 @@ def _readonly_array_copy(x):
     return y
 
 
-@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data])
+def _create_memmap_backed_data(data):
+    return create_memmap_backed_data(
+        data, mmap_mode="r", return_folder=False, aligned=True
+    )
+
+
+@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
 def test_readonly_array_wrapper(readonly, dtype):
     """Test that ReadonlyWrapper allows working with fused-typed."""
diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py
index e9e9252bd5f0f..7ef5c44f53a85 100644
--- a/sklearn/utils/tests/test_testing.py
+++ b/sklearn/utils/tests/test_testing.py
@@ -12,6 +12,7 @@
 
 from sklearn.utils.deprecation import deprecated
 from sklearn.utils.metaestimators import available_if, if_delegate_has_method
+from sklearn.utils._readonly_array_wrapper import _test_sum
 from sklearn.utils._testing import (
     assert_raises,
     assert_warns,
@@ -680,30 +681,60 @@ def test_tempmemmap(monkeypatch):
     assert registration_counter.nb_calls == 2
 
 
-def test_create_memmap_backed_data(monkeypatch):
+@pytest.mark.parametrize("aligned", [False, True])
+def test_create_memmap_backed_data(monkeypatch, aligned):
     registration_counter = RegistrationCounter()
     monkeypatch.setattr(atexit, "register", registration_counter)
 
     input_array = np.ones(3)
-    data = create_memmap_backed_data(input_array)
+    data = create_memmap_backed_data(input_array, aligned=aligned)
     check_memmap(input_array, data)
     assert registration_counter.nb_calls == 1
 
-    data, folder = create_memmap_backed_data(input_array, return_folder=True)
+    data, folder = create_memmap_backed_data(
+        input_array, return_folder=True, aligned=aligned
+    )
     check_memmap(input_array, data)
     assert folder == os.path.dirname(data.filename)
     assert registration_counter.nb_calls == 2
 
     mmap_mode = "r+"
-    data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
+    data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned)
     check_memmap(input_array, data, mmap_mode)
     assert registration_counter.nb_calls == 3
 
     input_list = [input_array, input_array + 1, input_array + 2]
-    mmap_data_list = create_memmap_backed_data(input_list)
-    for input_array, data in zip(input_list, mmap_data_list):
-        check_memmap(input_array, data)
-    assert registration_counter.nb_calls == 4
+    if aligned:
+        with pytest.raises(
+            ValueError, match="If aligned=True, input must be a single numpy array."
+        ):
+            create_memmap_backed_data(input_list, aligned=True)
+    else:
+        mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
+        for input_array, data in zip(input_list, mmap_data_list):
+            check_memmap(input_array, data)
+        assert registration_counter.nb_calls == 4
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
+def test_memmap_on_contiguous_data(dtype):
+    """Test memory mapped array on contiguous memoryview."""
+    x = np.arange(10).astype(dtype)
+    assert x.flags["C_CONTIGUOUS"]
+    assert x.flags["ALIGNED"]
+
+    # _test_sum consumes contiguous arrays
+    # def _test_sum(NUM_TYPES[::1] x):
+    sum_origin = _test_sum(x)
+
+    # now on memory mapped data
+    # aligned=True to avoid https://github.com/joblib/joblib/issues/563
+    # without alignment, this can produce segmentation faults, see
+    # https://github.com/scikit-learn/scikit-learn/pull/21654
+    x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
+    assert x_mmap.flags["ALIGNED"]
+    sum_mmap = _test_sum(x_mmap)
+    assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
 
 
 @pytest.mark.parametrize(