diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 53c416c506614..56799e7ad77e1 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -118,6 +118,15 @@ Changelog when `y=None`. :pr: `15918` by :user: `Luca Kubin `. +:mod:`sklearn.neighbors` +.............................. + +- |Fix| Fix a bug which converted a list of arrays into a 2-D object + array instead of a 1-D array containing NumPy arrays. This bug + was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. + :pr:`16076` by :user:`Guillaume Lemaitre ` and + :user:`Alex Shacked `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 258440d20c836..f927c26868a5f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -24,6 +24,7 @@ from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import check_non_negative @@ -276,8 +277,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object) - neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object) + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind @@ -940,17 +941,12 @@ class from an array representing our data set and ask who's neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) neigh_dist_list = sum(neigh_dist_chunks, []) neigh_ind_list = sum(neigh_ind_chunks, []) - # See https://github.com/numpy/numpy/issues/5456 - # to understand why this is 
initialized this way. - neigh_dist = np.empty(len(neigh_dist_list), dtype='object') - neigh_dist[:] = neigh_dist_list - neigh_ind = np.empty(len(neigh_ind_list), dtype='object') - neigh_ind[:] = neigh_ind_list + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) - results = np.empty(len(neigh_ind_list), dtype='object') - results[:] = neigh_ind_list + results = _to_object_array(neigh_ind_list) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 03c79086dfedd..88e32669777a1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -649,6 +649,30 @@ def test_radius_neighbors_boundary_handling(): assert_array_equal(results[0], [0, 1]) +def test_radius_neighbors_returns_array_of_objects(): + # check that NearestNeighbors.radius_neighbors() returns 1-D object + # arrays even when every sample has the same number of neighbors; + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16036 + X = csr_matrix(np.ones((4, 4))) + X.setdiag([0, 0, 0, 0]) + + nbrs = neighbors.NearestNeighbors(radius=0.5, algorithm='auto', + leaf_size=30, + metric='precomputed').fit(X) + neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) + + expected_dist = np.empty(X.shape[0], dtype=object) + expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), + np.array([0])] + expected_ind = np.empty(X.shape[0], dtype=object) + expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), + np.array([3])] + + assert_array_equal(neigh_dist, expected_dist) + assert_array_equal(neigh_ind, expected_ind) + + def test_RadiusNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) diff --git a/sklearn/preprocessing/tests/test_label.py 
b/sklearn/preprocessing/tests/test_label.py index 6cdb198182a20..887fa90c98d61 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -14,6 +14,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings +from sklearn.utils import _to_object_array from sklearn.preprocessing._label import LabelBinarizer from sklearn.preprocessing._label import MultiLabelBinarizer @@ -433,8 +434,7 @@ def test_multilabel_binarizer_same_length_sequence(): def test_multilabel_binarizer_non_integer_labels(): - tuple_classes = np.empty(3, dtype=object) - tuple_classes[:] = [(1,), (2,), (3,)] + tuple_classes = _to_object_array([(1,), (2,), (3,)]) inputs = [ ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']), ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']), diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 82abff2b12183..ee38b9b924ccc 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -819,6 +819,45 @@ def tosequence(x): return list(x) +def _to_object_array(sequence): + """Convert sequence to a 1-D NumPy array of object dtype. + + The numpy.array constructor has a similar use but its output + is ambiguous. It can be a 1-D NumPy array of object dtype if + the input is a ragged array, but if the input is a list of + equal length arrays, then the output is a 2D numpy.array. + _to_object_array solves this ambiguity by guaranteeing that + the output is a 1-D NumPy array of objects for any input. + + Parameters + ---------- + sequence : array-like of shape (n_elements,) + The sequence to be converted. + + Returns + ------- + out : ndarray of shape (n_elements,), dtype=object + The sequence converted into a 1-D NumPy array of object dtype.
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.utils import _to_object_array + >>> _to_object_array([np.array([0]), np.array([1])]) + array([array([0]), array([1])], dtype=object) + >>> _to_object_array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) + >>> np.array([np.array([0]), np.array([1])]) + array([[0], + [1]]) + >>> np.array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) + """ + out = np.empty(len(sequence), dtype=object) + out[:] = sequence + return out + + def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 2e2711f595d11..c3ae523b32b39 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -28,6 +28,7 @@ from sklearn.utils import _message_with_time, _print_elapsed_time from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan +from sklearn.utils import _to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context @@ -646,3 +647,14 @@ def test_deprecation_joblib_api(tmpdir): from sklearn.utils._joblib import joblib del joblib.parallel.BACKENDS['failing'] + + +@pytest.mark.parametrize( + "sequence", + [[np.array(1), np.array(2)], [[1, 2], [3, 4]]] +) +def test_to_object_array(sequence): + out = _to_object_array(sequence) + assert isinstance(out, np.ndarray) + assert out.dtype.kind == 'O' + assert out.ndim == 1