From c2abc89461b48d1096115dcf9f70c15a15b11d00 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Thu, 9 Jan 2020 18:15:09 +0200 Subject: [PATCH 01/32] [MRG] Using dbscan with precomputed neighbors gives an error in 0.22.X, but not in 0.21.3 (#16036) --- sklearn/neighbors/_base.py | 11 +++++++++-- sklearn/neighbors/tests/test_neighbors.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 258440d20c836..a8f0134752a65 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -276,8 +276,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object) - neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object) + neigh_dist = _array_of_objects(np.split(data, indptr[1:-1])) + neigh_ind = _array_of_objects(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind @@ -285,6 +285,13 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): return neigh_ind +def _array_of_objects(sequence): + """ casts a sequence to an array of objects""" + aro = np.empty(len(sequence), dtype=object) + aro[:] = sequence + return aro + + class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """Base class for nearest neighbors estimators.""" diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 03c79086dfedd..477a782d9abf5 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -649,6 +649,24 @@ def test_radius_neighbors_boundary_handling(): assert_array_equal(results[0], [0, 1]) +def test_radius_neighbors_returns_array_of_objects(): + """regression for issue #16036 + """ + X = csr_matrix(np.ones((4, 4))) + X.setdiag([0, 0, 0, 0]) + + nbrs = 
neighbors.NearestNeighbors(radius=0.5, algorithm='auto', + leaf_size=30, metric='precomputed', + metric_params=None, p=2, + n_jobs=None).fit(X) + results = nbrs.radius_neighbors(X, return_distance=False) + + expected = np.empty(X.shape[0], dtype=object) + expected[:] = [np.array([0]), np.array([1]), np.array([2]), + np.array([3])] + assert_array_equal(results, expected) + + def test_RadiusNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) From 06f01d4e9413515fe7526288bbab7084bf10765a Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 00:44:20 +0200 Subject: [PATCH 02/32] [MRG] Using dbscan with precomputed neighbours. (#16036) in regression testing also distances --- sklearn/neighbors/tests/test_neighbors.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 477a782d9abf5..3790a87f47bae 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -656,15 +656,19 @@ def test_radius_neighbors_returns_array_of_objects(): X.setdiag([0, 0, 0, 0]) nbrs = neighbors.NearestNeighbors(radius=0.5, algorithm='auto', - leaf_size=30, metric='precomputed', - metric_params=None, p=2, - n_jobs=None).fit(X) - results = nbrs.radius_neighbors(X, return_distance=False) - - expected = np.empty(X.shape[0], dtype=object) - expected[:] = [np.array([0]), np.array([1]), np.array([2]), + leaf_size=30, + metric='precomputed').fit(X) + neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) + + expected_dist = np.empty(X.shape[0], dtype=object) + expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), + np.array([0])] + expected_ind = np.empty(X.shape[0], dtype=object) + expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), np.array([3])] - assert_array_equal(results, expected) + + assert_array_equal(neigh_dist, 
expected_dist) + assert_array_equal(neigh_ind, expected_ind) def test_RadiusNeighborsClassifier_multioutput(): From 0fca6f0983211f22d1653b263f22f794b38af0fe Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 00:50:19 +0200 Subject: [PATCH 03/32] [MRG] Using dbscan with precomputed neighbours. (#16036) PEP fixes --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 3790a87f47bae..a671cf19ea68f 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -665,7 +665,7 @@ def test_radius_neighbors_returns_array_of_objects(): np.array([0])] expected_ind = np.empty(X.shape[0], dtype=object) expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), - np.array([3])] + np.array([3])] assert_array_equal(neigh_dist, expected_dist) assert_array_equal(neigh_ind, expected_ind) From 887f154bc3bdae53b0e45238c80d0f9e5922861e Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 01:12:33 +0200 Subject: [PATCH 04/32] [MRG] Using dbscan with precomputed neighbours. (#16036) change log --- doc/whats_new/v0.23.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 53c416c506614..a6e9d62c439e3 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -118,6 +118,15 @@ Changelog when `y=None`. :pr: `15918` by :user: `Luca Kubin `. +:mod:`sklearn.neighbors` +.............................. + +- |Fix| Fixed bug in function _radius_neighbors_from_graph(), module + neighbours._base.py. The fix ensures _radius_neighbors_from_graph() + will always return a numpy.array of objects. + :pr:`16076` by :user: `Guillaume Lemaitre ` and + `Alex Shacked `. + :mod:`sklearn.preprocessing` ............................ 
From e582a40adb458e005cce9f9bffb84fddfe7f0f2f Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:15:58 +0200 Subject: [PATCH 05/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index a8f0134752a65..0e461514db880 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -276,7 +276,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = _array_of_objects(np.split(data, indptr[1:-1])) + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) neigh_ind = _array_of_objects(np.split(indices, indptr[1:-1])) if return_distance: From eda47873017892a3c10c34dfabd0d2fee3b9773d Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:16:23 +0200 Subject: [PATCH 06/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 0e461514db880..ea81f7eda6bdf 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -277,7 +277,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): if return_distance: neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) - neigh_ind = _array_of_objects(np.split(indices, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind From eedad66e6a95f7424572977b8225b3b6fc8e7cc3 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:17:43 +0200 Subject: [PATCH 07/32] Update sklearn/neighbors/tests/test_neighbors.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/tests/test_neighbors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a671cf19ea68f..dc8bab1d14b14 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -650,7 +650,10 @@ def test_radius_neighbors_boundary_handling(): def test_radius_neighbors_returns_array_of_objects(): - """regression for issue #16036 + # check that we can pass precomputed distances to + # NearestNeighbors.radius_neighbors() + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16036 """ X = csr_matrix(np.ones((4, 4))) X.setdiag([0, 0, 0, 0]) From bfeee9265f65553f0595160282f73977150d6313 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:20:10 +0200 Subject: [PATCH 08/32] Update sklearn/neighbors/tests/test_neighbors.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/tests/test_neighbors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index dc8bab1d14b14..88e32669777a1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -654,7 +654,6 @@ def test_radius_neighbors_returns_array_of_objects(): # NearestNeighbors.radius_neighbors() # non-regression test for # https://github.com/scikit-learn/scikit-learn/issues/16036 - """ X = csr_matrix(np.ones((4, 4))) X.setdiag([0, 0, 0, 0]) From e13297cf1787b233bf6499751a8fd38ec39d45b1 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:21:46 +0200 Subject: [PATCH 09/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ea81f7eda6bdf..e894bd6398114 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -285,7 +285,7 @@ def _radius_neighbors_from_graph(graph, 
radius, return_distance): return neigh_ind -def _array_of_objects(sequence): +def _to_object_array(sequence): """ casts a sequence to an array of objects""" aro = np.empty(len(sequence), dtype=object) aro[:] = sequence From a512168fc5588247bcec4adfd064e012fb7b6548 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:22:13 +0200 Subject: [PATCH 10/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e894bd6398114..d4829f41b1cdc 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -286,7 +286,6 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): def _to_object_array(sequence): - """ casts a sequence to an array of objects""" aro = np.empty(len(sequence), dtype=object) aro[:] = sequence return aro From a538fac3a0e6d50fcb7c237058bac110f0ad6333 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:22:44 +0200 Subject: [PATCH 11/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index d4829f41b1cdc..303299be516f2 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -286,7 +286,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): def _to_object_array(sequence): - aro = np.empty(len(sequence), dtype=object) + out = np.empty(len(sequence), dtype=object) aro[:] = sequence return aro From 19b23b2ce195dcca7b6e4273395005de487b82bc Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:22:57 +0200 Subject: [PATCH 12/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py 
b/sklearn/neighbors/_base.py index 303299be516f2..04c5d2ec1dc88 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -287,7 +287,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): def _to_object_array(sequence): out = np.empty(len(sequence), dtype=object) - aro[:] = sequence + out[:] = sequence return aro From af6b74de12b108b363e4aa55606fe95251110862 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 01:23:07 +0200 Subject: [PATCH 13/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 04c5d2ec1dc88..cc5bb11149c1f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -288,7 +288,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): def _to_object_array(sequence): out = np.empty(len(sequence), dtype=object) out[:] = sequence - return aro + return out class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): From e2d9f8ba979450c3e1cfc9cd0e1b3361a12f0de3 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 02:04:42 +0200 Subject: [PATCH 14/32] (#16036) refactoring with new function _to_object_array() --- sklearn/neighbors/_base.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index cc5bb11149c1f..adc8a48f03f8c 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -948,15 +948,12 @@ class from an array representing our data set and ask who's neigh_ind_list = sum(neigh_ind_chunks, []) # See https://github.com/numpy/numpy/issues/5456 # to understand why this is initialized this way. 
- neigh_dist = np.empty(len(neigh_dist_list), dtype='object') - neigh_dist[:] = neigh_dist_list - neigh_ind = np.empty(len(neigh_ind_list), dtype='object') - neigh_ind[:] = neigh_ind_list + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) - results = np.empty(len(neigh_ind_list), dtype='object') - results[:] = neigh_ind_list + results = _to_object_array(neigh_ind_list) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): From 93d87d54c5b563524c578f8c26273e644cec17bb Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 15:44:54 +0200 Subject: [PATCH 15/32] Update doc/whats_new/v0.23.rst Co-Authored-By: Guillaume Lemaitre --- doc/whats_new/v0.23.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index a6e9d62c439e3..1dd6e8e53b67a 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -125,7 +125,7 @@ Changelog neighbours._base.py. The fix ensures _radius_neighbors_from_graph() will always return a numpy.array of objects. :pr:`16076` by :user: `Guillaume Lemaitre ` and - `Alex Shacked `. + :user:`Alex Shacked `. :mod:`sklearn.preprocessing` ............................ From 71f9c7faaaaa57acd298a5e186caff9afba3313f Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 15:45:18 +0200 Subject: [PATCH 16/32] Update doc/whats_new/v0.23.rst Co-Authored-By: Guillaume Lemaitre --- doc/whats_new/v0.23.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 1dd6e8e53b67a..78cd539de22c3 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -124,7 +124,7 @@ Changelog - |Fix| Fixed bug in function _radius_neighbors_from_graph(), module neighbours._base.py. The fix ensures _radius_neighbors_from_graph() will always return a numpy.array of objects. 
- :pr:`16076` by :user: `Guillaume Lemaitre ` and + :pr:`16076` by :user:`Guillaume Lemaitre ` and :user:`Alex Shacked `. :mod:`sklearn.preprocessing` From 33b68e5450ccabee47bb7f50b024d5d033d49510 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 15:48:38 +0200 Subject: [PATCH 17/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index adc8a48f03f8c..8c7c5e31f4a8e 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -286,6 +286,18 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): def _to_object_array(sequence): + """Convert sequence to a 1-D NumPy array of object dtype. + + Parameters + ---------- + sequence : array-like of shape (n_elements,) + The sequence to be converted + + Returns + ------- + out : ndarray of shape (n_elements,), dtype=object + The converted sequence into a 1-D NumPy array of object dtype. + """ out = np.empty(len(sequence), dtype=object) out[:] = sequence return out From 5edfeb64f0a6bac60fa3cc6fd01ae8f8f6516053 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 17:31:10 +0200 Subject: [PATCH 18/32] [MRG] Using dbscan with precomputed neighbours. (#16036) moved to_object_array() to sklearn.utils --- doc/whats_new/v0.23.rst | 6 ++--- sklearn/neighbors/_base.py | 29 +++++------------------ sklearn/preprocessing/tests/test_label.py | 4 ++-- sklearn/utils/__init__.py | 27 +++++++++++++++++++++ sklearn/utils/tests/test_utils.py | 12 ++++++++++ 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 78cd539de22c3..1e1772988b94f 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -121,9 +121,9 @@ Changelog :mod:`sklearn.neighbors` .............................. 
-- |Fix| Fixed bug in function _radius_neighbors_from_graph(), module - neighbours._base.py. The fix ensures _radius_neighbors_from_graph() - will always return a numpy.array of objects. +- |Fix| fix a bug which converted a list of arrays into a 2-D object + array instead of a 1-D array containing NumPy arrays. This bug + was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. :pr:`16076` by :user:`Guillaume Lemaitre ` and :user:`Alex Shacked `. diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 8c7c5e31f4a8e..2ce2a519a2886 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -24,6 +24,7 @@ from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import check_non_negative @@ -276,8 +277,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) - neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) + neigh_dist = to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = to_object_array(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind @@ -285,24 +286,6 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): return neigh_ind -def _to_object_array(sequence): - """Convert sequence to a 1-D NumPy array of object dtype. - - Parameters - ---------- - sequence : array-like of shape (n_elements,) - The sequence to be converted - - Returns - ------- - out : ndarray of shape (n_elements,), dtype=object - The converted sequence into a 1-D NumPy array of object dtype. 
- """ - out = np.empty(len(sequence), dtype=object) - out[:] = sequence - return out - - class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """Base class for nearest neighbors estimators.""" @@ -960,12 +943,12 @@ class from an array representing our data set and ask who's neigh_ind_list = sum(neigh_ind_chunks, []) # See https://github.com/numpy/numpy/issues/5456 # to understand why this is initialized this way. - neigh_dist = _to_object_array(neigh_dist_list) - neigh_ind = _to_object_array(neigh_ind_list) + neigh_dist = to_object_array(neigh_dist_list) + neigh_ind = to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) - results = _to_object_array(neigh_ind_list) + results = to_object_array(neigh_ind_list) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 6cdb198182a20..688db4447e52d 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -14,6 +14,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings +from sklearn.utils import to_object_array from sklearn.preprocessing._label import LabelBinarizer from sklearn.preprocessing._label import MultiLabelBinarizer @@ -433,8 +434,7 @@ def test_multilabel_binarizer_same_length_sequence(): def test_multilabel_binarizer_non_integer_labels(): - tuple_classes = np.empty(3, dtype=object) - tuple_classes[:] = [(1,), (2,), (3,)] + tuple_classes = to_object_array([(1,), (2,), (3,)]) inputs = [ ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']), ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']), diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 82abff2b12183..22305c32bdd7f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ 
-819,6 +819,33 @@ def tosequence(x): return list(x) +def to_object_array(sequence): + """Convert sequence to a 1-D NumPy array of object dtype. + + Parameters + ---------- + sequence : array-like of shape (n_elements,) + The sequence to be converted + + Returns + ------- + out : ndarray of shape (n_elements,), dtype=object + The converted sequence into a 1-D NumPy array of object dtype. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.utils import to_object_array + >>> to_object_array([np.array([0]), np.array([1])]) + array([array([0]), array([1])], dtype=object) + >>> to_object_array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) + """ + out = np.empty(len(sequence), dtype=object) + out[:] = sequence + return out + + def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 2e2711f595d11..69926afa1867c 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -28,6 +28,7 @@ from sklearn.utils import _message_with_time, _print_elapsed_time from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan +from sklearn.utils import to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context @@ -646,3 +647,14 @@ def test_deprecation_joblib_api(tmpdir): from sklearn.utils._joblib import joblib del joblib.parallel.BACKENDS['failing'] + + +@pytest.mark.parametrize( + "sequence", + [[np.array(1), np.array(2)], [[1, 2], [3, 4]]] +) +def test_to_object_array(sequence): + out = to_object_array(sequence) + assert isinstance(out, np.ndarray) + assert out.dtype.kind == 'O' + assert out.ndim == 1 \ No newline at end of file From d3e05ddd0caa9a262ed0b4a3e7520e2f1c353782 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 17:37:47 +0200 Subject: [PATCH 19/32] [MRG] Using dbscan with precomputed 
neighbours. (#16036) fix PEP8 errors --- sklearn/utils/tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 69926afa1867c..dc339b871446f 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -657,4 +657,4 @@ def test_to_object_array(sequence): out = to_object_array(sequence) assert isinstance(out, np.ndarray) assert out.dtype.kind == 'O' - assert out.ndim == 1 \ No newline at end of file + assert out.ndim == 1 From a992048343e17a754147d0ad508bdedd34521601 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:18:43 +0200 Subject: [PATCH 20/32] Update sklearn/preprocessing/tests/test_label.py Co-Authored-By: Guillaume Lemaitre --- sklearn/preprocessing/tests/test_label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 688db4447e52d..502c6dba0e712 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -434,7 +434,7 @@ def test_multilabel_binarizer_same_length_sequence(): def test_multilabel_binarizer_non_integer_labels(): - tuple_classes = to_object_array([(1,), (2,), (3,)]) + tuple_classes = _to_object_array([(1,), (2,), (3,)]) inputs = [ ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']), ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']), From ece10a1d00acb7487210ce498ca76c7c632b7a30 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:19:12 +0200 Subject: [PATCH 21/32] Update sklearn/utils/__init__.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 22305c32bdd7f..9dcdc7df09fd9 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -819,7 +819,7 @@ def tosequence(x): 
return list(x) -def to_object_array(sequence): +def _to_object_array(sequence): """Convert sequence to a 1-D NumPy array of object dtype. Parameters From 6a75af4e8d5f1ee4b1b4a6b3459925de0c79e917 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:19:46 +0200 Subject: [PATCH 22/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 2ce2a519a2886..9ee2cffac682c 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -948,7 +948,7 @@ class from an array representing our data set and ask who's results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) - results = to_object_array(neigh_ind_list) + results = _to_object_array(neigh_ind_list) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): From 83df8c6bff33c8e7add6ab5b88cb7f2813a553b3 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:20:04 +0200 Subject: [PATCH 23/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9ee2cffac682c..a2aff3b1c7b25 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -944,7 +944,7 @@ class from an array representing our data set and ask who's # See https://github.com/numpy/numpy/issues/5456 # to understand why this is initialized this way. 
neigh_dist = to_object_array(neigh_dist_list) - neigh_ind = to_object_array(neigh_ind_list) + neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) From 3508571317ae86905e59d418627d05c58c912660 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:20:21 +0200 Subject: [PATCH 24/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index a2aff3b1c7b25..19f3954401c64 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -943,7 +943,7 @@ class from an array representing our data set and ask who's neigh_ind_list = sum(neigh_ind_chunks, []) # See https://github.com/numpy/numpy/issues/5456 # to understand why this is initialized this way. - neigh_dist = to_object_array(neigh_dist_list) + neigh_dist = _to_object_array(neigh_dist_list) neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: From 8ddd3e5ed8cc51644e8f2dbaed9f1be18e604258 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:20:32 +0200 Subject: [PATCH 25/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 19f3954401c64..a248de4c7c2d1 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -278,7 +278,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): if return_distance: neigh_dist = to_object_array(np.split(data, indptr[1:-1])) - neigh_ind = to_object_array(np.split(indices, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind From c302eb9e521fabe0a517548a9b76a7fed6e1f658 Mon Sep 17 00:00:00 2001 From: 
alexshacked Date: Fri, 10 Jan 2020 20:20:46 +0200 Subject: [PATCH 26/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index a248de4c7c2d1..e2cb90fdf8e63 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -277,7 +277,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = to_object_array(np.split(data, indptr[1:-1])) + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) if return_distance: From cf8ada9b59fdaa266245263933baba62ff8d1814 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Fri, 10 Jan 2020 20:21:24 +0200 Subject: [PATCH 27/32] Update sklearn/neighbors/_base.py Co-Authored-By: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e2cb90fdf8e63..3f031daf76938 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -24,7 +24,7 @@ from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices -from ..utils import to_object_array +from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import check_non_negative From 8df3f1cbcd7dc3009f4ca44416204777914950cb Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 20:27:08 +0200 Subject: [PATCH 28/32] [MRG] Using dbscan with precomputed neighbours. 
(#16036) removed unnecessary comment --- sklearn/neighbors/_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3f031daf76938..f927c26868a5f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -941,8 +941,6 @@ class from an array representing our data set and ask who's neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) neigh_dist_list = sum(neigh_dist_chunks, []) neigh_ind_list = sum(neigh_ind_chunks, []) - # See https://github.com/numpy/numpy/issues/5456 - # to understand why this is initialized this way. neigh_dist = _to_object_array(neigh_dist_list) neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind From aaf778e76fdd19b73f623a3db51fc1592fd04f35 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 20:31:36 +0200 Subject: [PATCH 29/32] fixed import statement --- sklearn/preprocessing/tests/test_label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 502c6dba0e712..887fa90c98d61 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -14,7 +14,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings -from sklearn.utils import to_object_array +from sklearn.utils import _to_object_array from sklearn.preprocessing._label import LabelBinarizer from sklearn.preprocessing._label import MultiLabelBinarizer From e5dcb2879d72af8a40a7d15b90a3f29fa90d9df6 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Fri, 10 Jan 2020 20:51:55 +0200 Subject: [PATCH 30/32] [MRG] Using dbscan with precomputed neighbours. 
(#16036) restored underscore --- sklearn/utils/__init__.py | 6 +++--- sklearn/utils/tests/test_utils.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 9dcdc7df09fd9..a6f66999c7b8f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -835,10 +835,10 @@ def _to_object_array(sequence): Examples -------- >>> import numpy as np - >>> from sklearn.utils import to_object_array - >>> to_object_array([np.array([0]), np.array([1])]) + >>> from sklearn.utils import _to_object_array + >>> _to_object_array([np.array([0]), np.array([1])]) array([array([0]), array([1])], dtype=object) - >>> to_object_array([np.array([0]), np.array([1, 2])]) + >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) """ out = np.empty(len(sequence), dtype=object) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index dc339b871446f..c3ae523b32b39 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -28,7 +28,7 @@ from sklearn.utils import _message_with_time, _print_elapsed_time from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan -from sklearn.utils import to_object_array +from sklearn.utils import _to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context @@ -654,7 +654,7 @@ def test_deprecation_joblib_api(tmpdir): [[np.array(1), np.array(2)], [[1, 2], [3, 4]]] ) def test_to_object_array(sequence): - out = to_object_array(sequence) + out = _to_object_array(sequence) assert isinstance(out, np.ndarray) assert out.dtype.kind == 'O' assert out.ndim == 1 From cf31a9e1f818c1654ee8512edcf02a999e622be1 Mon Sep 17 00:00:00 2001 From: Alex Shacked Date: Mon, 13 Jan 2020 23:19:10 +0200 Subject: [PATCH 31/32] [MRG] Using dbscan with precomputed neighbours. 
(#16036) improved _to_object_array() documentation --- doc/whats_new/v0.23.rst | 2 +- sklearn/utils/__init__.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 1e1772988b94f..56799e7ad77e1 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -121,7 +121,7 @@ Changelog :mod:`sklearn.neighbors` .............................. -- |Fix| fix a bug which converted a list of arrays into a 2-D object +- |Fix| Fix a bug which converted a list of arrays into a 2-D object array instead of a 1-D array containing NumPy arrays. This bug was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. :pr:`16076` by :user:`Guillaume Lemaitre ` and diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a6f66999c7b8f..a2813382c0c3b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -821,11 +821,17 @@ def tosequence(x): def _to_object_array(sequence): """Convert sequence to a 1-D NumPy array of object dtype. + numpy.array constructor has a similar use but its output + is ambiguous. It can be 1-D NumPy array of object dtype if + the input is a ragged array, but if the input is a list of + equal length arrays, then the output is a 2D numpy.array. + _to_object_array solves this ambiguity by guaranteeing that + the output is a 1-D NumPy array of objects for any input. Parameters ---------- sequence : array-like of shape (n_elements,) - The sequence to be converted + The sequence to be converted.
 Returns ------- @@ -840,6 +846,11 @@ def _to_object_array(sequence): array([array([0]), array([1])], dtype=object) >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) + >>> np.array([np.array([0]), np.array([1])]) + array([[0], + [1]]) + >>> np.array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) """ out = np.empty(len(sequence), dtype=object) out[:] = sequence From 6415d4b5e5c2b1460f62bd2f9de1b1318c25125e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 14 Jan 2020 18:05:27 -0800 Subject: [PATCH 32/32] add empty line --- sklearn/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a2813382c0c3b..ee38b9b924ccc 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -821,6 +821,7 @@ def tosequence(x): def _to_object_array(sequence): """Convert sequence to a 1-D NumPy array of object dtype. + numpy.array constructor has a similar use but its output is ambiguous. It can be 1-D NumPy array of object dtype if the input is a ragged array, but if the input is a list of