Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[MRG] BUG ensure object array are properly casted when dtype=object #16076

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Jan 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c2abc89
[MRG] Using dbscan with precomputed neighbors gives an error in 0.22.…
alexshacked Jan 9, 2020
06f01d4
[MRG] Using dbscan with precomputed neighbours. (#16036) in regressio…
alexshacked Jan 9, 2020
0fca6f0
[MRG] Using dbscan with precomputed neighbours. (#16036) PEP fixes
alexshacked Jan 9, 2020
887f154
[MRG] Using dbscan with precomputed neighbours. (#16036) change log
alexshacked Jan 9, 2020
e582a40
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
eda4787
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
eedad66
Update sklearn/neighbors/tests/test_neighbors.py
alexshacked Jan 9, 2020
bfeee92
Update sklearn/neighbors/tests/test_neighbors.py
alexshacked Jan 9, 2020
e13297c
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
a512168
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
a538fac
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
19b23b2
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
af6b74d
Update sklearn/neighbors/_base.py
alexshacked Jan 9, 2020
e2d9f8b
(#16036) refactoring with new function _to_object_array()
alexshacked Jan 10, 2020
93d87d5
Update doc/whats_new/v0.23.rst
alexshacked Jan 10, 2020
71f9c7f
Update doc/whats_new/v0.23.rst
alexshacked Jan 10, 2020
33b68e5
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
5edfeb6
[MRG] Using dbscan with precomputed neighbours. (#16036) moved to_obj…
alexshacked Jan 10, 2020
d3e05dd
[MRG] Using dbscan with precomputed neighbours. (#16036) fix PEP8 er…
alexshacked Jan 10, 2020
a992048
Update sklearn/preprocessing/tests/test_label.py
alexshacked Jan 10, 2020
ece10a1
Update sklearn/utils/__init__.py
alexshacked Jan 10, 2020
6a75af4
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
83df8c6
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
3508571
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
8ddd3e5
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
c302eb9
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
cf8ada9
Update sklearn/neighbors/_base.py
alexshacked Jan 10, 2020
8df3f1c
[MRG] Using dbscan with precomputed neighbours. (#16036) removed unne…
alexshacked Jan 10, 2020
aaf778e
fixed import statement
alexshacked Jan 10, 2020
e5dcb28
[MRG] Using dbscan with precomputed neighbours. (#16036) restored und…
alexshacked Jan 10, 2020
cf31a9e
[MRG] Using dbscan with precomputed neighbours. (#16036) improved _to…
alexshacked Jan 13, 2020
6415d4b
add empty line
TomDLT Jan 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/whats_new/v0.23.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ Changelog
when `y=None`.
:pr:`15918` by :user:`Luca Kubin <lkubin>`.

:mod:`sklearn.neighbors`
..............................

- |Fix| Fix a bug which converted a list of arrays into a 2-D object
array instead of a 1-D array containing NumPy arrays. This bug
was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`.
:pr:`16076` by :user:`Guillaume Lemaitre <glemaitre>` and
:user:`Alex Shacked <alexshacked>`.

:mod:`sklearn.preprocessing`
............................

Expand Down
16 changes: 6 additions & 10 deletions sklearn/neighbors/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, gen_even_slices
from ..utils import _to_object_array
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..utils.validation import check_non_negative
Expand Down Expand Up @@ -276,8 +277,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance):
indices = indices.astype(np.intp, copy=no_filter_needed)

if return_distance:
neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object)
neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object)
neigh_dist = _to_object_array(np.split(data, indptr[1:-1]))
neigh_ind = _to_object_array(np.split(indices, indptr[1:-1]))

if return_distance:
return neigh_dist, neigh_ind
Expand Down Expand Up @@ -940,17 +941,12 @@ class from an array representing our data set and ask who's
neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results)
neigh_dist_list = sum(neigh_dist_chunks, [])
neigh_ind_list = sum(neigh_ind_chunks, [])
# See https://github.com/numpy/numpy/issues/5456
# to understand why this is initialized this way.
neigh_dist = np.empty(len(neigh_dist_list), dtype='object')
neigh_dist[:] = neigh_dist_list
neigh_ind = np.empty(len(neigh_ind_list), dtype='object')
neigh_ind[:] = neigh_ind_list
neigh_dist = _to_object_array(neigh_dist_list)
neigh_ind = _to_object_array(neigh_ind_list)
results = neigh_dist, neigh_ind
else:
neigh_ind_list = sum(chunked_results, [])
results = np.empty(len(neigh_ind_list), dtype='object')
results[:] = neigh_ind_list
results = _to_object_array(neigh_ind_list)

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand Down
24 changes: 24 additions & 0 deletions sklearn/neighbors/tests/test_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,30 @@ def test_radius_neighbors_boundary_handling():
assert_array_equal(results[0], [0, 1])


def test_radius_neighbors_returns_array_of_objects():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/16036
    # With a precomputed sparse distance matrix, radius_neighbors() is
    # expected to return 1-D object arrays holding one NumPy array per
    # sample, even when all the per-sample arrays have the same length.
    n_samples = 4
    X = csr_matrix(np.ones((n_samples, n_samples)))
    X.setdiag([0, 0, 0, 0])

    estimator = neighbors.NearestNeighbors(
        radius=0.5, algorithm='auto', leaf_size=30, metric='precomputed'
    )
    nbrs = estimator.fit(X)
    neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True)

    # Off-diagonal distances are 1 (> radius) and the diagonal is 0, so the
    # expected neighborhood of sample i is only i itself, at distance 0.
    expected_dist = np.empty(n_samples, dtype=object)
    expected_dist[:] = [np.array([0]) for _ in range(n_samples)]
    expected_ind = np.empty(n_samples, dtype=object)
    expected_ind[:] = [np.array([i]) for i in range(n_samples)]

    assert_array_equal(neigh_dist, expected_dist)
    assert_array_equal(neigh_ind, expected_ind)


def test_RadiusNeighborsClassifier_multioutput():
# Test k-NN classifier on multioutput data
rng = check_random_state(0)
Expand Down
4 changes: 2 additions & 2 deletions sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array

from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
Expand Down Expand Up @@ -433,8 +434,7 @@ def test_multilabel_binarizer_same_length_sequence():


def test_multilabel_binarizer_non_integer_labels():
tuple_classes = np.empty(3, dtype=object)
tuple_classes[:] = [(1,), (2,), (3,)]
tuple_classes = _to_object_array([(1,), (2,), (3,)])
inputs = [
([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
Expand Down
39 changes: 39 additions & 0 deletions sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,45 @@ def tosequence(x):
return list(x)


def _to_object_array(sequence):
"""Convert sequence to a 1-D NumPy array of object dtype.

numpy.array constructor has a similar use but it's output
is ambiguous. It can be 1-D NumPy array of object dtype if
the input is a ragged array, but if the input is a list of
equal length arrays, then the output is a 2D numpy.array.
_to_object_array solves this ambiguity by guarantying that
the output is a 1-D NumPy array of objects for any input.

Parameters
----------
sequence : array-like of shape (n_elements,)
The sequence to be converted.

Returns
-------
out : ndarray of shape (n_elements,), dtype=object
The converted sequence into a 1-D NumPy array of object dtype.

Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _to_object_array
>>> _to_object_array([np.array([0]), np.array([1])])
array([array([0]), array([1])], dtype=object)
>>> _to_object_array([np.array([0]), np.array([1, 2])])
array([array([0]), array([1, 2])], dtype=object)
>>> np.array([np.array([0]), np.array([1])])
array([[0],
[1]])
>>> np.array([np.array([0]), np.array([1, 2])])
array([array([0]), array([1, 2])], dtype=object)
"""
out = np.empty(len(sequence), dtype=object)
out[:] = sequence
return out


def indices_to_mask(indices, mask_length):
"""Convert list of indices to boolean mask.

Expand Down
12 changes: 12 additions & 0 deletions sklearn/utils/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from sklearn.utils import _message_with_time, _print_elapsed_time
from sklearn.utils import get_chunk_n_rows
from sklearn.utils import is_scalar_nan
from sklearn.utils import _to_object_array
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context

Expand Down Expand Up @@ -646,3 +647,14 @@ def test_deprecation_joblib_api(tmpdir):

from sklearn.utils._joblib import joblib
del joblib.parallel.BACKENDS['failing']


@pytest.mark.parametrize(
    "sequence",
    [
        [np.array(1), np.array(2)],
        [[1, 2], [3, 4]],
    ],
)
def test_to_object_array(sequence):
    # Whatever the nesting of the input, the helper must hand back a
    # one-dimensional ndarray whose dtype is object.
    converted = _to_object_array(sequence)
    assert isinstance(converted, np.ndarray)
    assert converted.dtype.kind == 'O'
    assert converted.ndim == 1