From 04a8fde1178237c57736ea9fdb2bbc00d5b5600b Mon Sep 17 00:00:00 2001 From: Leonard Binet Date: Sun, 1 Sep 2019 11:16:23 +0200 Subject: [PATCH 01/11] fix type_of_target for csr_matrices --- sklearn/utils/multiclass.py | 53 ++++++++++++++++++-------- sklearn/utils/tests/test_multiclass.py | 8 ++++ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 03e89836eb394..fe17f99be0c10 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -17,7 +17,7 @@ import numpy as np -from .validation import check_array, _assert_all_finite +from .validation import check_array, _assert_all_finite, assert_all_finite def _unique_multiclass(y): @@ -265,12 +265,19 @@ def type_of_target(y): # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): warnings.simplefilter('error', np.VisibleDeprecationWarning) - try: - y = np.asarray(y) - except np.VisibleDeprecationWarning: - # dtype=object should be provided explicitly for ragged arrays, - # see NEP 34 - y = np.asarray(y, dtype=object) + if not issparse(y): + # calling np.asarray on sparse matrix has unexpected behavior + # https://github.com/numpy/numpy/issues/14221 + + try: + y = np.asarray(y) + except np.VisibleDeprecationWarning: + # dtype=object should be provided explicitly for ragged arrays, + # see NEP 34 + y = np.asarray(y, dtype=object) + except ValueError: + # Known to fail in numpy 1.3 for array of arrays + return 'unknown' # The old sequence of sequences format try: @@ -285,9 +292,14 @@ def type_of_target(y): pass # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len(y) and - not isinstance(y.flat[0], str)): + if y.ndim > 2: + return 'unknown' + if not issparse(y) and y.dtype == object and len(y) \ + and not isinstance(y.flat[0], str): return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + if issparse(y) and y.dtype == object and y.shape[0] \ + and not isinstance(y.data[0], str): 
+ return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] (sparse) if y.ndim == 2 and y.shape[1] == 0: return 'unknown' # [[]] @@ -298,13 +310,22 @@ def type_of_target(y): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values - if y.dtype.kind == 'f' and np.any(y != y.astype(int)): - # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] - _assert_all_finite(y) - return 'continuous' + suffix - - if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): - return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + if y.dtype.kind == 'f': + if not issparse(y) and np.any(y != y.astype(int)): + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + _assert_all_finite(y) + return 'continuous' + suffix + if issparse(y) and np.any(y.data != y.data.astype(int)): + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + assert_all_finite(y) + return 'continuous' + suffix + + if len(np.unique(y)) > 2: + return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] + if not issparse(y) and y.ndim >= 2 and len(y[0]) > 1: + return 'multiclass' + suffix # [[1, 2]] or [[0],[1]] + if issparse(y) and y.ndim >= 2: + return 'multiclass' + suffix # [[1, 2]] else: return 'binary' # [1, 2] or [["a"], ["b"]] diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index efcd2c11fc15c..e8f831d8389a2 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -74,6 +74,11 @@ np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]])), + csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.int8), + csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.uint8), + csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.float), + 
csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.float32), np.array([['a', 'b'], ['c', 'd']]), np.array([['a', 'b'], ['c', 'd']]), np.array([['a', 'b'], ['c', 'd']], dtype=object), @@ -112,6 +117,9 @@ np.array([[0, .5], [.5, 0]]), np.array([[0, .5], [.5, 0]], dtype=np.float32), np.array([[0, .5]]), + csr_matrix(np.array([[0, .5], [.5, 0]])), + csr_matrix(np.array([[0, .5], [.5, 0]]), dtype=np.float32), + csr_matrix(np.array([[0, .5]])), ], 'unknown': [ [[]], From 24f1fe93fc0a4c83c276b571b69f9aaaecf927db Mon Sep 17 00:00:00 2001 From: Leonard Binet Date: Tue, 10 Sep 2019 00:11:29 +0200 Subject: [PATCH 02/11] add tests for type_of_target --- sklearn/utils/multiclass.py | 35 ++++++++++++++------------ sklearn/utils/tests/test_multiclass.py | 6 +++++ 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index fe17f99be0c10..7ee1e350a87e1 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -292,40 +292,43 @@ def type_of_target(y): pass # Invalid inputs - if y.ndim > 2: + if y.ndim not in (1, 2): # [[[1, 2]]] return 'unknown' - if not issparse(y) and y.dtype == object and len(y) \ - and not isinstance(y.flat[0], str): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] - if issparse(y) and y.dtype == object and y.shape[0] \ - and not isinstance(y.data[0], str): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] (sparse) - - if y.ndim == 2 and y.shape[1] == 0: + if not min(y.shape): + if y.ndim == 1: + return 'binary' # [] return 'unknown' # [[]] + # [obj_1] and not ["label_1"] + if not issparse(y) and y.dtype == object and \ + not isinstance(y.flat[0], str): + return 'unknown' + if issparse(y) and y.dtype == object and not isinstance(y.data[0], str): + return 'unknown' + # Check if multioutput if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] else: suffix = "" # [1, 2, 3] or [[1], [2], [3]] - # check float and 
contains non-integer float values + # Check float and contains non-integer float values if y.dtype.kind == 'f': + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] if not issparse(y) and np.any(y != y.astype(int)): - # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _assert_all_finite(y) return 'continuous' + suffix if issparse(y) and np.any(y.data != y.data.astype(int)): - # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] assert_all_finite(y) return 'continuous' + suffix + # Check multiclass if len(np.unique(y)) > 2: return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] - if not issparse(y) and y.ndim >= 2 and len(y[0]) > 1: - return 'multiclass' + suffix # [[1, 2]] or [[0],[1]] - if issparse(y) and y.ndim >= 2: - return 'multiclass' + suffix # [[1, 2]] + # [[1, 2]] + if not issparse(y) and y.ndim == 2 and len(y[0]) > 1: + return 'multiclass' + suffix + if issparse(y) and y.ndim == 2 and len(y.getrow(0).data) > 1: + return 'multiclass' + suffix else: return 'binary' # [1, 2] or [["a"], ["b"]] diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index e8f831d8389a2..30f9343dcfbfe 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -12,6 +12,8 @@ from scipy.sparse import lil_matrix from sklearn.utils._testing import assert_array_equal +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils.estimator_checks import _NotAnArray @@ -123,6 +125,7 @@ ], 'unknown': [ [[]], + np.array([[]], dtype=object), [()], # sequence of sequences that weren't supported even before deprecation np.array([np.array([]), np.array([1, 2, 3])], dtype=object), @@ -133,6 +136,9 @@ # and also confusable as sequences of sequences [{0: 'a', 1: 'b'}, {0: 'a'}], + # ndim 0 + 
np.array(0), + # empty second dimension np.array([[], []]), From e36d128fe9ae087e959bdec3e2ee3df9f0f05043 Mon Sep 17 00:00:00 2001 From: Leonard Binet Date: Wed, 1 Jan 2020 19:34:21 +0100 Subject: [PATCH 03/11] remove useless comment --- sklearn/utils/multiclass.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 7ee1e350a87e1..837048ef59fc8 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -266,9 +266,6 @@ def type_of_target(y): with warnings.catch_warnings(): warnings.simplefilter('error', np.VisibleDeprecationWarning) if not issparse(y): - # calling np.asarray on sparse matrix has unexpected behavior - # https://github.com/numpy/numpy/issues/14221 - try: y = np.asarray(y) except np.VisibleDeprecationWarning: From db7a001076cab3e618b14922b8568fe494fa8efe Mon Sep 17 00:00:00 2001 From: Leonard Binet Date: Fri, 21 Feb 2020 18:54:20 +0100 Subject: [PATCH 04/11] remove useless statement on sparse matrices --- sklearn/utils/multiclass.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 837048ef59fc8..e1a32d46ec999 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -299,8 +299,6 @@ def type_of_target(y): if not issparse(y) and y.dtype == object and \ not isinstance(y.flat[0], str): return 'unknown' - if issparse(y) and y.dtype == object and not isinstance(y.data[0], str): - return 'unknown' # Check if multioutput if y.ndim == 2 and y.shape[1] > 1: From 03bf3b59c51b799fbf14f424805a0f48abc042fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9onard=20Binet?= Date: Sun, 15 Nov 2020 10:20:03 +0100 Subject: [PATCH 05/11] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/utils/multiclass.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 
e1a32d46ec999..396bc2b271fb8 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -17,7 +17,7 @@ import numpy as np -from .validation import check_array, _assert_all_finite, assert_all_finite +from .validation import check_array, _assert_all_finite def _unique_multiclass(y): @@ -272,9 +272,6 @@ def type_of_target(y): # dtype=object should be provided explicitly for ragged arrays, # see NEP 34 y = np.asarray(y, dtype=object) - except ValueError: - # Known to fail in numpy 1.3 for array of arrays - return 'unknown' # The old sequence of sequences format try: @@ -289,15 +286,19 @@ def type_of_target(y): pass # Invalid inputs - if y.ndim not in (1, 2): # [[[1, 2]]] + if y.ndim not in (1, 2): + # Number of dimension greater than 2: [[[1, 2]]] return 'unknown' if not min(y.shape): + # Empty ndarray: []/[[]] if y.ndim == 1: + # 1-D empty array: [] return 'binary' # [] - return 'unknown' # [[]] - # [obj_1] and not ["label_1"] - if not issparse(y) and y.dtype == object and \ - not isinstance(y.flat[0], str): + # 2-D empty array: [[]] + return 'unknown' + if (not issparse(y) and y.dtype == object and + not isinstance(y.flat[0], str)): + # [obj_1] and not ["label_1"] return 'unknown' # Check if multioutput @@ -309,20 +310,15 @@ def type_of_target(y): # Check float and contains non-integer float values if y.dtype.kind == 'f': # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] 
- if not issparse(y) and np.any(y != y.astype(int)): - _assert_all_finite(y) - return 'continuous' + suffix - if issparse(y) and np.any(y.data != y.data.astype(int)): - assert_all_finite(y) + data = y.data if issparse(y) else y + if np.any(data != data.astype(int)): + _assert_all_finite(data) return 'continuous' + suffix # Check multiclass - if len(np.unique(y)) > 2: - return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] - # [[1, 2]] - if not issparse(y) and y.ndim == 2 and len(y[0]) > 1: - return 'multiclass' + suffix - if issparse(y) and y.ndim == 2 and len(y.getrow(0).data) > 1: + first_row = y[0] if not issparse(y) else y.getrow(0).data + if len(np.unique(y)) > 2 or (y.ndim == 2 and len(first_row) > 1): + # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] return 'multiclass' + suffix else: return 'binary' # [1, 2] or [["a"], ["b"]] From d669bda21ba97a3d0dc7af146322cb727e2e224b Mon Sep 17 00:00:00 2001 From: Bartosz Telenczuk Date: Sat, 12 Dec 2020 16:22:48 +0100 Subject: [PATCH 06/11] fixing linting issues --- sklearn/utils/tests/test_multiclass.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 30f9343dcfbfe..e578fa269041b 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -12,8 +12,6 @@ from scipy.sparse import lil_matrix from sklearn.utils._testing import assert_array_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils.estimator_checks import _NotAnArray From 1383e2d1f08876cd599d68276f577c421871f464 Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Sat, 12 Feb 2022 01:39:57 +0100 Subject: [PATCH 07/11] Extend the tests --- sklearn/utils/tests/test_multiclass.py | 44 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 16 deletions(-) 
diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 50695ea384603..0dcda22cabc0f 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -28,6 +28,21 @@ from sklearn.svm import SVC from sklearn import datasets +sparse_multilable_explicit_zero = csc_matrix(np.array([[0, 1], [1, 0]])) +sparse_multilable_explicit_zero[:, 0] = 0 + + +def _generate_sparse( + matrix, + matrix_types=(csr_matrix, csc_matrix, coo_matrix, dok_matrix, lil_matrix), + dtypes=(bool, int, np.int8, np.uint8, float, np.float32), +): + return [ + matrix_type(matrix, dtype=dtype) + for matrix_type in matrix_types + for dtype in dtypes + ] + EXAMPLES = { "multilabel-indicator": [ @@ -36,14 +51,10 @@ csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))), [[0, 1], [1, 0]], [[0, 1]], - csr_matrix(np.array([[0, 1], [1, 0]])), - csr_matrix(np.array([[0, 1], [1, 0]], dtype=bool)), - csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)), - csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)), - csr_matrix(np.array([[0, 1], [1, 0]], dtype=float)), - csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)), - csr_matrix(np.array([[0, 0], [0, 0]])), - csr_matrix(np.array([[0, 1]])), + sparse_multilable_explicit_zero, + *_generate_sparse([[0, 1], [1, 0]]), + *_generate_sparse([[0, 0], [0, 0]]), + *_generate_sparse([[0, 1]]), # Only valid when data is dense [[-1, 1], [1, -1]], np.array([[-1, 1], [1, -1]]), @@ -73,11 +84,9 @@ np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]])), - csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.int8), - csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.uint8), - csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), dtype=np.float), - csr_matrix(np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), 
dtype=np.float32), + *_generate_sparse([[1, 0, 2, 2], [1, 4, 2, 4]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(int, np.int8, np.uint8, float, np.float32)), np.array([['a', 'b'], ['c', 'd']]), np.array([['a', 'b'], ['c', 'd']]), np.array([['a', 'b'], ['c', 'd']], dtype=object), @@ -116,9 +125,12 @@ np.array([[0, .5], [.5, 0]]), np.array([[0, .5], [.5, 0]], dtype=np.float32), np.array([[0, .5]]), - csr_matrix(np.array([[0, .5], [.5, 0]])), - csr_matrix(np.array([[0, .5], [.5, 0]]), dtype=np.float32), - csr_matrix(np.array([[0, .5]])), + *_generate_sparse([[0, .5], [.5, 0]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(float, np.float32)), + *_generate_sparse([[0, .5]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(float, np.float32)), ], "unknown": [ [[]], From f2ed6ffa3b64dfe20626a760bc7796eec09f1a4c Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Sat, 12 Feb 2022 01:40:47 +0100 Subject: [PATCH 08/11] Fix is_multilable for explicit zeros --- sklearn/utils/multiclass.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 0aae1f207a187..67b326784269c 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -161,12 +161,16 @@ def is_multilabel(y): if issparse(y): if isinstance(y, (dok_matrix, lil_matrix)): y = y.tocsr() + labels = np.unique(y.data) return ( len(y.data) == 0 - or np.unique(y.data).size == 1 + or ( + labels.size == 1 + or (labels.size == 2) and (0 in labels) + ) and ( y.dtype.kind in "biu" - or _is_integral_float(np.unique(y.data)) # bool, int, uint + or _is_integral_float(labels) # bool, int, uint ) ) else: From 280e1c69e67f8e7bc6a6bb4f862a6980e6eb3d74 Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Sat, 12 Feb 2022 01:40:55 +0100 Subject: [PATCH 09/11] Fix doc --- sklearn/utils/multiclass.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 
67b326784269c..d764a0125c133 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -217,7 +217,9 @@ def type_of_target(y, input_name=""): Parameters ---------- - y : array-like + y : {array-like, sparse matrix} + The target array. If a sparse matrix, `y` is expected to be a + CSR/CSC matrix. input_name : str, default="" The data name used to construct the error message. From 182db68a809b8cd363f3c5182e1bc20338dc554d Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Sat, 12 Feb 2022 01:43:57 +0100 Subject: [PATCH 10/11] Blacken --- sklearn/utils/multiclass.py | 29 ++++++---------- sklearn/utils/tests/test_multiclass.py | 46 +++++++++++++++----------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index d764a0125c133..f190bf2200826 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -164,14 +164,8 @@ def is_multilabel(y): labels = np.unique(y.data) return ( len(y.data) == 0 - or ( - labels.size == 1 - or (labels.size == 2) and (0 in labels) - ) - and ( - y.dtype.kind in "biu" - or _is_integral_float(labels) # bool, int, uint - ) + or (labels.size == 1 or (labels.size == 2) and (0 in labels)) + and (y.dtype.kind in "biu" or _is_integral_float(labels)) # bool, int, uint ) else: labels = np.unique(y) @@ -294,7 +288,7 @@ def type_of_target(y, input_name=""): # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter('error', np.VisibleDeprecationWarning) + warnings.simplefilter("error", np.VisibleDeprecationWarning) if not issparse(y): try: y = np.asarray(y) @@ -323,18 +317,17 @@ def type_of_target(y, input_name=""): # Invalid inputs if y.ndim not in (1, 2): # Number of dimension greater than 2: [[[1, 2]]] - return 'unknown' + return "unknown" if not min(y.shape): # Empty ndarray: []/[[]] if y.ndim == 1: # 1-D empty array: [] - 
return 'binary' # [] + return "binary" # [] # 2-D empty array: [[]] - return 'unknown' - if (not issparse(y) and y.dtype == object and - not isinstance(y.flat[0], str)): + return "unknown" + if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str): # [obj_1] and not ["label_1"] - return 'unknown' + return "unknown" # Check if multioutput if y.ndim == 2 and y.shape[1] > 1: @@ -343,18 +336,18 @@ def type_of_target(y, input_name=""): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # Check float and contains non-integer float values - if y.dtype.kind == 'f': + if y.dtype.kind == "f": # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] data = y.data if issparse(y) else y if np.any(data != data.astype(int)): _assert_all_finite(data, input_name=input_name) - return 'continuous' + suffix + return "continuous" + suffix # Check multiclass first_row = y[0] if not issparse(y) else y.getrow(0).data if len(np.unique(y)) > 2 or (y.ndim == 2 and len(first_row) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - return 'multiclass' + suffix + return "multiclass" + suffix else: return "binary" # [1, 2] or [["a"], ["b"]] diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 0dcda22cabc0f..be38593f1529f 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -33,9 +33,9 @@ def _generate_sparse( - matrix, - matrix_types=(csr_matrix, csc_matrix, coo_matrix, dok_matrix, lil_matrix), - dtypes=(bool, int, np.int8, np.uint8, float, np.float32), + matrix, + matrix_types=(csr_matrix, csc_matrix, coo_matrix, dok_matrix, lil_matrix), + dtypes=(bool, int, np.int8, np.uint8, float, np.float32), ): return [ matrix_type(matrix, dtype=dtype) @@ -84,12 +84,14 @@ def _generate_sparse( np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - *_generate_sparse([[1, 0, 2, 2], [1, 4, 
2, 4]], - matrix_types=(csr_matrix, csc_matrix), - dtypes=(int, np.int8, np.uint8, float, np.float32)), - np.array([['a', 'b'], ['c', 'd']]), - np.array([['a', 'b'], ['c', 'd']]), - np.array([['a', 'b'], ['c', 'd']], dtype=object), + *_generate_sparse( + [[1, 0, 2, 2], [1, 4, 2, 4]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(int, np.int8, np.uint8, float, np.float32), + ), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]], dtype=object), np.array([[1, 0, 2]]), _NotAnArray(np.array([[1, 0, 2]])), ], @@ -121,16 +123,20 @@ def _generate_sparse( np.array([[0], [0.5]]), np.array([[0], [0.5]], dtype=np.float32), ], - 'continuous-multioutput': [ - np.array([[0, .5], [.5, 0]]), - np.array([[0, .5], [.5, 0]], dtype=np.float32), - np.array([[0, .5]]), - *_generate_sparse([[0, .5], [.5, 0]], - matrix_types=(csr_matrix, csc_matrix), - dtypes=(float, np.float32)), - *_generate_sparse([[0, .5]], - matrix_types=(csr_matrix, csc_matrix), - dtypes=(float, np.float32)), + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + *_generate_sparse( + [[0, 0.5], [0.5, 0]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(float, np.float32), + ), + *_generate_sparse( + [[0, 0.5]], + matrix_types=(csr_matrix, csc_matrix), + dtypes=(float, np.float32), + ), ], "unknown": [ [[]], @@ -142,7 +148,7 @@ def _generate_sparse( [{1, 2, 3}, {1, 2}], [frozenset([1, 2, 3]), frozenset([1, 2])], # and also confusable as sequences of sequences - [{0: 'a', 1: 'b'}, {0: 'a'}], + [{0: "a", 1: "b"}, {0: "a"}], # ndim 0 np.array(0), # empty second dimension From eb381f790dda9fc726571c38b75e958a528b9ae3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 21 Sep 2022 16:04:13 +0200 Subject: [PATCH 11/11] what's new entry --- doc/whats_new/v1.2.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.2.rst 
b/doc/whats_new/v1.2.rst index 3186ed60faa08..d60ecf3cadb71 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -458,6 +458,9 @@ Changelog deterministic SVD used by the randomized SVD algorithm. :pr:`20617` by :user:`Srinath Kailasa <skailasa>` +- |FIX| :func:`utils.multiclass.type_of_target` now properly handles sparse matrices. + :pr:`14862` by :user:`Léonard Binet <leonardbinet>`. + Code and Documentation Contributors -----------------------------------