From 4869c0d322dbbddcbed0c57ab3eb34859a25975d Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 15 Sep 2023 13:00:21 +0500 Subject: [PATCH 01/21] ENH Array API support for LabelEncoder --- sklearn/preprocessing/_label.py | 15 ++-- sklearn/preprocessing/tests/test_label.py | 25 ++++++ sklearn/utils/_array_api.py | 92 +++++++++++++++++++++++ sklearn/utils/_encode.py | 65 +++++++++------- sklearn/utils/estimator_checks.py | 19 ++++- 5 files changed, 179 insertions(+), 37 deletions(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 41494f2649a01..05828c0fa6613 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import column_or_1d +from ..utils._array_api import _setdiff1d, get_namespace from ..utils._encode import _encode, _unique from ..utils._param_validation import Interval, validate_params from ..utils.multiclass import type_of_target, unique_labels @@ -129,10 +130,11 @@ def transform(self, y): Labels as normalized encodings. """ check_is_fitted(self) + xp, _ = get_namespace(y) y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) # transform of empty array is empty array if _num_samples(y) == 0: - return np.array([]) + return xp.asarray([]) return _encode(y, uniques=self.classes_) @@ -150,16 +152,17 @@ def inverse_transform(self, y): Original encoding. 
""" check_is_fitted(self) + xp, _ = get_namespace(y) y = column_or_1d(y, warn=True) # inverse transform of empty array is empty array if _num_samples(y) == 0: - return np.array([]) + return xp.asarray([]) - diff = np.setdiff1d(y, np.arange(len(self.classes_))) - if len(diff): + diff = _setdiff1d(ar1=y, ar2=xp.arange(self.classes_.shape[0]), xp=xp) + if diff.shape[0]: raise ValueError("y contains previously unseen labels: %s" % str(diff)) - y = np.asarray(y) - return self.classes_[y] + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) def _more_tags(self): return {"X_types": ["1dlabels"]} diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index cce0ddc5c267e..8f81b9cfec595 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -12,7 +12,12 @@ label_binarize, ) from sklearn.utils import _to_object_array +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations from sklearn.utils._testing import assert_array_equal, ignore_warnings +from sklearn.utils.estimator_checks import ( + _get_check_estimator_ids, + check_array_api_input_and_values, +) from sklearn.utils.fixes import ( COO_CONTAINERS, CSC_CONTAINERS, @@ -697,3 +702,23 @@ def test_label_encoders_do_not_have_set_output(encoder): y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [LabelEncoder()], + ids=_get_check_estimator_ids, +) +def test_label_encoder_array_api_compliance( + estimator, check, array_namespace, device, dtype +): + name = estimator.__class__.__name__ + check(name, 
estimator, array_namespace, device=device, dtype=dtype) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 24534faa931e8..554a5c71a859d 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -232,6 +232,12 @@ def take(self, X, indices, *, axis=0): def isdtype(self, dtype, kind): return isdtype(dtype, kind, xp=self._namespace) + def searchsorted(self, a, v, *, side="left", sorter=None): + a = _convert_to_numpy(a, xp=self._namespace) + v = _convert_to_numpy(v, xp=self._namespace) + indices = numpy.searchsorted(a, v, side=side, sorter=sorter) + return self._namespace.asarray(indices) + def _check_device_cpu(device): # noqa if device not in {"cpu", None}: @@ -330,6 +336,11 @@ def unique_counts(self, x): def unique_values(self, x): return numpy.unique(x) + def unique_all(self, x): + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -595,3 +606,84 @@ def _estimator_with_converted_arrays(estimator, converter): def _atol_for_type(dtype): """Return the absolute tolerance for a given dtype.""" return numpy.finfo(dtype).eps * 100 + + +def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): + """Checks whether each element of an array is also present in a + second array. + + Returns a boolean array the same length as `ar1` that is True + where an element of `ar1` is in `ar2` and False otherwise + """ + if not assume_unique: + ar1, rev_idx = xp.unique_inverse(ar1) + ar2 = xp.unique_values(ar2) + + ar = xp.concat((ar1, ar2)) + # We need this to be a stable sort. 
+ order = ar.argsort(stable=True) + sar = ar[order] + if invert: + bool_ar = sar[1:] != sar[:-1] + else: + bool_ar = sar[1:] == sar[:-1] + flag = xp.concat((bool_ar, xp.asarray([invert]))) + ret = xp.empty(ar.shape, dtype=xp.bool) + ret[order] = flag + + if assume_unique: + return ret[: len(ar1)] + else: + return ret[rev_idx] + + +def _setdiff1d(ar1, ar2, xp, assume_unique=False): + """Find the set difference of two arrays. + + Return the unique values in `ar1` that are not in `ar2`. + """ + if _is_numpy_namespace(xp): + return xp.asarray( + numpy.setdiff1d( + ar1=ar1, + ar2=ar2, + assume_unique=assume_unique, + ) + ) + + if assume_unique: + ar1 = xp.reshape(xp.asarray(ar1), (-1,)) + else: + ar1 = xp.unique_values(ar1) + ar2 = xp.unique_values(ar2) + return ar1[_in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] + + +def _isin(element, test_elements, xp, assume_unique=False, invert=False): + """Calculates ``element in test_elements``, broadcasting over `element` + only. + + Returns a boolean array of the same shape as `element` that is True + where an element of `element` is in `test_elements` and False otherwise. + """ + if _is_numpy_namespace(xp): + return xp.asarray( + numpy.isin( + element=element, + test_elements=test_elements, + assume_unique=assume_unique, + invert=invert, + ) + ) + + element = xp.asarray(element) + return xp.reshape( + _in1d( + ar1=element, + ar2=test_elements, + xp=xp, + assume_unique=assume_unique, + invert=invert, + ), + element.shape, + ) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index b3bf1c2a317ec..55f422f487be7 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -5,6 +5,7 @@ import numpy as np from . 
import is_scalar_nan +from ._array_api import _convert_to_numpy, _isin, _setdiff1d, get_namespace def _unique(values, *, return_inverse=False, return_counts=False): @@ -51,31 +52,29 @@ def _unique(values, *, return_inverse=False, return_counts=False): def _unique_np(values, return_inverse=False, return_counts=False): """Helper function to find unique values for numpy arrays that correctly accounts for nans. See `_unique` documentation for details.""" - uniques = np.unique( - values, return_inverse=return_inverse, return_counts=return_counts - ) + xp, _ = get_namespace(values) inverse, counts = None, None - if return_counts: - *uniques, counts = uniques - - if return_inverse: - *uniques, inverse = uniques - - if return_counts or return_inverse: - uniques = uniques[0] + if return_inverse and return_counts: + uniques, _, inverse, counts = xp.unique_all(values) + elif return_inverse: + uniques, inverse = xp.unique_inverse(values) + elif return_counts: + uniques, counts = xp.unique_counts(values) + else: + uniques = xp.unique_values(values) # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques if uniques.size and is_scalar_nan(uniques[-1]): - nan_idx = np.searchsorted(uniques, np.nan) + nan_idx = xp.searchsorted(uniques, xp.nan) uniques = uniques[: nan_idx + 1] if return_inverse: inverse[inverse > nan_idx] = nan_idx if return_counts: - counts[nan_idx] = np.sum(counts[nan_idx:]) + counts[nan_idx] = xp.sum(counts[nan_idx:]) counts = counts[: nan_idx + 1] ret = (uniques,) @@ -161,8 +160,9 @@ def __missing__(self, key): def _map_to_integer(values, uniques): """Map values based on its position in uniques.""" + xp, _ = get_namespace(values, uniques) table = _nandict({val: i for i, val in enumerate(uniques)}) - return np.array([table[v] for v in values]) + return xp.asarray([table[v] for v in values]) def _unique_python(values, *, return_inverse, return_counts): @@ -220,7 +220,13 @@ def _encode(values, *, 
uniques, check_unknown=True): encoded : ndarray Encoded values """ - if values.dtype.kind in "OUS": + xp, is_array_api_compliant = get_namespace(values, uniques) + if is_array_api_compliant: + dtype_kind = _convert_to_numpy(values, xp).dtype.kind + else: + dtype_kind = values.dtype.kind + + if dtype_kind in "OUS": try: return _map_to_integer(values, uniques) except KeyError as e: @@ -230,7 +236,7 @@ def _encode(values, *, uniques, check_unknown=True): diff = _check_unknown(values, uniques) if diff: raise ValueError(f"y contains previously unseen labels: {str(diff)}") - return np.searchsorted(uniques, values) + return xp.searchsorted(uniques, values) def _check_unknown(values, known_values, return_mask=False): @@ -258,9 +264,14 @@ def _check_unknown(values, known_values, return_mask=False): Additionally returned if ``return_mask=True``. """ + xp, is_array_api_compliant = get_namespace(values, known_values) valid_mask = None + if is_array_api_compliant: + dtype_kind = _convert_to_numpy(values, xp).dtype.kind + else: + dtype_kind = values.dtype.kind - if values.dtype.kind in "OUS": + if dtype_kind in "OUS": values_set = set(values) values_set, missing_in_values = _extract_missing(values_set) @@ -282,9 +293,9 @@ def is_valid(value): if return_mask: if diff or nan_in_diff or none_in_diff: - valid_mask = np.array([is_valid(value) for value in values]) + valid_mask = xp.array([is_valid(value) for value in values]) else: - valid_mask = np.ones(len(values), dtype=bool) + valid_mask = xp.ones(len(values), dtype=xp.bool) diff = list(diff) if none_in_diff: @@ -292,21 +303,21 @@ def is_valid(value): if nan_in_diff: diff.append(np.nan) else: - unique_values = np.unique(values) - diff = np.setdiff1d(unique_values, known_values, assume_unique=True) + unique_values = xp.unique_values(values) + diff = _setdiff1d(unique_values, known_values, xp, assume_unique=True) if return_mask: if diff.size: - valid_mask = np.isin(values, known_values) + valid_mask = _isin(values, known_values, 
xp) else: - valid_mask = np.ones(len(values), dtype=bool) + valid_mask = xp.ones(len(values), dtype=xp.bool) # check for nans in the known_values - if np.isnan(known_values).any(): - diff_is_nan = np.isnan(diff) - if diff_is_nan.any(): + if xp.any(xp.isnan(known_values)): + diff_is_nan = xp.isnan(diff) + if xp.any(diff_is_nan): # removes nan from valid_mask if diff.size and return_mask: - is_nan = np.isnan(values) + is_nan = xp.isnan(values) valid_mask[is_nan] = 1 # remove nan from diff diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 895ea98feffde..0bae0b1db34c1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -878,7 +878,18 @@ def check_array_api_input( X_xp = xp.asarray(X, device=device) y_xp = xp.asarray(y, device=device) - est.fit(X, y) + if "Label" in est.__class__.__name__: + fit_args = (y,) + xp_fit_args = (y_xp,) + method_arg = (y,) + xp_method_arg = (y_xp,) + else: + fit_args = (X, y) + xp_fit_args = (X_xp, y_xp) + method_arg = (X,) + xp_method_arg = (X_xp,) + + est.fit(*fit_args) array_attributes = { key: value for key, value in vars(est).items() if isinstance(value, np.ndarray) @@ -886,7 +897,7 @@ def check_array_api_input( est_xp = clone(est) with config_context(array_api_dispatch=True): - est_xp.fit(X_xp, y_xp) + est_xp.fit(*xp_fit_args) input_ns = get_namespace(X_xp)[0].__name__ # Fitted attributes which are arrays must have the same @@ -941,9 +952,9 @@ def check_array_api_input( assert abs(result - result_xp) < np.finfo(X.dtype).eps * 100 continue else: - result = method(X) + result = method(*method_arg) with config_context(array_api_dispatch=True): - result_xp = getattr(est_xp, method_name)(X_xp) + result_xp = getattr(est_xp, method_name)(*xp_method_arg) with config_context(array_api_dispatch=True): result_ns = get_namespace(result_xp)[0].__name__ From 7fbd4589d693145ebcb55f0cb4762dfa5fdce0ed Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 15 Sep 2023 15:05:48 
+0500 Subject: [PATCH 02/21] Add changelog --- doc/whats_new/v1.4.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 3e74cabb396b8..98e8ff0574e2f 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -273,6 +273,10 @@ Changelog our usual rolling deprecation cycle policy. See :ref:`array_api` for more details. :pr:`26243` by `Tim Head`_ and :pr:`27110` by :user:`Edoardo Abati `. +- |Enhancement| :class:`preprocessing.LabelEncoder` now supports the + `Array API `_. See :ref:`array_api` + for more details. :pr:`27381` by :user:`Omar Salman `. + :mod:`sklearn.tree` ................... From ec6ccc6c63ca718b8615f54f522683a5789c909d Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 15 Sep 2023 17:15:19 +0500 Subject: [PATCH 03/21] Add tests for array api functions --- sklearn/utils/_array_api.py | 83 ++++++++++++++++----------- sklearn/utils/tests/test_array_api.py | 56 ++++++++++++++++++ 2 files changed, 106 insertions(+), 33 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 554a5c71a859d..1314dc21a799f 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -608,35 +608,6 @@ def _atol_for_type(dtype): return numpy.finfo(dtype).eps * 100 -def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): - """Checks whether each element of an array is also present in a - second array. - - Returns a boolean array the same length as `ar1` that is True - where an element of `ar1` is in `ar2` and False otherwise - """ - if not assume_unique: - ar1, rev_idx = xp.unique_inverse(ar1) - ar2 = xp.unique_values(ar2) - - ar = xp.concat((ar1, ar2)) - # We need this to be a stable sort. 
- order = ar.argsort(stable=True) - sar = ar[order] - if invert: - bool_ar = sar[1:] != sar[:-1] - else: - bool_ar = sar[1:] == sar[:-1] - flag = xp.concat((bool_ar, xp.asarray([invert]))) - ret = xp.empty(ar.shape, dtype=xp.bool) - ret[order] = flag - - if assume_unique: - return ret[: len(ar1)] - else: - return ret[rev_idx] - - def _setdiff1d(ar1, ar2, xp, assume_unique=False): """Find the set difference of two arrays. @@ -656,7 +627,7 @@ def _setdiff1d(ar1, ar2, xp, assume_unique=False): else: ar1 = xp.unique_values(ar1) ar2 = xp.unique_values(ar2) - return ar1[_in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] + return ar1[__in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] def _isin(element, test_elements, xp, assume_unique=False, invert=False): @@ -676,14 +647,60 @@ def _isin(element, test_elements, xp, assume_unique=False, invert=False): ) ) - element = xp.asarray(element) + original_element_shape = element.shape + element = xp.reshape(xp.asarray(element), (-1,)) + test_elements = xp.reshape(xp.asarray(test_elements), (-1,)) return xp.reshape( - _in1d( + __in1d( ar1=element, ar2=test_elements, xp=xp, assume_unique=assume_unique, invert=invert, ), - element.shape, + original_element_shape, ) + + +# Note: This is a helper for the functions `_isin` and +# `_setdiff1d`. It is not meant to be called directly. +def __in1d(ar1, ar2, xp, assume_unique=False, invert=False): + """Checks whether each element of an array is also present in a + second array. 
+ + Returns a boolean array the same length as `ar1` that is True + where an element of `ar1` is in `ar2` and False otherwise + """ + + # This code is run to make the code significantly faster + if ar2.shape[0] < 10 * ar1.shape[0] ** 0.145: + if invert: + mask = xp.ones(ar1.shape[0], dtype=xp.bool) + for a in ar2: + mask &= ar1 != a + else: + mask = xp.zeros(ar1.shape[0], dtype=xp.bool) + for a in ar2: + mask |= ar1 == a + return mask + + if not assume_unique: + ar1, rev_idx = xp.unique_inverse(ar1) + ar2 = xp.unique_values(ar2) + + ar = xp.concat((ar1, ar2)) + # We need this to be a stable sort. + order = ar.argsort(stable=True) + sar = ar[order] + if invert: + bool_ar = sar[1:] != sar[:-1] + else: + bool_ar = sar[1:] == sar[:-1] + flag = xp.concat((bool_ar, xp.asarray([invert]))) + ret = xp.empty(ar.shape, dtype=xp.bool) + ret[order] = flag + + if assume_unique: + return ret[: len(ar1)] + else: + return ret[rev_idx] diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index bf2c0e1acb0fc..2d5b89e4dd2d2 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -1,6 +1,7 @@ from functools import partial import numpy +import numpy as np import pytest from numpy.testing import assert_allclose, assert_array_equal @@ -12,6 +13,7 @@ _atol_for_type, _convert_to_numpy, _estimator_with_converted_arrays, + _isin, _nanmax, _nanmin, _NumPyAPIWrapper, @@ -143,6 +145,29 @@ def test_array_api_wrapper_take(): xp.take(xp.asarray([[[0]]]), xp.asarray([0]), axis=0) +def test_array_api_wrapper_searchsorted(): + """Test _ArrayAPIWrapper API for searchsorted.""" + numpy_array_api = pytest.importorskip("numpy.array_api") + xp_ = _AdjustableNameAPITestWrapper(numpy_array_api, "wrapped_numpy.array_api") + xp = _ArrayAPIWrapper(xp_) + + # Check searchsorted compared to numpy's + a = xp.asarray([1, 2, 3, 4, 5], dtype=xp.float64) + v = 3.0 + result = xp.searchsorted(a, v) + assert hasattr(result, 
"__array_namespace__") + assert result == numpy.searchsorted(a, v) + + result = xp.searchsorted(a, v, side="right") + assert hasattr(result, "__array_namespace__") + assert result == numpy.searchsorted(a, v, side="right") + + v = xp.asarray([-10, 10, 2, 3], dtype=xp.float64) + result = xp.searchsorted(a, v) + assert hasattr(result, "__array_namespace__") + assert_array_equal(result, numpy.searchsorted(a, v)) + + @pytest.mark.parametrize("array_api", ["numpy", "numpy.array_api"]) def test_asarray_with_order(array_api): """Test _asarray_with_order passes along order for NumPy arrays.""" @@ -371,3 +396,34 @@ def test_get_namespace_array_api_isdtype(wrapper): with pytest.raises(ValueError, match="Unrecognized data type"): assert xp.isdtype(xp.int16, "unknown") + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize("invert", [True, False]) +@pytest.mark.parametrize("assume_unique", [True, False]) +@pytest.mark.parametrize("element_size", [6, 10, 14]) +def test_isin(array_namespace, device, dtype, invert, assume_unique, element_size): + xp, device, dtype = _array_api_for_tests(array_namespace, device, dtype) + r = element_size // 2 + element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(dtype) + test_elements = numpy.array(np.arange(14), dtype=dtype) + element_xp = xp.asarray(element, device=device) + test_elements_xp = xp.asarray(test_elements, device=device) + expected = numpy.isin( + element=element, + test_elements=test_elements, + assume_unique=assume_unique, + invert=invert, + ) + with config_context(array_api_dispatch=True): + result = _isin( + element=element_xp, + test_elements=test_elements_xp, + xp=xp, + assume_unique=assume_unique, + invert=invert, + ) + + assert_array_equal(result, expected) From 43b039d08ddc4c78ec4b5cd1043751709dcae0da Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sat, 23 Sep 2023 15:01:40 +0500 Subject: [PATCH 04/21] Updates: PR suggestions 
--- sklearn/utils/_array_api.py | 83 +++++++++++++++++++++------ sklearn/utils/_encode.py | 4 +- sklearn/utils/estimator_checks.py | 3 +- sklearn/utils/tests/test_array_api.py | 13 ++++- 4 files changed, 78 insertions(+), 25 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 1314dc21a799f..cc2453bc01e8e 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -9,6 +9,19 @@ from .._config import get_config from .fixes import parse_version +ARRAY_NAMESPACES = [ + # The following is used to test the array_api_compat wrapper when + # array_api_dispatch is enabled: in particular, the arrays used in the + # tests are regular numpy arrays without any "device" attribute. + "numpy", + # Stricter NumPy-based Array API implementation. The + # numpy.array_api.Array instances always a dummy "device" attribute. + "numpy.array_api", + "cupy", + "cupy.array_api", + "torch", +] + def yield_namespace_device_dtype_combinations(): """Yield supported namespace, device, dtype tuples for testing. @@ -28,18 +41,7 @@ def yield_namespace_device_dtype_combinations(): The name of the data type to use for arrays. Can be None to indicate that the default value should be used. """ - for array_namespace in [ - # The following is used to test the array_api_compat wrapper when - # array_api_dispatch is enabled: in particular, the arrays used in the - # tests are regular numpy arrays without any "device" attribute. - "numpy", - # Stricter NumPy-based Array API implementation. The - # numpy.array_api.Array instances always a dummy "device" attribute. 
- "numpy.array_api", - "cupy", - "cupy.array_api", - "torch", - ]: + for array_namespace in ARRAY_NAMESPACES: if array_namespace == "torch": for device, dtype in itertools.product( ("cpu", "cuda"), ("float64", "float32") @@ -50,6 +52,43 @@ def yield_namespace_device_dtype_combinations(): yield array_namespace, None, None +def yield_namespace_device_int_dtype_combinations(): + """Yield supported namespace, device, int dtype tuples for testing. + + Use this to test that an estimator works with all combinations. + + Returns + ------- + array_namespace : str + The name of the Array API namespace. + + device : str + The name of the device on which to allocate the arrays. Can be None to + indicate that the default value should be used. + + dtype : str + The name of the int data type to use for arrays. Can be None to + indicate that the default value should be used. + """ + for array_namespace in ARRAY_NAMESPACES: + if array_namespace == "torch": + for device, dtype in itertools.product( + ("cpu", "cuda", "mps"), ("int16", "int32", "int64", "uint8") + ): + yield array_namespace, device, dtype + else: + for dtype in ( + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + ): + yield array_namespace, None, dtype + + def _check_array_api_dispatch(array_api_dispatch): """Check that array_api_compat is installed and NumPy version is compatible. 
@@ -233,10 +272,16 @@ def isdtype(self, dtype, kind): return isdtype(dtype, kind, xp=self._namespace) def searchsorted(self, a, v, *, side="left", sorter=None): + # Temporary workaround needed as long as searchsorted is not part + # of the Array API spec: + # https://github.com/data-apis/array-api/issues/688 + if hasattr(self._namespace, "searchsorted"): + return self._namespace.searchsorted(a, v, side=side, sorter=sorter) + a = _convert_to_numpy(a, xp=self._namespace) v = _convert_to_numpy(v, xp=self._namespace) indices = numpy.searchsorted(a, v, side=side, sorter=sorter) - return self._namespace.asarray(indices) + return self._namespace.asarray(indices, device=device(a)) def _check_device_cpu(device): # noqa @@ -623,11 +668,11 @@ def _setdiff1d(ar1, ar2, xp, assume_unique=False): ) if assume_unique: - ar1 = xp.reshape(xp.asarray(ar1), (-1,)) + ar1 = xp.reshape(ar1, (-1,)) else: ar1 = xp.unique_values(ar1) ar2 = xp.unique_values(ar2) - return ar1[__in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] + return ar1[_in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] def _isin(element, test_elements, xp, assume_unique=False, invert=False): @@ -648,10 +693,10 @@ def _isin(element, test_elements, xp, assume_unique=False, invert=False): ) original_element_shape = element.shape - element = xp.reshape(xp.asarray(element), (-1,)) - test_elements = xp.reshape(xp.asarray(test_elements), (-1,)) + element = xp.reshape(element, (-1,)) + test_elements = xp.reshape(test_elements, (-1,)) return xp.reshape( - __in1d( + _in1d( ar1=element, ar2=test_elements, xp=xp, @@ -664,7 +709,7 @@ def _isin(element, test_elements, xp, assume_unique=False, invert=False): # Note: This is a helper for the functions `_isin` and # `_setdiff1d`. It is not meant to be called directly. 
-def __in1d(ar1, ar2, xp, assume_unique=False, invert=False): +def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): """Checks whether each element of an array is also present in a second array. diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 55f422f487be7..b885b646bb887 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -5,7 +5,7 @@ import numpy as np from . import is_scalar_nan -from ._array_api import _convert_to_numpy, _isin, _setdiff1d, get_namespace +from ._array_api import _convert_to_numpy, _isin, _setdiff1d, device, get_namespace def _unique(values, *, return_inverse=False, return_counts=False): @@ -162,7 +162,7 @@ def _map_to_integer(values, uniques): """Map values based on its position in uniques.""" xp, _ = get_namespace(values, uniques) table = _nandict({val: i for i, val in enumerate(uniques)}) - return xp.asarray([table[v] for v in values]) + return xp.asarray([table[v] for v in values], device=device(values)) def _unique_python(values, *, return_inverse, return_counts): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fdc870705fef6..1dfb4a238b70a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -878,7 +878,8 @@ def check_array_api_input( X_xp = xp.asarray(X, device=device) y_xp = xp.asarray(y, device=device) - if "Label" in est.__class__.__name__: + X_types = est._get_tags().get("X_types", [""]) + if "labels" in X_types[0]: fit_args = (y,) xp_fit_args = (y_xp,) method_arg = (y,) diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 2d5b89e4dd2d2..237fd05f3190d 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -21,6 +21,7 @@ get_namespace, supported_float_dtypes, yield_namespace_device_dtype_combinations, + yield_namespace_device_int_dtype_combinations, ) from sklearn.utils._testing import ( _array_api_for_tests, @@ -156,15 +157,18 
@@ def test_array_api_wrapper_searchsorted(): v = 3.0 result = xp.searchsorted(a, v) assert hasattr(result, "__array_namespace__") + assert result.__array_namespace__().__name__ == "numpy.array_api" assert result == numpy.searchsorted(a, v) result = xp.searchsorted(a, v, side="right") assert hasattr(result, "__array_namespace__") + assert result.__array_namespace__().__name__ == "numpy.array_api" assert result == numpy.searchsorted(a, v, side="right") v = xp.asarray([-10, 10, 2, 3], dtype=xp.float64) result = xp.searchsorted(a, v) assert hasattr(result, "__array_namespace__") + assert result.__array_namespace__().__name__ == "numpy.array_api" assert_array_equal(result, numpy.searchsorted(a, v)) @@ -399,13 +403,16 @@ def test_get_namespace_array_api_isdtype(wrapper): @pytest.mark.parametrize( - "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() + "array_namespace, device, _", yield_namespace_device_int_dtype_combinations() ) @pytest.mark.parametrize("invert", [True, False]) @pytest.mark.parametrize("assume_unique", [True, False]) @pytest.mark.parametrize("element_size", [6, 10, 14]) -def test_isin(array_namespace, device, dtype, invert, assume_unique, element_size): - xp, device, dtype = _array_api_for_tests(array_namespace, device, dtype) +@pytest.mark.parametrize("int_dtype", ["int32", "int64", "uint8"]) +def test_isin( + array_namespace, device, _, invert, assume_unique, element_size, int_dtype +): + xp, device, dtype = _array_api_for_tests(array_namespace, device, int_dtype) r = element_size // 2 element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(dtype) test_elements = numpy.array(np.arange(14), dtype=dtype) From cfdabebd93d02702835ea8dc3477c1a61a00bf4b Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 4 Apr 2024 17:57:09 +0500 Subject: [PATCH 05/21] Fix dtype_name parameter --- sklearn/preprocessing/tests/test_label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index e37e84343abec..5bf63b21267ac 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -721,4 +721,4 @@ def test_label_encoder_array_api_compliance( estimator, check, array_namespace, device, dtype ): name = estimator.__class__.__name__ - check(name, estimator, array_namespace, device=device, dtype=dtype) + check(name, estimator, array_namespace, device=device, dtype_name=dtype) From 23ee51015d12d71ee62c4e40e935c8e6d540fc06 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 12 Apr 2024 17:28:46 +0500 Subject: [PATCH 06/21] Updates as suggested in review --- sklearn/utils/_array_api.py | 56 +++++---------------------- sklearn/utils/_encode.py | 34 +++++++++++----- sklearn/utils/tests/test_array_api.py | 11 +++--- 3 files changed, 39 insertions(+), 62 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 758df7df61f92..0badef6f22198 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -60,49 +60,6 @@ def yield_namespace_device_dtype_combinations(include_numpy_namespaces=True): yield array_namespace, None, None -def yield_namespace_device_int_dtype_combinations(): - """Yield supported namespace, device, int dtype tuples for testing. - - Use this to test that an estimator works with all combinations. - - Returns - ------- - array_namespace : str - The name of the Array API namespace. - - device : str - The name of the device on which to allocate the arrays. Can be None to - indicate that the default value should be used. - - dtype : str - The name of the int data type to use for arrays. Can be None to - indicate that the default value should be used. 
- """ - for array_namespace in [ - "numpy", - "array_api_strict", - "cupy", - "cupy.array_api", - "torch", - ]: - if array_namespace == "torch": - for device, dtype in itertools.product( - ("cpu", "cuda", "mps"), ("int16", "int32", "int64", "uint8") - ): - yield array_namespace, device, dtype - else: - for dtype in ( - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - ): - yield array_namespace, None, dtype - - def _check_array_api_dispatch(array_api_dispatch): """Check that array_api_compat is installed and NumPy version is compatible. @@ -325,9 +282,10 @@ def isdtype(self, dtype, kind): return isdtype(dtype, kind, xp=self._namespace) def searchsorted(self, a, v, *, side="left", sorter=None): - # Temporary workaround needed as long as searchsorted is not part - # of the Array API spec: - # https://github.com/data-apis/array-api/issues/688 + # Temporary workaround needed as long as searchsorted is not widely + # adopted by implementers of the Array API spec. This is a quite + # recent addition to the spec: + # https://data-apis.org/array-api/latest/API_specification/generated/array_api.searchsorted.html # noqa if hasattr(self._namespace, "searchsorted"): return self._namespace.searchsorted(a, v, side=side, sorter=sorter) @@ -911,7 +869,11 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): second array. Returns a boolean array the same length as `ar1` that is True - where an element of `ar1` is in `ar2` and False otherwise + where an element of `ar1` is in `ar2` and False otherwise. 
+ + This function has been adapted using the original implementation + present in numpy: + https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/arraysetops.py#L524-L758 """ # This code is run to make the code significantly faster diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 420e736f66b89..2c1a4fe73e517 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,7 +4,13 @@ import numpy as np -from ._array_api import _convert_to_numpy, _isin, _setdiff1d, device, get_namespace +from ._array_api import ( + _is_numpy_namespace, + _isin, + _setdiff1d, + device, + get_namespace, +) from ._missing import is_scalar_nan @@ -221,12 +227,17 @@ def _encode(values, *, uniques, check_unknown=True): Encoded values """ xp, is_array_api_compliant = get_namespace(values, uniques) - if is_array_api_compliant: - dtype_kind = _convert_to_numpy(values, xp).dtype.kind + if is_array_api_compliant and not _is_numpy_namespace(xp=xp): + try: + dtype = values.dtype + dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype + numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) + except ValueError: + numeric_dtype = False else: - dtype_kind = values.dtype.kind + numeric_dtype = values.dtype.kind not in "OUS" - if dtype_kind in "OUS": + if not numeric_dtype: try: return _map_to_integer(values, uniques) except KeyError as e: @@ -266,12 +277,17 @@ def _check_unknown(values, known_values, return_mask=False): """ xp, is_array_api_compliant = get_namespace(values, known_values) valid_mask = None - if is_array_api_compliant: - dtype_kind = _convert_to_numpy(values, xp).dtype.kind + if is_array_api_compliant and not _is_numpy_namespace(xp=xp): + try: + dtype = values.dtype + dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype + numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) + except ValueError: + numeric_dtype = False else: - dtype_kind = values.dtype.kind + numeric_dtype = values.dtype.kind not in "OUS" - if dtype_kind in "OUS": + if 
not numeric_dtype: values_set = set(values) values_set, missing_in_values = _extract_missing(values_set) diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 5cf8347cadfd7..8814d61a1e656 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -24,7 +24,6 @@ indexing_dtype, supported_float_dtypes, yield_namespace_device_dtype_combinations, - yield_namespace_device_int_dtype_combinations, ) from sklearn.utils._testing import ( _array_api_for_tests, @@ -489,19 +488,19 @@ def test_indexing_dtype(namespace, _device, _dtype): @pytest.mark.parametrize( - "array_namespace, device, _", yield_namespace_device_int_dtype_combinations() + "array_namespace, device, _", yield_namespace_device_dtype_combinations() ) @pytest.mark.parametrize("invert", [True, False]) @pytest.mark.parametrize("assume_unique", [True, False]) @pytest.mark.parametrize("element_size", [6, 10, 14]) -@pytest.mark.parametrize("int_dtype", ["int32", "int64", "uint8"]) +@pytest.mark.parametrize("int_dtype", ["int16", "int32", "int64", "uint8"]) def test_isin( array_namespace, device, _, invert, assume_unique, element_size, int_dtype ): - xp, device, dtype = _array_api_for_tests(array_namespace, device, int_dtype) + xp = _array_api_for_tests(array_namespace, device) r = element_size // 2 - element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(dtype) - test_elements = numpy.array(np.arange(14), dtype=dtype) + element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(int_dtype) + test_elements = numpy.array(np.arange(14), dtype=int_dtype) element_xp = xp.asarray(element, device=device) test_elements_xp = xp.asarray(test_elements, device=device) expected = numpy.isin( From 61774758338c6bed04c002dc991044c2972980e1 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 3 May 2024 11:44:32 +0500 Subject: [PATCH 07/21] Revert changes in estimator_checks --- sklearn/utils/estimator_checks.py | 20 ++++---------------- 1
file changed, 4 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index da2e189632d46..59d371bad57cd 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -897,19 +897,7 @@ def check_array_api_input( X_xp = xp.asarray(X, device=device) y_xp = xp.asarray(y, device=device) - X_types = est._get_tags().get("X_types", [""]) - if "labels" in X_types[0]: - fit_args = (y,) - xp_fit_args = (y_xp,) - method_arg = (y,) - xp_method_arg = (y_xp,) - else: - fit_args = (X, y) - xp_fit_args = (X_xp, y_xp) - method_arg = (X,) - xp_method_arg = (X_xp,) - - est.fit(*fit_args) + est.fit(X, y) array_attributes = { key: value for key, value in vars(est).items() if isinstance(value, np.ndarray) @@ -917,7 +905,7 @@ def check_array_api_input( est_xp = clone(est) with config_context(array_api_dispatch=True): - est_xp.fit(*xp_fit_args) + est_xp.fit(X_xp, y_xp) input_ns = get_namespace(X_xp)[0].__name__ # Fitted attributes which are arrays must have the same @@ -972,9 +960,9 @@ def check_array_api_input( assert abs(result - result_xp) < _atol_for_type(X.dtype) continue else: - result = method(*method_arg) + result = method(X) with config_context(array_api_dispatch=True): - result_xp = getattr(est_xp, method_name)(*xp_method_arg) + result_xp = getattr(est_xp, method_name)(X_xp) with config_context(array_api_dispatch=True): result_ns = get_namespace(result_xp)[0].__name__ From a21a490976a20153db89a43b52ca7c3ede349b9a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 3 May 2024 16:48:26 +0500 Subject: [PATCH 08/21] Improve the tests and handle device in _in1d --- sklearn/preprocessing/tests/test_label.py | 55 +++++++++++++++-------- sklearn/utils/_array_api.py | 5 ++- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5bf63b21267ac..3369ec080418e 100644 --- 
a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -2,7 +2,7 @@ import pytest from scipy.sparse import issparse -from sklearn import datasets +from sklearn import config_context, datasets from sklearn.preprocessing._label import ( LabelBinarizer, LabelEncoder, @@ -11,11 +11,15 @@ _inverse_binarize_thresholding, label_binarize, ) -from sklearn.utils._array_api import yield_namespace_device_dtype_combinations -from sklearn.utils._testing import assert_array_equal, ignore_warnings -from sklearn.utils.estimator_checks import ( - _get_check_estimator_ids, - check_array_api_input_and_values, +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_array_equal, + ignore_warnings, ) from sklearn.utils.fixes import ( COO_CONTAINERS, @@ -708,17 +712,30 @@ def test_label_encoders_do_not_have_set_output(encoder): "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() ) @pytest.mark.parametrize( - "check", - [check_array_api_input_and_values], - ids=_get_check_estimator_ids, -) -@pytest.mark.parametrize( - "estimator", - [LabelEncoder()], - ids=_get_check_estimator_ids, + "y", + [ + np.array([2, 1, 3, 1, 3]), + np.array([1, 1, 4, 5, -1, 0]), + np.array([3, 5, 9, 5, 9, 3]), + ], ) -def test_label_encoder_array_api_compliance( - estimator, check, array_namespace, device, dtype -): - name = estimator.__class__.__name__ - check(name, estimator, array_namespace, device=device, dtype_name=dtype) +def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + xp_y = xp.asarray(y, device=device) + xp_label = LabelEncoder() + with config_context(array_api_dispatch=True): + xp_label_fit = xp_label.fit(xp_y) + xp_transformed = xp_label_fit.transform(xp_y) + xp_inv_transformed = 
xp_label_fit.inverse_transform(xp_transformed) + np_label = LabelEncoder() + np_label_fit = np_label.fit(y) + np_transformed = np_label_fit.transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + + xp_transformed = xp_label.fit_transform(xp_y) + np_transformed = np_label.fit_transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index a4ac7a7fe55a7..458d739d674dc 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -923,15 +923,16 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): present in numpy: https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/arraysetops.py#L524-L758 """ + xp, _ = get_namespace(ar1, ar2, xp=xp) # This code is run to make the code significantly faster if ar2.shape[0] < 10 * ar1.shape[0] ** 0.145: if invert: - mask = xp.ones(ar1.shape[0], dtype=xp.bool) + mask = xp.ones(ar1.shape[0], dtype=xp.bool, device=device(ar1)) for a in ar2: mask &= ar1 != a else: - mask = xp.zeros(ar1.shape[0], dtype=xp.bool) + mask = xp.zeros(ar1.shape[0], dtype=xp.bool, device=device(ar1)) for a in ar2: mask |= ar1 == a return mask From b09b57bb9937c11d0de46061a4443c2d7d14d2ac Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 3 May 2024 17:56:22 +0200 Subject: [PATCH 09/21] Fix missing device specification and explicit conversion to numpy --- sklearn/utils/_array_api.py | 5 +++-- sklearn/utils/tests/test_array_api.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 458d739d674dc..1e75bc3dcb26b 100644 --- a/sklearn/utils/_array_api.py +++ 
b/sklearn/utils/_array_api.py @@ -942,6 +942,7 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): ar2 = xp.unique_values(ar2) ar = xp.concat((ar1, ar2)) + device_ = device(ar) # We need this to be a stable sort. order = ar.argsort(stable=True) sar = ar[order] @@ -949,8 +950,8 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): bool_ar = sar[1:] != sar[:-1] else: bool_ar = sar[1:] == sar[:-1] - flag = xp.concat((bool_ar, xp.asarray([invert]))) - ret = xp.empty(ar.shape, dtype=xp.bool) + flag = xp.concat((bool_ar, xp.asarray([invert], device=device_))) + ret = xp.empty(ar.shape, dtype=xp.bool, device=device_) ret[order] = flag if assume_unique: diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 0594363ef50c4..5d23c8c376e75 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -540,4 +540,4 @@ def test_isin( invert=invert, ) - assert_array_equal(result, expected) + assert_array_equal(_convert_to_numpy(result, xp=xp), expected) From 0544c32052750668f67e794ac7fcec8114d82353 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 3 May 2024 18:23:35 +0200 Subject: [PATCH 10/21] Fix _isin to work with Array API inputs --- sklearn/utils/_array_api.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 1e75bc3dcb26b..54d278323c767 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -944,17 +944,17 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): ar = xp.concat((ar1, ar2)) device_ = device(ar) # We need this to be a stable sort. 
- order = ar.argsort(stable=True) - sar = ar[order] + order = xp.argsort(ar, stable=True) + reverse_order = xp.argsort(order, stable=True) + sar = xp.take(ar, order) if invert: bool_ar = sar[1:] != sar[:-1] else: bool_ar = sar[1:] == sar[:-1] flag = xp.concat((bool_ar, xp.asarray([invert], device=device_))) - ret = xp.empty(ar.shape, dtype=xp.bool, device=device_) - ret[order] = flag + ret = xp.take(flag, reverse_order) if assume_unique: - return ret[: len(ar1)] + return ret[: ar1.shape[0]] else: - return ret[rev_idx] + return xp.take(ret, rev_idx) From a34138bbd631de5b1815d40964d069256cd88c53 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 6 May 2024 15:30:13 +0500 Subject: [PATCH 11/21] Fix the errors, make searchsorted a helper function --- sklearn/preprocessing/_label.py | 8 ++++++-- sklearn/utils/_array_api.py | 27 ++++++++++++++------------- sklearn/utils/_encode.py | 5 +++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 5a1ca9638c15b..3c439733bfb94 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import column_or_1d -from ..utils._array_api import _setdiff1d, get_namespace +from ..utils._array_api import _setdiff1d, device, get_namespace from ..utils._encode import _encode, _unique from ..utils._param_validation import Interval, validate_params from ..utils.multiclass import type_of_target, unique_labels @@ -158,7 +158,11 @@ def inverse_transform(self, y): if _num_samples(y) == 0: return xp.asarray([]) - diff = _setdiff1d(ar1=y, ar2=xp.arange(self.classes_.shape[0]), xp=xp) + diff = _setdiff1d( + ar1=y, + ar2=xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) if diff.shape[0]: raise ValueError("y contains previously unseen labels: %s" % str(diff)) y = xp.asarray(y) diff --git a/sklearn/utils/_array_api.py 
b/sklearn/utils/_array_api.py index 54d278323c767..68dbd450803c1 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -302,19 +302,6 @@ def __eq__(self, other): def isdtype(self, dtype, kind): return isdtype(dtype, kind, xp=self._namespace) - def searchsorted(self, a, v, *, side="left", sorter=None): - # Temporary workaround needed as long as searchsorted is not widely - # adopted by implementers of the Array API spec. This is a quite - # recent addition to the spec: - # https://data-apis.org/array-api/latest/API_specification/generated/array_api.searchsorted.html # noqa - if hasattr(self._namespace, "searchsorted"): - return self._namespace.searchsorted(a, v, side=side, sorter=sorter) - - a = _convert_to_numpy(a, xp=self._namespace) - v = _convert_to_numpy(v, xp=self._namespace) - indices = numpy.searchsorted(a, v, side=side, sorter=sorter) - return self._namespace.asarray(indices, device=device(a)) - def _check_device_cpu(device): # noqa if device not in {"cpu", None}: @@ -856,6 +843,20 @@ def indexing_dtype(xp): return xp.asarray(0).dtype +def _searchsorted(xp, a, v, *, side="left", sorter=None): + # Temporary workaround needed as long as searchsorted is not widely + # adopted by implementers of the Array API spec. This is a quite + # recent addition to the spec: + # https://data-apis.org/array-api/latest/API_specification/generated/array_api.searchsorted.html # noqa + if hasattr(xp, "searchsorted"): + return xp.searchsorted(a, v, side=side, sorter=sorter) + + a_np = _convert_to_numpy(a, xp=xp) + v_np = _convert_to_numpy(v, xp=xp) + indices = numpy.searchsorted(a_np, v_np, side=side, sorter=sorter) + return xp.asarray(indices, device=device(a)) + + def _setdiff1d(ar1, ar2, xp, assume_unique=False): """Find the set difference of two arrays. 
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 2c1a4fe73e517..a0f26aca9ad54 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -7,6 +7,7 @@ from ._array_api import ( _is_numpy_namespace, _isin, + _searchsorted, _setdiff1d, device, get_namespace, @@ -74,7 +75,7 @@ def _unique_np(values, return_inverse=False, return_counts=False): # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques if uniques.size and is_scalar_nan(uniques[-1]): - nan_idx = xp.searchsorted(uniques, xp.nan) + nan_idx = _searchsorted(xp, uniques, xp.nan) uniques = uniques[: nan_idx + 1] if return_inverse: inverse[inverse > nan_idx] = nan_idx @@ -247,7 +248,7 @@ def _encode(values, *, uniques, check_unknown=True): diff = _check_unknown(values, uniques) if diff: raise ValueError(f"y contains previously unseen labels: {str(diff)}") - return xp.searchsorted(uniques, values) + return _searchsorted(xp, uniques, values) def _check_unknown(values, known_values, return_mask=False): From beb036a74b82e41789d82039e4ef3a8e34243d87 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 6 May 2024 17:44:20 +0500 Subject: [PATCH 12/21] Add array_api_support tag --- sklearn/preprocessing/_label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 3c439733bfb94..2db1486c0251d 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -169,7 +169,7 @@ def inverse_transform(self, y): return xp.take(self.classes_, y, axis=0) def _more_tags(self): - return {"X_types": ["1dlabels"]} + return {"X_types": ["1dlabels"], "array_api_support": True} class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): From 34c2d9201506501b3e2f24fc3a9590d3cf563bf0 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 7 May 2024 11:18:48 +0500 Subject: [PATCH 13/21] Updates: according to some pr 
suggestions --- sklearn/utils/_array_api.py | 6 +++--- sklearn/utils/_encode.py | 38 +++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 68dbd450803c1..78a1d1023c355 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -947,15 +947,15 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): # We need this to be a stable sort. order = xp.argsort(ar, stable=True) reverse_order = xp.argsort(order, stable=True) - sar = xp.take(ar, order) + sar = xp.take(ar, order, axis=0) if invert: bool_ar = sar[1:] != sar[:-1] else: bool_ar = sar[1:] == sar[:-1] flag = xp.concat((bool_ar, xp.asarray([invert], device=device_))) - ret = xp.take(flag, reverse_order) + ret = xp.take(flag, reverse_order, axis=0) if assume_unique: return ret[: ar1.shape[0]] else: - return xp.take(ret, rev_idx) + return xp.take(ret, rev_idx, axis=0) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index a0f26aca9ad54..7e768604de86e 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -198,6 +198,20 @@ def _unique_python(values, *, return_inverse, return_counts): return ret[0] if len(ret) == 1 else ret +def _is_array_of_numeric_dtype(arr, xp, is_array_api_compliant): + if is_array_api_compliant and not _is_numpy_namespace(xp=xp): + try: + dtype = arr.dtype + dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype + numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) + except ValueError: + numeric_dtype = False + else: + numeric_dtype = arr.dtype.kind not in "OUS" + + return numeric_dtype + + def _encode(values, *, uniques, check_unknown=True): """Helper function to encode values into [0, n_uniques - 1]. 
@@ -228,15 +242,9 @@ def _encode(values, *, uniques, check_unknown=True): Encoded values """ xp, is_array_api_compliant = get_namespace(values, uniques) - if is_array_api_compliant and not _is_numpy_namespace(xp=xp): - try: - dtype = values.dtype - dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype - numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) - except ValueError: - numeric_dtype = False - else: - numeric_dtype = values.dtype.kind not in "OUS" + numeric_dtype = _is_array_of_numeric_dtype( + arr=values, xp=xp, is_array_api_compliant=is_array_api_compliant + ) if not numeric_dtype: try: @@ -278,15 +286,9 @@ def _check_unknown(values, known_values, return_mask=False): """ xp, is_array_api_compliant = get_namespace(values, known_values) valid_mask = None - if is_array_api_compliant and not _is_numpy_namespace(xp=xp): - try: - dtype = values.dtype - dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype - numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) - except ValueError: - numeric_dtype = False - else: - numeric_dtype = values.dtype.kind not in "OUS" + numeric_dtype = _is_array_of_numeric_dtype( + arr=values, xp=xp, is_array_api_compliant=is_array_api_compliant + ) if not numeric_dtype: values_set = set(values) From db32acf9b0a1fd0a3f1f41ab18841129c392e07d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 7 May 2024 11:40:00 +0200 Subject: [PATCH 14/21] Use xp.isdtype(values.dtype, "numeric") directly --- sklearn/utils/_encode.py | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 7e768604de86e..3fd4d45f522e6 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -5,7 +5,6 @@ import numpy as np from ._array_api import ( - _is_numpy_namespace, _isin, _searchsorted, _setdiff1d, @@ -198,20 +197,6 @@ def _unique_python(values, *, return_inverse, return_counts): return ret[0] if len(ret) == 1 else ret -def 
_is_array_of_numeric_dtype(arr, xp, is_array_api_compliant): - if is_array_api_compliant and not _is_numpy_namespace(xp=xp): - try: - dtype = arr.dtype - dtype_kind = dtype.kind if hasattr(dtype, "kind") else dtype - numeric_dtype = xp.isdtype(dtype=dtype, kind=dtype_kind) - except ValueError: - numeric_dtype = False - else: - numeric_dtype = arr.dtype.kind not in "OUS" - - return numeric_dtype - - def _encode(values, *, uniques, check_unknown=True): """Helper function to encode values into [0, n_uniques - 1]. @@ -241,12 +226,8 @@ def _encode(values, *, uniques, check_unknown=True): encoded : ndarray Encoded values """ - xp, is_array_api_compliant = get_namespace(values, uniques) - numeric_dtype = _is_array_of_numeric_dtype( - arr=values, xp=xp, is_array_api_compliant=is_array_api_compliant - ) - - if not numeric_dtype: + xp, _ = get_namespace(values, uniques) + if not xp.isdtype(values.dtype, "numeric"): try: return _map_to_integer(values, uniques) except KeyError as e: @@ -284,13 +265,10 @@ def _check_unknown(values, known_values, return_mask=False): Additionally returned if ``return_mask=True``. 
""" - xp, is_array_api_compliant = get_namespace(values, known_values) + xp, _ = get_namespace(values, known_values) valid_mask = None - numeric_dtype = _is_array_of_numeric_dtype( - arr=values, xp=xp, is_array_api_compliant=is_array_api_compliant - ) - if not numeric_dtype: + if not xp.isdtype(values.dtype, "numeric"): values_set = set(values) values_set, missing_in_values = _extract_missing(values_set) From a5934783555abe159a6f5d69efb91384d7fb845b Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 7 May 2024 14:45:39 +0500 Subject: [PATCH 15/21] Update changelog --- doc/whats_new/v1.4.rst | 4 ---- doc/whats_new/v1.6.rst | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 5f60aefa0d95e..7865ff38adb79 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -926,10 +926,6 @@ Changelog `transform` without calling `fit` since `categories` always requires to be checked. :pr:`27821` by :user:`Guillaume Lemaitre `. -- |Enhancement| :class:`preprocessing.LabelEncoder` now supports the - `Array API `_. See :ref:`array_api` - for more details. :pr:`27381` by :user:`Omar Salman `. - :mod:`sklearn.tree` ................... diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index b90394c75b6ff..70a8fafb9ba12 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -22,6 +22,22 @@ Version 1.6.0 **In Development** +Support for Array API +--------------------- + +Additional estimators and functions have been updated to include support for all +`Array API `_ compliant inputs. + +See :ref:`array_api` for more details. + +**Functions:** + +**Classes:** + +- :class:`preprocessing.LabelEncoder` now supports the Array API + `Array API `_. See :ref:`array_api` + for more details. :pr:`27381` by :user:`Omar Salman `. 
+ Changelog --------- From 22fa6118b967a7dd13057723ab3148c93115f2eb Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 7 May 2024 15:08:03 +0500 Subject: [PATCH 16/21] Update docstring for inverse transform --- sklearn/preprocessing/_label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 2db1486c0251d..d9023b3301e54 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -143,12 +143,12 @@ def inverse_transform(self, y): Parameters ---------- - y : ndarray of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. Returns ------- - y : ndarray of shape (n_samples,) + y : array-like of shape (n_samples,) Original encoding. """ check_is_fitted(self) From f8144410ebd0276d17d70c9a51a41c10a2efa4c5 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 7 May 2024 21:16:45 +0500 Subject: [PATCH 17/21] Change array-like to array --- sklearn/preprocessing/_label.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index d9023b3301e54..54ff93e4f8f59 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -87,7 +87,7 @@ def fit(self, y): Parameters ---------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Target values. Returns @@ -104,12 +104,12 @@ def fit_transform(self, y): Parameters ---------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Target values. Returns ------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Encoded labels. """ y = column_or_1d(y, warn=True) @@ -121,12 +121,12 @@ def transform(self, y): Parameters ---------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Target values. 
Returns ------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Labels as normalized encodings. """ check_is_fitted(self) @@ -143,12 +143,12 @@ def inverse_transform(self, y): Parameters ---------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Target values. Returns ------- - y : array-like of shape (n_samples,) + y : array of shape (n_samples,) Original encoding. """ check_is_fitted(self) From 8ce860d5e69ce43ca6d4b975cde456afc37c791b Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 8 May 2024 15:57:52 +0500 Subject: [PATCH 18/21] Update the changelog definition to make it consistent --- doc/whats_new/v1.6.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index c6f049facff49..63c261497a3cc 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -38,9 +38,8 @@ See :ref:`array_api` for more details. **Classes:** -- :class:`preprocessing.LabelEncoder` now supports the Array API - `Array API `_. See :ref:`array_api` - for more details. :pr:`27381` by :user:`Omar Salman `. +- :class:`preprocessing.LabelEncoder` now supports Array API compatible inputs. + :pr:`27381` by :user:`Omar Salman `. Changelog --------- From fae25aa4c3f79219c5d453bb83931f8e9cb42010 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 10 May 2024 22:55:05 +0500 Subject: [PATCH 19/21] Revert and update parameter and return type names --- sklearn/preprocessing/_label.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 54ff93e4f8f59..ecf0c400a2c2f 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -87,7 +87,7 @@ def fit(self, y): Parameters ---------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. 
Returns @@ -104,12 +104,12 @@ def fit_transform(self, y): Parameters ---------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. Returns ------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Encoded labels. """ y = column_or_1d(y, warn=True) @@ -121,12 +121,12 @@ def transform(self, y): Parameters ---------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. Returns ------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Labels as normalized encodings. """ check_is_fitted(self) @@ -143,12 +143,12 @@ def inverse_transform(self, y): Parameters ---------- - y : array of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. Returns ------- - y : array of shape (n_samples,) + y : ndarray of shape (n_samples,) Original encoding. """ check_is_fitted(self) From dbf233a2f6aabf983cda035f179848ca1467e917 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 16 May 2024 14:22:18 +0500 Subject: [PATCH 20/21] Updates: Address further PR suggestions --- sklearn/preprocessing/tests/test_label.py | 13 +++++++++---- sklearn/utils/tests/test_array_api.py | 3 +-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 3369ec080418e..c7206f2f4b8a0 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -722,20 +722,25 @@ def test_label_encoders_do_not_have_set_output(encoder): def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): xp = _array_api_for_tests(array_namespace, device) xp_y = xp.asarray(y, device=device) - xp_label = LabelEncoder() with config_context(array_api_dispatch=True): - xp_label_fit = xp_label.fit(xp_y) - xp_transformed = xp_label_fit.transform(xp_y) - xp_inv_transformed = xp_label_fit.inverse_transform(xp_transformed) + xp_label = LabelEncoder() + 
xp_label = xp_label.fit(xp_y) + xp_transformed = xp_label.transform(xp_y) + xp_inv_transformed = xp_label.inverse_transform(xp_transformed) np_label = LabelEncoder() np_label_fit = np_label.fit(y) np_transformed = np_label_fit.transform(y) assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) + xp_label = LabelEncoder() xp_transformed = xp_label.fit_transform(xp_y) np_transformed = np_label.fit_transform(y) assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 5d23c8c376e75..30fc88c539fc8 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -2,7 +2,6 @@ from functools import partial import numpy -import numpy as np import pytest from numpy.testing import assert_allclose @@ -522,7 +521,7 @@ def test_isin( xp = _array_api_for_tests(array_namespace, device) r = element_size // 2 element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(int_dtype) - test_elements = numpy.array(np.arange(14), dtype=int_dtype) + test_elements = numpy.array(numpy.arange(14), dtype=int_dtype) element_xp = xp.asarray(element, device=device) test_elements_xp = xp.asarray(test_elements, device=device) expected = numpy.isin( From 7500c2f4b82731c75cff3852469f910d35a5e552 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 16 May 2024 14:23:53 +0500 
Subject: [PATCH 21/21] Minor adjustment --- sklearn/preprocessing/tests/test_label.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index c7206f2f4b8a0..90e3aa210eebb 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -724,12 +724,12 @@ def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): xp_y = xp.asarray(y, device=device) with config_context(array_api_dispatch=True): xp_label = LabelEncoder() + np_label = LabelEncoder() xp_label = xp_label.fit(xp_y) xp_transformed = xp_label.transform(xp_y) xp_inv_transformed = xp_label.inverse_transform(xp_transformed) - np_label = LabelEncoder() - np_label_fit = np_label.fit(y) - np_transformed = np_label_fit.transform(y) + np_label = np_label.fit(y) + np_transformed = np_label.transform(y) assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ @@ -738,6 +738,7 @@ def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) xp_label = LabelEncoder() + np_label = LabelEncoder() xp_transformed = xp_label.fit_transform(xp_y) np_transformed = np_label.fit_transform(y) assert get_namespace(xp_transformed)[0].__name__ == xp.__name__