Fix train_test_split array API implementation

betatim · betatim · commit a796f33f7e29 · 2024-03-12T09:31:23.000Z
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -29,6 +29,12 @@
     indexable,
     metadata_routing,
 )
+from ..utils._array_api import (
+    _convert_to_numpy,
+    _is_numpy_namespace,
+    device,
+    get_namespace,
+)
 from ..utils._param_validation import Interval, RealNotInt, validate_params
 from ..utils.extmath import _approximate_mode
 from ..utils.metadata_routing import _MetadataRequester
@@ -2221,6 +2227,12 @@ def _iter_indices(self, X, y, groups=None):
             default_test_size=self._default_test_size,
         )
 
+        # Convert `y` to numpy because not all operations are supported by the
+        # Array API. `y` is probably never a very large array, so converting it
+        # should be cheap.
+        xp, _ = get_namespace(y)
+        y = _convert_to_numpy(y, xp=xp)
+
         if y.ndim == 2:
             # for multi-label y, map each distinct row to a string repr
             # using join because str(row) uses an ellipsis if len(row) > 1000
@@ -2787,6 +2799,13 @@ def train_test_split(
 
         train, test = next(cv.split(X=arrays[0], y=stratify))
 
+    xp, is_array_api = get_namespace(*arrays)
+    if is_array_api and not _is_numpy_namespace(xp):
+        # Move the train and test indexers to the same namespace and device as
+        # the arrays being indexed. Assumes all arrays share the device of arrays[0].
+        train = xp.asarray(train, device=device(arrays[0]))
+        test = xp.asarray(test, device=device(arrays[0]))
+
     return list(
         chain.from_iterable(
             (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
@@ -17,6 +17,7 @@
 
 from ..exceptions import DataConversionWarning
 from . import _joblib, metadata_routing
+from ._array_api import _is_numpy_namespace, get_namespace
 from ._bunch import Bunch
 from ._chunking import gen_batches, gen_even_slices
 from ._estimator_html_repr import estimator_html_repr
@@ -89,6 +90,9 @@
 
 def _array_indexing(array, key, key_dtype, axis):
     """Index an array or scipy.sparse consistently across NumPy version."""
+    xp, is_array_api = get_namespace(array)
+    if is_array_api and not _is_numpy_namespace(xp):
+        return xp.take(array, key, axis=axis)
     if issparse(array) and key_dtype == "bool":
         key = np.asarray(key)
     if isinstance(key, tuple):
@@ -215,10 +219,19 @@ def _determine_key_type(key, accept_slice=True):
             raise ValueError(err_msg)
         return key_type.pop()
     if hasattr(key, "dtype"):
-        try:
-            return array_dtype_to_str[key.dtype.kind]
-        except KeyError:
-            raise ValueError(err_msg)
+        xp, is_array_api = get_namespace(key)
+        if is_array_api and not _is_numpy_namespace(xp):
+            if xp.isdtype(key.dtype, "bool"):
+                return "bool"
+            elif xp.isdtype(key.dtype, "integral"):
+                return "int"
+            else:
+                raise ValueError(err_msg)
+        else:
+            try:
+                return array_dtype_to_str[key.dtype.kind]
+            except KeyError:
+                raise ValueError(err_msg)
     raise ValueError(err_msg)