diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index dac97146be221..808ef6afafd66 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -402,6 +402,9 @@ Changelog :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` by `Thomas Fan`_. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index b90218a97016b..d8566c85e7b73 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -118,15 +118,19 @@ def test_label_binarizer_set_label_encoding(): @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_label_binarizer_pandas_nullable(dtype): +@pytest.mark.parametrize("unique_first", [True, False]) +def test_label_binarizer_pandas_nullable(dtype, unique_first): """Checks that LabelBinarizer works with pandas nullable dtypes. Non-regression test for gh-25637. """ pd = pytest.importorskip("pandas") - from sklearn.preprocessing import LabelBinarizer y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + if unique_first: + # Calling unique creates a pandas array which has a different interface + # compared to a pandas Series. Specifically, pandas arrays do not have "iloc". + y_true = y_true.unique() lb = LabelBinarizer().fit(y_true) y_out = lb.transform([1, 0]) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5c9379facf366..028a5ee3042bf 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1762,6 +1762,25 @@ def test_boolean_series_remains_boolean(): assert_array_equal(res, expected) +@pytest.mark.parametrize("input_values", [[0, 1, 0, 1, 0, np.nan], [0, 1, 0, 1, 0, 1]]) +def test_pandas_array_returns_ndarray(input_values): + """Check pandas array with extensions dtypes returns a numeric ndarray. + + Non-regression test for gh-25637. + """ + pd = importorskip("pandas") + input_series = pd.array(input_values, dtype="Int32") + result = check_array( + input_series, + dtype=None, + ensure_2d=False, + allow_nd=False, + force_all_finite=False, + ) + assert np.issubdtype(result.dtype.kind, np.floating) + assert_allclose(result, input_values) + + @pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"]) def test_check_array_array_api_has_non_finite(array_namespace): """Checks that Array API arrays checks non-finite correctly.""" diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index eb56caa5e65e4..2d5f323195e97 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -626,6 +626,15 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): return False +def _is_extension_array_dtype(array): + try: + from pandas.api.types import is_extension_array_dtype + + return is_extension_array_dtype(array) + except ImportError: + return False + + def check_array( array, accept_sparse=False, @@ -777,7 +786,9 @@ def check_array( if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) - elif hasattr(array, "iloc") and hasattr(array, "dtype"): + elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( + array, "dtype" + ): # array is a pandas series pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype) if isinstance(array.dtype, np.dtype):