diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
index 62ebf1b8e0093..7c67b7bb014ed 100644
--- a/doc/modules/array_api.rst
+++ b/doc/modules/array_api.rst
@@ -140,6 +140,7 @@ Tools
 -----
 
 - :func:`model_selection.train_test_split`
+- :func:`utils.check_consistent_length`
 
 Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub `_ to track progress.
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index fd5c30805a0c0..07e4c0b294436 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -16,7 +16,6 @@
 from numbers import Integral, Real
 
 import numpy as np
-from scipy.integrate import trapezoid
 from scipy.sparse import csr_matrix, issparse
 from scipy.stats import rankdata
 
@@ -28,6 +27,7 @@
     check_consistent_length,
     column_or_1d,
 )
+from ..utils._array_api import _trapezoid, get_namespace
 from ..utils._encode import _encode, _unique
 from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
 from ..utils.extmath import stable_cumsum
@@ -79,6 +79,8 @@ def auc(x, y):
     >>> metrics.auc(fpr, tpr)
     np.float64(0.75)
     """
+    xp, _ = get_namespace(x, y)
+
     check_consistent_length(x, y)
     x = column_or_1d(x)
     y = column_or_1d(y)
@@ -90,19 +92,14 @@ def auc(x, y):
         )
 
     direction = 1
-    dx = np.diff(x)
-    if np.any(dx < 0):
-        if np.all(dx <= 0):
+    dx = xp.subtract(x[1:], x[:-1])
+    if xp.any(dx < 0):
+        if xp.all(dx <= 0):
             direction = -1
         else:
             raise ValueError("x is neither increasing nor decreasing : {}.".format(x))
 
-    area = direction * trapezoid(y, x)
-    if isinstance(area, np.memmap):
-        # Reductions such as .sum used internally in trapezoid do not return a
-        # scalar by default for numpy.memmap instances contrary to
-        # regular numpy.ndarray instances.
-        area = area.dtype.type(area)
+    area = direction * _trapezoid(y, x)
 
     return area
 
diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py
index ce967f1df083b..ed94212f9df01 100644
--- a/sklearn/metrics/cluster/_supervised.py
+++ b/sklearn/metrics/cluster/_supervised.py
@@ -1179,7 +1179,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False):
 
     .. versionadded:: 0.18
 
-    The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of
+    The Fowlkes-Mallows index (FMI) is defined as the geometric mean of
     the precision and recall::
 
         FMI = TP / sqrt((TP + FP) * (TP + FN))
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 4aeaf6ec3a4d7..cd96be22b4934 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -9,6 +9,7 @@
 from sklearn.datasets import make_multilabel_classification
 from sklearn.metrics import (
     accuracy_score,
+    auc,
     average_precision_score,
     balanced_accuracy_score,
     brier_score_loss,
@@ -1990,7 +1991,27 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name):
     )
 
 
+def check_array_api_regression_ranking(metric, array_namespace, device, dtype_name):
+    x_np = np.array([-1.1, -0.3, 0.4, 1.0, 4.0], dtype=dtype_name)
+    y_np = np.array([1.0, 0.5, -0.5, 2, 2], dtype=dtype_name)
+
+    metric_kwargs = {}
+
+    check_array_api_metric(
+        metric,
+        array_namespace,
+        device,
+        dtype_name,
+        a_np=x_np,
+        b_np=y_np,
+        **metric_kwargs,
+    )
+
+
 array_api_metric_checkers = {
+    auc: [
+        check_array_api_regression_ranking,
+    ],
     accuracy_score: [
         check_array_api_binary_classification_metric,
         check_array_api_multiclass_classification_metric,
diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py
index 645f1d61cf527..6a296581629f5 100644
--- a/sklearn/utils/_array_api.py
+++ b/sklearn/utils/_array_api.py
@@ -530,10 +530,11 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
     -------
     namespace : module
         Namespace shared by array objects. If any of the `arrays` are not arrays,
-        the namespace defaults to NumPy.
+        the namespace defaults to the NumPy namespace.
 
     is_array_api_compliant : bool
-        True if the arrays are containers that implement the Array API spec.
+        True if the arrays are containers that implement the array API spec (see
+        https://data-apis.org/array-api/latest/API_specification/).
         Always False when array_api_dispatch=False.
""" array_api_dispatch = get_config()["array_api_dispatch"] @@ -1045,3 +1046,18 @@ def _modify_in_place_if_numpy(xp, func, *args, out=None, **kwargs): else: out = func(*args, **kwargs) return out + + +def _trapezoid(y, x=None, dx=1.0, axis=None): + """Partial (one-dimensional) port of scipy.trapezoid to support the Array API.""" + xp, _, device = get_namespace_and_device(x, y) + + if size(y) < 2: + return xp.asarray(0, device=device, dtype=y.dtype) + + if x is None: + d = dx + else: + d = xp.subtract(x[1:], x[:-1]) + + return xp.sum(d * (y[:-1] + y[1:]) / 2.0) diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 8156662d6780d..63f1feac6a47f 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -4,6 +4,7 @@ import numpy import pytest +import scipy.integrate from numpy.testing import assert_allclose from sklearn._config import config_context @@ -23,6 +24,7 @@ _nanmin, _NumPyAPIWrapper, _ravel, + _trapezoid, device, get_namespace, get_namespace_and_device, @@ -606,3 +608,35 @@ def test_fill_or_add_to_diagonal(array_namespace, device_, dtype_name, wrap): _fill_or_add_to_diagonal(array_xp, value=1, xp=xp, add_value=False, wrap=wrap) numpy.fill_diagonal(array_np, val=1, wrap=wrap) assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np) + + +@pytest.mark.parametrize( + "x,y", + [ + ([], []), + ([1], [2]), + (2.0, [-1.0, 2.0, 1.0]), + ([1.0, 3.0, 4.0], [-1.0, 2.0, 1.0]), + ], +) +@pytest.mark.parametrize( + "array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations() +) +def test_trapezoid(x, y, array_namespace, device_, dtype_name): + xp = _array_api_for_tests(array_namespace, device_) + x_in = numpy.asarray(x, dtype=dtype_name) + x_in = xp.asarray(x_in, device=device_) + + y_in = numpy.asarray(y, dtype=dtype_name) + y_in = xp.asarray(y_in, device=device_) + + with config_context(array_api_dispatch=True): + if isinstance(x, float): + result = _trapezoid(y_in, dx=x) + expected = scipy.integrate.trapezoid(y, dx=x) + else: + result = _trapezoid(y_in, x_in) + expected = scipy.integrate.trapezoid(y, x=x) + + result = _convert_to_numpy(result, xp) + assert_allclose(result, expected, atol=_atol_for_type(dtype_name)) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e3216885d17e4..6242108101cff 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -34,6 +34,7 @@ check_X_y, deprecated, ) +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations from sklearn.utils._mocking import ( MockDataFrame, _MockEstimatorOnOffPrediction, @@ -41,6 +42,7 @@ from sklearn.utils._testing import ( SkipTest, TempMemmap, + _array_api_for_tests, _convert_container, assert_allclose, assert_allclose_dense_sparse, @@ -985,25 +987,54 @@ def test_check_is_fitted_with_attributes(wrap): def test_check_consistent_length(): + """Test that `check_consistent_length` raises on inconsistent lengths and wrong + input types trigger TypeErrors.""" check_consistent_length([1], [2], [3], [4], [5]) check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ["a", "b"]) check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2))) with pytest.raises(ValueError, match="inconsistent numbers of samples"): check_consistent_length([1, 2], [1]) + with pytest.raises(TypeError, match=r"got <\w+ 'int'>"): check_consistent_length([1, 2], 1) with pytest.raises(TypeError, match=r"got <\w+ 'object'>"): 
         check_consistent_length([1, 2], object())
-
     with pytest.raises(TypeError):
         check_consistent_length([1, 2], np.array(1))
-
     # Despite ensembles having __len__ they must raise TypeError
     with pytest.raises(TypeError, match="Expected sequence or array-like"):
         check_consistent_length([1, 2], RandomForestRegressor())
     # XXX: We should have a test with a string, but what is correct behaviour?
 
 
+@pytest.mark.parametrize(
+    "array_namespace, device, _", yield_namespace_device_dtype_combinations()
+)
+def test_check_consistent_length_array_API(array_namespace, device, _):
+    """Test that check_consistent_length works with different array types."""
+    xp = _array_api_for_tests(array_namespace, device)
+
+    check_consistent_length(
+        xp.asarray([1], device=device),
+        xp.asarray([2], device=device),
+    )
+    if xp.__name__ == "numpy":
+        check_consistent_length(
+            xp.asarray([[1, 2], [1, 2]], device=device),
+            xp.asarray([1, 2], device=device),
+            xp.asarray(["a", "b"], device=device),
+        )
+    else:
+        check_consistent_length(
+            xp.asarray([[1, 2], [1, 2]], device=device),
+            xp.asarray([1, 2], device=device),
+        )
+
+    with pytest.raises(ValueError, match="inconsistent numbers of samples"):
+        check_consistent_length(
+            xp.asarray([1, 2], device=device), xp.asarray([1], device=device)
+        )
+
+
 def test_check_dataframe_fit_attribute():
     # check pandas dataframe with 'fit' column does not raise error
     # https://github.com/scikit-learn/scikit-learn/issues/8415
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 8a8c12506216e..ebbd6024b19a8 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -465,10 +465,10 @@ def check_consistent_length(*arrays):
     >>> b = [2, 3, 4]
     >>> check_consistent_length(a, b)
     """
-
-    lengths = [_num_samples(X) for X in arrays if X is not None]
-    uniques = np.unique(lengths)
-    if len(uniques) > 1:
+    xp, _ = get_namespace(*arrays)
+    lengths = xp.asarray([_num_samples(X) for X in arrays if X is not None])
+    uniques = xp.unique_values(lengths)
+    if uniques.shape[0] > 1:
         raise ValueError(
             "Found input variables with inconsistent numbers of samples: %r"
             % [int(l) for l in lengths]
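
The `_trapezoid` helper in this patch only covers the one-dimensional case that `auc` needs (the `axis` argument is accepted but unused): it sums the pairwise interval widths times the mean of the two bracketing `y` values. For illustration only, a minimal sketch of that equivalence against `scipy.integrate.trapezoid` on plain NumPy inputs, not part of the patch:

import numpy as np
from scipy.integrate import trapezoid

# Same 1-D trapezoidal rule as the new helper: interval widths times the
# mean of the two bracketing y values, then summed.
x = np.array([1.0, 3.0, 4.0])
y = np.array([-1.0, 2.0, 1.0])

d = x[1:] - x[:-1]
manual = np.sum(d * (y[:-1] + y[1:]) / 2.0)

assert np.isclose(manual, trapezoid(y, x))  # both evaluate to 2.5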
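
A quick way to exercise the new dispatch path end to end, mirroring what `check_array_api_regression_ranking` does in `test_common.py`; this is a sketch only and assumes the branch is installed along with `array-api-strict` and `array-api-compat`:

import array_api_strict as xp
import numpy as np

from sklearn import config_context
from sklearn.metrics import auc

fpr = np.array([0.0, 0.1, 0.4, 1.0])
tpr = np.array([0.0, 0.6, 0.8, 1.0])

# Reference value computed on plain NumPy arrays (no dispatch involved).
expected = auc(fpr, tpr)

with config_context(array_api_dispatch=True):
    # With dispatch enabled, auc keeps the computation in the input namespace
    # and integrates through the _trapezoid helper instead of scipy.
    result = auc(xp.asarray(fpr), xp.asarray(tpr))

assert np.isclose(float(result), float(expected))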