From 15c305a3c316b86aee9b5f5995b02c2fa4220267 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 4 Dec 2024 18:22:57 +0100 Subject: [PATCH 1/3] FIX: deprecate integer valued numerical features for PDP --- sklearn/inspection/_partial_dependence.py | 19 +++++ .../tests/test_partial_dependence.py | 82 ++++++++++++++++--- 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 46cd357785357..7c777df364329 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -3,6 +3,7 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause +import warnings from collections.abc import Iterable import numpy as np @@ -699,6 +700,24 @@ def partial_dependence( if isinstance(features, (str, int)): features = [features] + for feature_idx, feature, is_cat in zip(features_indices, features, is_categorical): + if is_cat: + continue + + if _safe_indexing(X, feature_idx, axis=1).dtype.kind in "iu": + # TODO(1.8): raise a ValueError instead. + warnings.warn( + f"The column {feature!r} contains integer data. Partial " + "dependence plots are not supported for integer data: this " + "can lead to implicit rounding with NumPy arrays or even errors " + "with newer pandas versions. Please convert numerical features" + "to floating point dtypes ahead of time to avoid problems. " + "This will raise ValueError in scikit-learn 1.8.", + FutureWarning, + ) + # Do not warn again for other features to avoid spamming the caller. 
+ break + X_subset = _safe_indexing(X, features_indices, axis=1) custom_values_for_X_subset = { diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index aff12044ee32a..25cefe8d7e24f 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -2,6 +2,9 @@ Testing for the partial dependence module. """ +import re +import warnings + import numpy as np import pytest @@ -751,13 +754,14 @@ def test_partial_dependence_binary_model_grid_resolution( pd = pytest.importorskip("pandas") model = DummyClassifier() + rng = np.random.RandomState(0) X = pd.DataFrame( { - "a": np.random.randint(0, 10, size=100), - "b": np.random.randint(0, 10, size=100), + "a": rng.randint(0, 10, size=100).astype(np.float64), + "b": rng.randint(0, 10, size=100).astype(np.float64), } ) - y = pd.Series(np.random.randint(0, 2, size=100)) + y = pd.Series(rng.randint(0, 2, size=100)) model.fit(X, y) part_dep = partial_dependence( @@ -773,9 +777,9 @@ def test_partial_dependence_binary_model_grid_resolution( @pytest.mark.parametrize( "features, custom_values, n_vals_expected", [ - (["a"], {"a": [1, 2, 3, 4]}, 4), - (["a"], {"a": [1, 2]}, 2), - (["a"], {"a": [1]}, 1), + (["a"], {"a": [1.0, 2.0, 3.0, 4.0]}, 4), + (["a"], {"a": [1.0, 2.0]}, 2), + (["a"], {"a": [1.0]}, 1), ], ) def test_partial_dependence_binary_model_custom_values( @@ -784,7 +788,7 @@ def test_partial_dependence_binary_model_custom_values( pd = pytest.importorskip("pandas") model = DummyClassifier() - X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [6, 7, 8, 9]}) + X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [6.0, 7.0, 8.0, 9.0]}) y = pd.Series([0, 1, 0, 1]) model.fit(X, y) @@ -804,7 +808,7 @@ def test_partial_dependence_binary_model_custom_values( [ (["b"], {"b": ["a", "b"]}, 2), (["b"], {"b": ["a"]}, 1), - (["a", "b"], {"a": [1, 2], "b": ["a", "b"]}, 4), + (["a", "b"], {"a": [1.0, 2.0], "b": ["a", "b"]}, 4), 
], ) def test_partial_dependence_pipeline_custom_values( @@ -815,11 +819,11 @@ def test_partial_dependence_pipeline_custom_values( SimpleImputer(strategy="most_frequent"), OneHotEncoder(), DummyClassifier() ) - X = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "a", "b"]}) + X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", "b"]}) y = pd.Series([0, 1, 0, 1]) pl.fit(X, y) - X_holdout = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "a", None]}) + X_holdout = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", None]}) part_dep = partial_dependence( pl, X_holdout, @@ -1134,3 +1138,61 @@ def test_mixed_type_categorical(): ).fit(X, y) with pytest.raises(ValueError, match="The column #0 contains mixed data types"): partial_dependence(clf, X, features=[0]) + + +def test_reject_array_with_integer_dtype(): + X = np.arange(8).reshape(4, 2) + y = np.array([0, 1, 0, 1]) + clf = DummyClassifier() + clf.fit(X, y) + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=0) + + with pytest.warns( + FutureWarning, match=re.escape("The column 1 contains integer data.") + ): + partial_dependence(clf, X, features=[1], categorical_features=[0]) + + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=[0, 1]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=1, categorical_features=[1]) + + +def test_reject_pandas_with_integer_dtype(): + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "a": [1.0, 2.0, 3.0], + "b": [1, 2, 3], + "c": [1, 2, 3], + } + ) + y = np.array([0, 1, 0]) + clf = DummyClassifier() + clf.fit(X, y) + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features="c") + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features=["a", "c"]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. + with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=["a"]) + partial_dependence(clf, X, features=["c"], categorical_features=["c"]) From b9097c947c4c77119aaa881c69e854f1683d6833 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 4 Dec 2024 19:18:59 +0100 Subject: [PATCH 2/3] Fix warning raised in test_plot_partial_dependence_legend --- sklearn/inspection/_plot/tests/test_plot_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index 3fa623c39b787..b2338b5c03b3a 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -870,7 +870,7 @@ def test_plot_partial_dependence_legend(pyplot): X = pd.DataFrame( { "col_A": ["A", "B", "C"], - "col_B": [1, 0, 2], + "col_B": [1.0, 0.0, 2.0], "col_C": ["C", "B", "A"], } ) From ca7fb2ea76f4af7ac7f2d67da73e8023286286ad Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 4 Dec 2024 19:26:19 +0100 Subject: [PATCH 3/3] Changelog entry --- 
.../upcoming_changes/sklearn.inspection/30409.api.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.inspection/30409.api.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.inspection/30409.api.rst b/doc/whats_new/upcoming_changes/sklearn.inspection/30409.api.rst new file mode 100644 index 0000000000000..cbbfe19a9b7cc --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.inspection/30409.api.rst @@ -0,0 +1,5 @@ +- :func:`inspection.partial_dependence` no longer accepts integer dtype for + numerical feature columns. Explicit conversion to floating point values is + now required before calling this tool (and preferably even before fitting the + model to inspect). + By :user:`Olivier Grisel <ogrisel>`