Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d27f138

Browse files
authored
Merge branch 'main' into fea/pdr-simultaneous-stable-sort
2 parents 86fac4b + caefdd4 commit d27f138

File tree

10 files changed

+103
-34
lines changed

10 files changed

+103
-34
lines changed

doc/whats_new/v1.2.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,10 @@ Changelog
416416
:pr:`18298` by :user:`Madhura Jayaratne <madhuracj>` and
417417
:user:`Guillaume Lemaitre <glemaitre>`.
418418

419+
- |Fix| :class:`inspection.DecisionBoundaryDisplay` now raises error if input
420+
data is not 2-dimensional.
421+
:pr:`25077` by :user:`Arturo Amor <ArturoAmorQ>`.
422+
419423
:mod:`sklearn.kernel_approximation`
420424
...................................
421425

@@ -709,6 +713,10 @@ Changelog
709713
- |Fix| :func:`utils.estimator_checks.check_estimator` now takes into account
710714
the `requires_positive_X` tag correctly. :pr:`24667` by `Thomas Fan`_.
711715

716+
- |Fix| :func:`utils.check_array` now supports Pandas Series with `pd.NA`
717+
by raising a better error message or returning a compatible `ndarray`.
718+
:pr:`25080` by `Thomas Fan`_.
719+
712720
- |API| The extra keyword parameters of :func:`utils.extmath.density` are deprecated
713721
and will be removed in 1.4.
714722
:pr:`24523` by :user:`Mia Bajic <clytaemnestra>`.

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,21 @@ def _check_categories(self, X):
270270
if missing.any():
271271
categories = categories[~missing]
272272

273+
if hasattr(self, "feature_names_in_"):
274+
feature_name = f"'{self.feature_names_in_[f_idx]}'"
275+
else:
276+
feature_name = f"at index {f_idx}"
277+
273278
if categories.size > self.max_bins:
274279
raise ValueError(
275-
f"Categorical feature at index {f_idx} is "
276-
"expected to have a "
277-
f"cardinality <= {self.max_bins}"
280+
f"Categorical feature {feature_name} is expected to "
281+
f"have a cardinality <= {self.max_bins}"
278282
)
279283

280284
if (categories >= self.max_bins).any():
281285
raise ValueError(
282-
f"Categorical feature at index {f_idx} is "
283-
"expected to be encoded with "
284-
f"values < {self.max_bins}"
286+
f"Categorical feature {feature_name} is expected to "
287+
f"be encoded with values < {self.max_bins}"
285288
)
286289
else:
287290
categories = None

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,20 +1141,32 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
11411141
@pytest.mark.parametrize(
11421142
"Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
11431143
)
1144-
def test_categorical_bad_encoding_errors(Est):
1144+
@pytest.mark.parametrize(
1145+
"use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
1146+
)
1147+
def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
11451148
# Test errors when categories are encoded incorrectly
11461149

11471150
gb = Est(categorical_features=[True], max_bins=2)
11481151

1149-
X = np.array([[0, 1, 2]]).T
1152+
if use_pandas:
1153+
pd = pytest.importorskip("pandas")
1154+
X = pd.DataFrame({"f0": [0, 1, 2]})
1155+
else:
1156+
X = np.array([[0, 1, 2]]).T
11501157
y = np.arange(3)
1151-
msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
1158+
msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2"
11521159
with pytest.raises(ValueError, match=msg):
11531160
gb.fit(X, y)
11541161

1155-
X = np.array([[0, 2]]).T
1162+
if use_pandas:
1163+
X = pd.DataFrame({"f0": [0, 2]})
1164+
else:
1165+
X = np.array([[0, 2]]).T
11561166
y = np.arange(2)
1157-
msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
1167+
msg = (
1168+
f"Categorical feature {feature_name} is expected to be encoded with values < 2"
1169+
)
11581170
with pytest.raises(ValueError, match=msg):
11591171
gb.fit(X, y)
11601172

sklearn/inspection/_plot/decision_boundary.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
from ...utils import check_matplotlib_support
77
from ...utils import _safe_indexing
88
from ...base import is_regressor
9-
from ...utils.validation import check_is_fitted, _is_arraylike_not_scalar
9+
from ...utils.validation import (
10+
check_is_fitted,
11+
_is_arraylike_not_scalar,
12+
_num_features,
13+
)
1014

1115

1216
def _check_boundary_response_method(estimator, response_method):
@@ -316,6 +320,12 @@ def from_estimator(
316320
f"Got {plot_method} instead."
317321
)
318322

323+
num_features = _num_features(X)
324+
if num_features != 2:
325+
raise ValueError(
326+
f"n_features must be equal to 2. Got {num_features} instead."
327+
)
328+
319329
x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1)
320330

321331
x0_min, x0_max = x0.min() - eps, x0.max() + eps

sklearn/inspection/_plot/tests/test_boundary_decision_display.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ def fitted_clf():
3838
return LogisticRegression().fit(X, y)
3939

4040

41+
def test_input_data_dimension():
42+
"""Check that we raise an error when `X` does not have exactly 2 features."""
43+
X, y = make_classification(n_samples=10, n_features=4, random_state=0)
44+
45+
clf = LogisticRegression().fit(X, y)
46+
msg = "n_features must be equal to 2. Got 4 instead."
47+
with pytest.raises(ValueError, match=msg):
48+
DecisionBoundaryDisplay.from_estimator(estimator=clf, X=X)
49+
50+
4151
def test_check_boundary_response_method_auto():
4252
"""Check _check_boundary_response_method behavior with 'auto'."""
4353

sklearn/linear_model/tests/test_common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def test_balance_property(model, with_sample_weight, global_random_seed):
9898
):
9999
pytest.skip("Estimator does not support sample_weight.")
100100

101-
rel = 1e-4 # test precision
101+
rel = 2e-4 # test precision
102102
if isinstance(model, SGDRegressor):
103103
rel = 1e-1
104104
elif hasattr(model, "solver") and model.solver == "saga":

sklearn/utils/_param_validation.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,6 @@ def validate_parameter_constraints(parameter_constraints, params, caller_name):
5050
caller_name : str
5151
The name of the estimator or function or method that called this function.
5252
"""
53-
if len(set(parameter_constraints) - set(params)) != 0:
54-
raise ValueError(
55-
f"The parameter constraints {list(parameter_constraints)}"
56-
" contain unexpected parameters"
57-
f" {set(parameter_constraints) - set(params)}"
58-
)
59-
6053
for param_name, param_val in params.items():
6154
# We allow parameters to not have a constraint so that third party estimators
6255
# can inherit from sklearn estimators without having to necessarily use the

sklearn/utils/tests/test_param_validation.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -453,20 +453,6 @@ def test_validate_params():
453453
_func(0, *[1, 2, 3], c="four", **{"e": 5})
454454

455455

456-
def test_validate_params_match_error():
457-
"""Check that an informative error is raised when there are constraints
458-
that have no matching function paramaters
459-
"""
460-
461-
@validate_params({"a": [int], "c": [int]})
462-
def func(a, b):
463-
pass
464-
465-
match = r"The parameter constraints .* contain unexpected parameters {'c'}"
466-
with pytest.raises(ValueError, match=match):
467-
func(1, 2)
468-
469-
470456
def test_validate_params_missing_params():
471457
"""Check that no error is raised when there are parameters without
472458
constraints
@@ -633,3 +619,22 @@ def test_cv_objects():
633619
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
634620
assert constraint.is_satisfied_by(None)
635621
assert not constraint.is_satisfied_by("not a CV object")
622+
623+
624+
def test_third_party_estimator():
625+
"""Check that the validation from a scikit-learn estimator inherited by a third
626+
party estimator does not impose a match between the dict of constraints and the
627+
parameters of the estimator.
628+
"""
629+
630+
class ThirdPartyEstimator(_Estimator):
631+
def __init__(self, b):
632+
self.b = b
633+
super().__init__(a=0)
634+
635+
def fit(self, X=None, y=None):
636+
super().fit(X, y)
637+
638+
# does not raise, even though "b" is not in the constraints dict and "a" is not
639+
# a parameter of the estimator.
640+
ThirdPartyEstimator(b=0).fit()

sklearn/utils/tests/test_validation.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,27 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
447447
check_array(X, force_all_finite=True)
448448

449449

450+
def test_check_array_panadas_na_support_series():
451+
"""Check check_array is correct with pd.NA in a series."""
452+
pd = pytest.importorskip("pandas")
453+
454+
X_int64 = pd.Series([1, 2, pd.NA], dtype="Int64")
455+
456+
msg = "Input contains NaN"
457+
with pytest.raises(ValueError, match=msg):
458+
check_array(X_int64, force_all_finite=True, ensure_2d=False)
459+
460+
X_out = check_array(X_int64, force_all_finite=False, ensure_2d=False)
461+
assert_allclose(X_out, [1, 2, np.nan])
462+
assert X_out.dtype == np.float64
463+
464+
X_out = check_array(
465+
X_int64, force_all_finite=False, ensure_2d=False, dtype=np.float32
466+
)
467+
assert_allclose(X_out, [1, 2, np.nan])
468+
assert X_out.dtype == np.float32
469+
470+
450471
def test_check_array_pandas_dtype_casting():
451472
# test that data-frames with homogeneous dtype are not upcast
452473
pd = pytest.importorskip("pandas")

sklearn/utils/validation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,13 @@ def check_array(
777777
if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
778778
dtype_orig = np.result_type(*dtypes_orig)
779779

780+
elif hasattr(array, "iloc") and hasattr(array, "dtype"):
781+
# array is a pandas series
782+
pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
783+
if pandas_requires_conversion:
784+
# Set to None, to convert to a np.dtype that works with array.dtype
785+
dtype_orig = None
786+
780787
if dtype_numeric:
781788
if dtype_orig is not None and dtype_orig.kind == "O":
782789
# if input is object, convert to float.

0 commit comments

Comments
 (0)