Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d27f138

Browse files
authored
Merge branch 'main' into fea/pdr-simultaneous-stable-sort
2 parents 86fac4b + caefdd4 commit d27f138

File tree

10 files changed

+103
-34
lines changed

10 files changed

+103
-34
lines changed

doc/whats_new/v1.2.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,10 @@ Changelog
416416
:pr:`18298` by :user:`Madhura Jayaratne <madhuracj>` and
417417
:user:`Guillaume Lemaitre <glemaitre>`.
418418

419+
- |Fix| :class:`inspection.DecisionBoundaryDisplay` now raises error if input
420+
data is not 2-dimensional.
421+
:pr:`25077` by :user:`Arturo Amor <ArturoAmorQ>`.
422+
419423
:mod:`sklearn.kernel_approximation`
420424
...................................
421425

@@ -709,6 +713,10 @@ Changelog
709713
- |Fix| :func:`utils.estimator_checks.check_estimator` now takes into account
710714
the `requires_positive_X` tag correctly. :pr:`24667` by `Thomas Fan`_.
711715

716+
- |Fix| :func:`utils.check_array` now supports Pandas Series with `pd.NA`
717+
by raising a better error message or returning a compatible `ndarray`.
718+
:pr:`25080` by `Thomas Fan`_.
719+
712720
- |API| The extra keyword parameters of :func:`utils.extmath.density` are deprecated
713721
and will be removed in 1.4.
714722
:pr:`24523` by :user:`Mia Bajic <clytaemnestra>`.

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,21 @@ def _check_categories(self, X):
270270
if missing.any():
271271
categories = categories[~missing]
272272

273+
if hasattr(self, "feature_names_in_"):
274+
feature_name = f"'{self.feature_names_in_[f_idx]}'"
275+
else:
276+
feature_name = f"at index {f_idx}"
277+
273278
if categories.size > self.max_bins:
274279
raise ValueError(
275-
f"Categorical feature at index {f_idx} is "
276-
"expected to have a "
277-
f"cardinality <= {self.max_bins}"
280+
f"Categorical feature {feature_name} is expected to "
281+
f"have a cardinality <= {self.max_bins}"
278282
)
279283

280284
if (categories >= self.max_bins).any():
281285
raise ValueError(
282-
f"Categorical feature at index {f_idx} is "
283-
"expected to be encoded with "
284-
f"values < {self.max_bins}"
286+
f"Categorical feature {feature_name} is expected to "
287+
f"be encoded with values < {self.max_bins}"
285288
)
286289
else:
287290
categories = None

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,20 +1141,32 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
11411141
@pytest.mark.parametrize(
11421142
"Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
11431143
)
1144-
def test_categorical_bad_encoding_errors(Est):
1144+
@pytest.mark.parametrize(
1145+
"use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
1146+
)
1147+
def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
11451148
# Test errors when categories are encoded incorrectly
11461149

11471150
gb = Est(categorical_features=[True], max_bins=2)
11481151

1149-
X = np.array([[0, 1, 2]]).T
1152+
if use_pandas:
1153+
pd = pytest.importorskip("pandas")
1154+
X = pd.DataFrame({"f0": [0, 1, 2]})
1155+
else:
1156+
X = np.array([[0, 1, 2]]).T
11501157
y = np.arange(3)
1151-
msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
1158+
msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2"
11521159
with pytest.raises(ValueError, match=msg):
11531160
gb.fit(X, y)
11541161

1155-
X = np.array([[0, 2]]).T
1162+
if use_pandas:
1163+
X = pd.DataFrame({"f0": [0, 2]})
1164+
else:
1165+
X = np.array([[0, 2]]).T
11561166
y = np.arange(2)
1157-
msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
1167+
msg = (
1168+
f"Categorical feature {feature_name} is expected to be encoded with values < 2"
1169+
)
11581170
with pytest.raises(ValueError, match=msg):
11591171
gb.fit(X, y)
11601172

sklearn/inspection/_plot/decision_boundary.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
from ...utils import check_matplotlib_support
77
from ...utils import _safe_indexing
88
from ...base import is_regressor
9-
from ...utils.validation import check_is_fitted, _is_arraylike_not_scalar
9+
from ...utils.validation import (
10+
check_is_fitted,
11+
_is_arraylike_not_scalar,
12+
_num_features,
13+
)
1014

1115

1216
def _check_boundary_response_method(estimator, response_method):
@@ -316,6 +320,12 @@ def from_estimator(
316320
f"Got {plot_method} instead."
317321
)
318322

323+
num_features = _num_features(X)
324+
if num_features != 2:
325+
raise ValueError(
326+
f"n_features must be equal to 2. Got {num_features} instead."
327+
)
328+
319329
x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1)
320330

321331
x0_min, x0_max = x0.min() - eps, x0.max() + eps

sklearn/inspection/_plot/tests/test_boundary_decision_display.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ def fitted_clf():
3838
return LogisticRegression().fit(X, y)
3939

4040

41+
def test_input_data_dimension():
42+
"""Check that we raise an error when `X` does not have exactly 2 features."""
43+
X, y = make_classification(n_samples=10, n_features=4, random_state=0)
44+
45+
clf = LogisticRegression().fit(X, y)
46+
msg = "n_features must be equal to 2. Got 4 instead."
47+
with pytest.raises(ValueError, match=msg):
48+
DecisionBoundaryDisplay.from_estimator(estimator=clf, X=X)
49+
50+
4151
def test_check_boundary_response_method_auto():
4252
"""Check _check_boundary_response_method behavior with 'auto'."""
4353

sklearn/linear_model/tests/test_common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def test_balance_property(model, with_sample_weight, global_random_seed):
9898
):
9999
pytest.skip("Estimator does not support sample_weight.")
100100

101-
rel = 1e-4 # test precision
101+
rel = 2e-4 # test precision
102102
if isinstance(model, SGDRegressor):
103103
rel = 1e-1
104104
elif hasattr(model, "solver") and model.solver == "saga":

sklearn/utils/_param_validation.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,6 @@ def validate_parameter_constraints(parameter_constraints, params, caller_name):
5050
caller_name : str
5151
The name of the estimator or function or method that called this function.
5252
"""
53-
if len(set(parameter_constraints) - set(params)) != 0:
54-
raise ValueError(
55-
f"The parameter constraints {list(parameter_constraints)}"
56-
" contain unexpected parameters"
57-
f" {set(parameter_constraints) - set(params)}"
58-
)
59-
6053
for param_name, param_val in params.items():
6154
# We allow parameters to not have a constraint so that third party estimators
6255
# can inherit from sklearn estimators without having to necessarily use the

sklearn/utils/tests/test_param_validation.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -453,20 +453,6 @@ def test_validate_params():
453453
_func(0, *[1, 2, 3], c="four", **{"e": 5})
454454

455455

456-
def test_validate_params_match_error():
457-
"""Check that an informative error is raised when there are constraints
458-
that have no matching function paramaters
459-
"""
460-
461-
@validate_params({"a": [int], "c": [int]})
462-
def func(a, b):
463-
pass
464-
465-
match = r"The parameter constraints .* contain unexpected parameters {'c'}"
466-
with pytest.raises(ValueError, match=match):
467-
func(1, 2)
468-
469-
470456
def test_validate_params_missing_params():
471457
"""Check that no error is raised when there are parameters without
472458
constraints
@@ -633,3 +619,22 @@ def test_cv_objects():
633619
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
634620
assert constraint.is_satisfied_by(None)
635621
assert not constraint.is_satisfied_by("not a CV object")
622+
623+
624+
def test_third_party_estimator():
625+
"""Check that the validation from a scikit-learn estimator inherited by a third
626+
party estimator does not impose a match between the dict of constraints and the
627+
parameters of the estimator.
628+
"""
629+
630+
class ThirdPartyEstimator(_Estimator):
631+
def __init__(self, b):
632+
self.b = b
633+
super().__init__(a=0)
634+
635+
def fit(self, X=None, y=None):
636+
super().fit(X, y)
637+
638+
# does not raise, even though "b" is not in the constraints dict and "a" is not
639+
# a parameter of the estimator.
640+
ThirdPartyEstimator(b=0).fit()

sklearn/utils/tests/test_validation.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,27 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
447447
check_array(X, force_all_finite=True)
448448

449449

450+
def test_check_array_panadas_na_support_series():
451+
"""Check check_array is correct with pd.NA in a series."""
452+
pd = pytest.importorskip("pandas")
453+
454+
X_int64 = pd.Series([1, 2, pd.NA], dtype="Int64")
455+
456+
msg = "Input contains NaN"
457+
with pytest.raises(ValueError, match=msg):
458+
check_array(X_int64, force_all_finite=True, ensure_2d=False)
459+
460+
X_out = check_array(X_int64, force_all_finite=False, ensure_2d=False)
461+
assert_allclose(X_out, [1, 2, np.nan])
462+
assert X_out.dtype == np.float64
463+
464+
X_out = check_array(
465+
X_int64, force_all_finite=False, ensure_2d=False, dtype=np.float32
466+
)
467+
assert_allclose(X_out, [1, 2, np.nan])
468+
assert X_out.dtype == np.float32
469+
470+
450471
def test_check_array_pandas_dtype_casting():
451472
# test that data-frames with homogeneous dtype are not upcast
452473
pd = pytest.importorskip("pandas")

sklearn/utils/validation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,13 @@ def check_array(
777777
if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
778778
dtype_orig = np.result_type(*dtypes_orig)
779779

780+
elif hasattr(array, "iloc") and hasattr(array, "dtype"):
781+
# array is a pandas series
782+
pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
783+
if pandas_requires_conversion:
784+
# Set to None, to convert to a np.dtype that works with array.dtype
785+
dtype_orig = None
786+
780787
if dtype_numeric:
781788
if dtype_orig is not None and dtype_orig.kind == "O":
782789
# if input is object, convert to float.

0 commit comments

Comments
 (0)