diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 1639b4b691c65..fba40e25a9e7e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,9 +38,20 @@ Changelog :pr:`123456` by :user:`Joe Bloggs <joeongithub>`. where 123456 is the *pull request* number, not the issue number. - -:mod:`sklearn.decomposition` -............................ +:mod:`sklearn.utils` +.................... + +- |Enhancement| :func:`utils.validation._check_sample_weight` can perform a + non-negativity check on the sample weights. It can be turned on + using the `only_non_negative` bool parameter. + Estimators that check for non-negative weights are updated: + :class:`linear_model.LinearRegression` (here the previous + error message was misleading), + :class:`ensemble.AdaBoostClassifier`, + :class:`ensemble.AdaBoostRegressor`, + :class:`neighbors.KernelDensity`. + :pr:`20880` by :user:`Guillaume Lemaitre <glemaitre>` + and :user:`András Simon <simonandras>`. Code and Documentation Contributors diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 77ef449ba1933..a47937880d91c 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -123,10 +123,10 @@ def fit(self, X, y, sample_weight=None): y_numeric=is_regressor(self), ) - sample_weight = _check_sample_weight(sample_weight, X, np.float64, copy=True) + sample_weight = _check_sample_weight( + sample_weight, X, np.float64, copy=True, only_non_negative=True + ) sample_weight /= sample_weight.sum() - if np.any(sample_weight < 0): - raise ValueError("sample_weight cannot contain negative weights") # Check parameters self._validate_estimator() @@ -136,7 +136,7 @@ def fit(self, X, y, sample_weight=None): self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) - # Initializion of the random number instance that will be used to + # Initialization of the random number instance that will be used to # 
generate a seed at each iteration random_state = check_random_state(self.random_state) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 6927d47c11cfe..159f83abf24c4 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -576,6 +576,6 @@ def test_adaboost_negative_weight_error(model, X, y): sample_weight = np.ones_like(y) sample_weight[-1] = -10 - err_msg = "sample_weight cannot contain negative weight" + err_msg = "Negative values in data passed to `sample_weight`" with pytest.raises(ValueError, match=err_msg): model.fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 79d6f321cb124..8b5102ecdd403 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -663,7 +663,9 @@ def fit(self, X, y, sample_weight=None): ) if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 328a13371bafd..0ac0ea7226b90 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -191,9 +191,9 @@ def fit(self, X, y=None, sample_weight=None): X = self._validate_data(X, order="C", dtype=DTYPE) if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, DTYPE) - if sample_weight.min() <= 0: - raise ValueError("sample_weight must have positive values") + sample_weight = _check_sample_weight( + sample_weight, X, DTYPE, only_non_negative=True + ) kwargs = self.metric_params if kwargs is None: diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 84f7623c8dbf1..d4fb775c44826 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ 
b/sklearn/neighbors/tests/test_kde.py @@ -209,7 +209,7 @@ def test_sample_weight_invalid(): data = np.reshape([1.0, 2.0, 3.0], (-1, 1)) sample_weight = [0.1, -0.2, 0.3] - expected_err = "sample_weight must have positive values" + expected_err = "Negative values in data passed to `sample_weight`" with pytest.raises(ValueError, match=expected_err): kde.fit(data, sample_weight=sample_weight) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index d1409d6129812..2cbbaac35a31b 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -52,8 +52,8 @@ FLOAT_DTYPES, _get_feature_names, _check_feature_names_in, + _check_fit_params, ) -from sklearn.utils.validation import _check_fit_params from sklearn.base import BaseEstimator import sklearn @@ -1253,6 +1253,14 @@ def test_check_sample_weight(): sample_weight = _check_sample_weight(None, X, dtype=X.dtype) assert sample_weight.dtype == np.float64 + # check negative weight when only_non_negative=True + X = np.ones((5, 2)) + sample_weight = np.ones(_num_samples(X)) + sample_weight[-1] = -10 + err_msg = "Negative values in data passed to `sample_weight`" + with pytest.raises(ValueError, match=err_msg): + _check_sample_weight(sample_weight, X, only_non_negative=True) + @pytest.mark.parametrize("toarray", [np.array, sp.csr_matrix, sp.csc_matrix]) def test_allclose_dense_sparse_equals(toarray): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 87f957b931073..d45f246b233f8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1492,7 +1492,9 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): return lambdas -def _check_sample_weight(sample_weight, X, dtype=None, copy=False): +def _check_sample_weight( + sample_weight, X, dtype=None, copy=False, only_non_negative=False +): """Validate sample weights. Note that passing sample_weight=None will output an array of ones. 
@@ -1503,17 +1505,22 @@ def _check_sample_weight(sample_weight, X, dtype=None, copy=False): Parameters ---------- sample_weight : {ndarray, Number or None}, shape (n_samples,) - Input sample weights. + Input sample weights. X : {ndarray, list, sparse matrix} Input data. + only_non_negative : bool, default=False + Whether or not the weights are expected to be non-negative. + + .. versionadded:: 1.1 + dtype : dtype, default=None - dtype of the validated `sample_weight`. - If None, and the input `sample_weight` is an array, the dtype of the - input is preserved; otherwise an array with the default numpy dtype - is be allocated. If `dtype` is not one of `float32`, `float64`, - `None`, the output will be of dtype `float64`. + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + will be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. copy : bool, default=False If True, a copy of sample_weight will be created. @@ -1521,7 +1528,7 @@ def _check_sample_weight(sample_weight, X, dtype=None, copy=False): Returns ------- sample_weight : ndarray of shape (n_samples,) - Validated sample weight. It is guaranteed to be "C" contiguous. + Validated sample weight. It is guaranteed to be "C" contiguous. """ n_samples = _num_samples(X) @@ -1553,6 +1560,9 @@ def _check_sample_weight(sample_weight, X, dtype=None, copy=False): ) ) + if only_non_negative: + check_non_negative(sample_weight, "`sample_weight`") + return sample_weight