From 542cd7be863e6b06968cee122ab58def22382517 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 1 Apr 2019 06:43:40 -0400
Subject: [PATCH 1/5] WIP

---
 sklearn/base.py                   |   3 +-
 sklearn/dummy.py                  |   6 +-
 sklearn/ensemble/base.py          |   3 +
 sklearn/utils/estimator_checks.py | 175 +++++++++++++++----------
 4 files changed, 96 insertions(+), 91 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 167baaf2b7ebd..4934e4ddeff69 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -25,7 +25,8 @@
     'stateless': False,
     'multilabel': False,
     '_skip_test': False,
-    'multioutput_only': False}
+    'multioutput_only': False,
+    'supports_sample_weight': False}
 
 
 def clone(estimator, safe=True):
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index b83712e37f1eb..3a803a98ae466 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -318,7 +318,8 @@ def predict_log_proba(self, X):
         return [np.log(p) for p in proba]
 
     def _more_tags(self):
-        return {'poor_score': True, 'no_validation': True}
+        return {'poor_score': True, 'no_validation': True,
+                'supports_sample_weight': True}
 
     def score(self, X, y, sample_weight=None):
         """Returns the mean accuracy on the given test data and labels.
@@ -510,7 +511,8 @@ def predict(self, X, return_std=False):
         return (y, y_std) if return_std else y
 
     def _more_tags(self):
-        return {'poor_score': True, 'no_validation': True}
+        return {'poor_score': True, 'no_validation': True,
+                'supports_sample_weight': True}
 
     def score(self, X, y, sample_weight=None):
         """Returns the coefficient of determination R^2 of the prediction.
diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py
index 7ac1dd4f72613..0ecf212b6e51f 100644
--- a/sklearn/ensemble/base.py
+++ b/sklearn/ensemble/base.py
@@ -147,6 +147,9 @@ def __iter__(self):
         """Returns iterator over estimators in the ensemble."""
         return iter(self.estimators_)
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 def _partition_estimators(n_estimators, n_jobs):
     """Private function used to partition estimators between jobs."""
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 73c98ea4685be..6c693f1afba3e 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -78,38 +78,40 @@ def _safe_tags(estimator, key=None):
 
 def _yield_checks(name, estimator):
     tags = _safe_tags(estimator)
-    yield check_estimators_dtypes
-    yield check_fit_score_takes_y
-    yield check_sample_weights_pandas_series
-    yield check_sample_weights_list
-    yield check_sample_weights_invariance
-    yield check_estimators_fit_returns_self
-    yield partial(check_estimators_fit_returns_self, readonly_memmap=True)
-
-    # Check that all estimator yield informative messages when
-    # trained on empty datasets
-    if not tags["no_validation"]:
-        yield check_complex_data
-        yield check_dtype_object
-        yield check_estimators_empty_data_messages
-
-    if name not in CROSS_DECOMPOSITION:
-        # cross-decomposition's "transform" returns X and Y
-        yield check_pipeline_consistency
-
-    if not tags["allow_nan"] and not tags["no_validation"]:
-        # Test that all estimators check their input for NaN's and infs
-        yield check_estimators_nan_inf
-
-    yield check_estimators_overwrite_params
-    if hasattr(estimator, 'sparsify'):
-        yield check_sparsify_coefficients
-
-    yield check_estimator_sparse_data
-
-    # Test that estimators can be pickled, and once pickled
-    # give the same answer as before.
-    yield check_estimators_pickle
+    # yield check_estimators_dtypes
+    # yield check_fit_score_takes_y
+    if tags['supports_sample_weight']:
+        print(name)
+        yield check_sample_weights_pandas_series
+        yield check_sample_weights_list
+        yield check_sample_weights_invariance
+    # yield check_estimators_fit_returns_self
+    # yield partial(check_estimators_fit_returns_self, readonly_memmap=True)
+
+    # # Check that all estimator yield informative messages when
+    # # trained on empty datasets
+    # if not tags["no_validation"]:
+    #     yield check_complex_data
+    #     yield check_dtype_object
+    #     yield check_estimators_empty_data_messages
+
+    # if name not in CROSS_DECOMPOSITION:
+    #     # cross-decomposition's "transform" returns X and Y
+    #     yield check_pipeline_consistency
+
+    # if not tags["allow_nan"] and not tags["no_validation"]:
+    #     # Test that all estimators check their input for NaN's and infs
+    #     yield check_estimators_nan_inf
+
+    # yield check_estimators_overwrite_params
+    # if hasattr(estimator, 'sparsify'):
+    #     yield check_sparsify_coefficients
+
+    # yield check_estimator_sparse_data
+
+    # # Test that estimators can be pickled, and once pickled
+    # # give the same answer as before.
+    # yield check_estimators_pickle
 
 
 def _yield_classifier_checks(name, classifier):
@@ -240,31 +242,31 @@ def _yield_all_checks(name, estimator):
     for check in _yield_checks(name, estimator):
         yield check
-    if is_classifier(estimator):
-        for check in _yield_classifier_checks(name, estimator):
-            yield check
-    if is_regressor(estimator):
-        for check in _yield_regressor_checks(name, estimator):
-            yield check
-    if hasattr(estimator, 'transform'):
-        for check in _yield_transformer_checks(name, estimator):
-            yield check
-    if isinstance(estimator, ClusterMixin):
-        for check in _yield_clustering_checks(name, estimator):
-            yield check
-    if is_outlier_detector(estimator):
-        for check in _yield_outliers_checks(name, estimator):
-            yield check
-    yield check_fit2d_predict1d
-    yield check_methods_subset_invariance
-    yield check_fit2d_1sample
-    yield check_fit2d_1feature
-    yield check_fit1d
-    yield check_get_params_invariance
-    yield check_set_params
-    yield check_dict_unchanged
-    yield check_dont_overwrite_parameters
-    yield check_fit_idempotent
+    # if is_classifier(estimator):
+    #     for check in _yield_classifier_checks(name, estimator):
+    #         yield check
+    # if is_regressor(estimator):
+    #     for check in _yield_regressor_checks(name, estimator):
+    #         yield check
+    # if hasattr(estimator, 'transform'):
+    #     for check in _yield_transformer_checks(name, estimator):
+    #         yield check
+    # if isinstance(estimator, ClusterMixin):
+    #     for check in _yield_clustering_checks(name, estimator):
+    #         yield check
+    # if is_outlier_detector(estimator):
+    #     for check in _yield_outliers_checks(name, estimator):
+    #         yield check
+    # yield check_fit2d_predict1d
+    # yield check_methods_subset_invariance
+    # yield check_fit2d_1sample
+    # yield check_fit2d_1feature
+    # yield check_fit1d
+    # yield check_get_params_invariance
+    # yield check_set_params
+    # yield check_dict_unchanged
+    # yield check_dont_overwrite_parameters
+    # yield check_fit_idempotent
 
 
 def check_estimator(Estimator):
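
Note (illustration, not part of the patch): the tag added to _DEFAULT_TAGS in
sklearn/base.py defaults to False and is read back through _safe_tags and
_get_tags, so only estimators that opt in get the three
check_sample_weights_* checks above. A minimal sketch with a hypothetical
estimator, assuming the _get_tags/_more_tags machinery of this branch:

    from sklearn.base import BaseEstimator

    class WeightedEstimator(BaseEstimator):
        """Hypothetical estimator opting in to the sample weight checks."""

        def fit(self, X, y, sample_weight=None):
            return self

        def _more_tags(self):
            # Overrides the False default added to _DEFAULT_TAGS above.
            return {'supports_sample_weight': True}

    # _get_tags() merges _DEFAULT_TAGS with every _more_tags() in the MRO,
    # so the three sample weight checks would now be yielded for this class.
    assert WeightedEstimator()._get_tags()['supports_sample_weight']
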
@@ -546,50 +548,47 @@ def check_sample_weights_pandas_series(name, estimator_orig):
     # check that estimators will accept a 'sample_weight' parameter of
     # type pandas.Series in the 'fit' function.
     estimator = clone(estimator_orig)
-    if has_fit_parameter(estimator, "sample_weight"):
+    try:
+        import pandas as pd
+        X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
+                      [2, 1], [2, 2], [2, 3], [2, 4]])
+        X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig))
+        y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2])
+        weights = pd.Series([1] * 8)
+        if _safe_tags(estimator, "multioutput_only"):
+            y = pd.DataFrame(y)
         try:
-            import pandas as pd
-            X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
-                          [2, 1], [2, 2], [2, 3], [2, 4]])
-            X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig))
-            y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2])
-            weights = pd.Series([1] * 8)
-            if _safe_tags(estimator, "multioutput_only"):
-                y = pd.DataFrame(y)
-            try:
-                estimator.fit(X, y, sample_weight=weights)
-            except ValueError:
-                raise ValueError("Estimator {0} raises error if "
-                                 "'sample_weight' parameter is of "
-                                 "type pandas.Series".format(name))
-        except ImportError:
-            raise SkipTest("pandas is not installed: not testing for "
-                           "input of type pandas.Series to class weight.")
+            estimator.fit(X, y, sample_weight=weights)
+        except ValueError:
+            raise ValueError("Estimator {0} raises error if "
+                             "'sample_weight' parameter is of "
+                             "type pandas.Series".format(name))
+    except ImportError:
+        raise SkipTest("pandas is not installed: not testing for "
+                       "input of type pandas.Series to class weight.")
 
 
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
 def check_sample_weights_list(name, estimator_orig):
     # check that estimators will accept a 'sample_weight' parameter of
     # type list in the 'fit' function.
-    if has_fit_parameter(estimator_orig, "sample_weight"):
-        estimator = clone(estimator_orig)
-        rnd = np.random.RandomState(0)
-        X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
-                                         estimator_orig)
-        y = np.arange(10) % 3
-        y = multioutput_estimator_convert_y_2d(estimator, y)
-        sample_weight = [3] * 10
-        # Test that estimators don't raise any exception
-        estimator.fit(X, y, sample_weight=sample_weight)
+    estimator = clone(estimator_orig)
+    rnd = np.random.RandomState(0)
+    X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
+                                     estimator_orig)
+    y = np.arange(10) % 3
+    y = multioutput_estimator_convert_y_2d(estimator, y)
+    sample_weight = [3] * 10
+    # Test that estimators don't raise any exception
+    estimator.fit(X, y, sample_weight=sample_weight)
 
 
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
 def check_sample_weights_invariance(name, estimator_orig):
     # check that the estimators yield same results for
     # unit weights and no weights
-    if (has_fit_parameter(estimator_orig, "sample_weight") and
-            not (hasattr(estimator_orig, "_pairwise")
-                 and estimator_orig._pairwise)):
+    if not (hasattr(estimator_orig, "_pairwise")
+            and estimator_orig._pairwise):
         # We skip pairwise because the data is not pairwise
         estimator1 = clone(estimator_orig)

From 3d98f7dd3f73fd76d3e8ad8e0a0194cce2924d96 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 2 Apr 2019 14:10:47 -0400
Subject: [PATCH 2/5] done ensemble + svm

---
 sklearn/ensemble/bagging.py                      | 14 ++++++++++++--
 sklearn/ensemble/gradient_boosting.py            | 16 +++++-----------
 sklearn/ensemble/tests/test_gradient_boosting.py | 12 ------------
 sklearn/ensemble/weight_boosting.py              |  2 +-
 sklearn/impute.py                                |  4 ++++
 sklearn/svm/base.py                              |  3 +++
 sklearn/svm/classes.py                           |  6 ++++++
 sklearn/tree/tree.py                             |  3 +++
 8 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py
index f21fbd81b9a8c..bd6ceda41f210 100644
--- a/sklearn/ensemble/bagging.py
+++ b/sklearn/ensemble/bagging.py
@@ -66,8 +66,7 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
     max_samples = ensemble._max_samples
     bootstrap = ensemble.bootstrap
     bootstrap_features = ensemble.bootstrap_features
-    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
-                                              "sample_weight")
+    support_sample_weight = ensemble._get_tags()['supports_sample_weight']
 
     if not support_sample_weight and sample_weight is not None:
         raise ValueError("The base estimator doesn't support sample weight")
@@ -427,6 +426,17 @@ def estimators_samples_(self):
         return [sample_indices
                 for _, sample_indices in self._get_estimators_indices()]
 
+    def _more_tags(self):
+        if self.base_estimator is None:
+            # base_estimator can be None in which case we use a decision tree,
+            # which accepts sample_weight
+            supports_sample_weight = True
+        else:
+            supports_sample_weight = (
+                self.base_estimator._get_tags()['supports_sample_weight'])
+
+        return {'supports_sample_weight': supports_sample_weight}
+
 
 class BaggingClassifier(BaseBagging, ClassifierMixin):
     """A Bagging classifier.
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 253cc60ea8ef2..2d2b2d0b55f7a 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1478,21 +1478,15 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K),
                                        dtype=np.float64)
         else:
-            # XXX clean this once we have a support_sample_weight tag
             if sample_weight_is_none:
                 self.init_.fit(X, y)
             else:
-                msg = ("The initial estimator {} does not support sample "
-                       "weights.".format(self.init_.__class__.__name__))
-                try:
-                    self.init_.fit(X, y, sample_weight=sample_weight)
-                except TypeError:  # regular estimator without SW support
+                if not self.init_._get_tags()['supports_sample_weight']:
+                    msg = ("The initial estimator {} does not "
+                           "support sample weights."
+                           .format(self.init_.__class__.__name__))
                     raise ValueError(msg)
-                except ValueError as e:
-                    if 'not enough values to unpack' in str(e):  # pipeline
-                        raise ValueError(msg) from e
-                    else:  # regular estimator whose input checking failed
-                        raise
+                self.init_.fit(X, y, sample_weight=sample_weight)
 
             raw_predictions = \
                 self.loss_.get_init_raw_predictions(X, self.init_)
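
Note (illustration, not part of the patch): with the gradient boosting change
above, an unsupported init estimator is rejected up front via its tag instead
of by probing fit and parsing exceptions. A sketch of the intended behaviour;
GaussianProcessRegressor is only used here as an example of an estimator
whose fit takes no sample_weight, so its tag keeps the False default:

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.gaussian_process import GaussianProcessRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(20, 3)
    y = X.sum(axis=1)

    gb = GradientBoostingRegressor(init=GaussianProcessRegressor())
    gb.fit(X, y)  # fine: sample_weight is None, init is fitted normally
    gb.fit(X, y, sample_weight=np.ones(20))  # ValueError from the new check
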
diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py
index 821124b73cb1c..a615981401a2c 100644
--- a/sklearn/ensemble/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/tests/test_gradient_boosting.py
@@ -1395,18 +1395,6 @@ def test_gradient_boosting_with_init_pipeline():
                        'weights'):
         gb.fit(X, y, sample_weight=np.ones(X.shape[0]))
 
-    # Passing sample_weight to a pipeline raises a ValueError. This test makes
-    # sure we make the distinction between ValueError raised by a pipeline that
-    # was passed sample_weight, and a ValueError raised by a regular estimator
-    # whose input checking failed.
-    with pytest.raises(
-            ValueError,
-            match='nu <= 0 or nu > 1'):
-        # Note that NuSVR properly supports sample_weight
-        init = NuSVR(gamma='auto', nu=1.5)
-        gb = GradientBoostingRegressor(init=init)
-        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))
-
 
 @pytest.mark.parametrize('estimator, missing_method', [
     (GradientBoostingClassifier(init=LinearSVC()), 'predict_proba'),
diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py
index 6e13b7bd80ae2..e259db04b6451 100644
--- a/sklearn/ensemble/weight_boosting.py
+++ b/sklearn/ensemble/weight_boosting.py
@@ -440,7 +440,7 @@ def _validate_estimator(self):
                 "probabilities with a predict_proba method.\n"
                 "Please change the base estimator or set "
                 "algorithm='SAMME' instead.")
-        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
+        if not self.base_estimator_._get_tags()['supports_sample_weight']:
             raise ValueError("%s doesn't support sample_weight."
                              % self.base_estimator_.__class__.__name__)
 
diff --git a/sklearn/impute.py b/sklearn/impute.py
index ea4e8663d0313..3263d28af3451 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -1139,11 +1139,15 @@ def _get_missing_features_info(self, X):
             # The imputer mask will be constructed with the same sparse format
             # as X.
+            print(mask)
             sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
                                   else sparse.csc_matrix)
             imputer_mask = sparse_constructor(
                 (mask, X.indices.copy(), X.indptr.copy()),
                 shape=X.shape, dtype=bool)
+            print(imputer_mask.todense())
+            # imputer_mask.eliminate_zeros()
+            print(imputer_mask.todense())
 
             missing_values_mask = imputer_mask.copy()
             missing_values_mask.eliminate_zeros()
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index effb0dcd12504..f2ce7b4b01dcc 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -495,6 +495,9 @@ def coef_(self):
     def _get_coef(self):
         return safe_sparse_dot(self._dual_coef_, self.support_vectors_)
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class BaseSVC(BaseLibSVM, ClassifierMixin, metaclass=ABCMeta):
     """ABC for LibSVM-based classifiers."""
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index e73ca2e559bb6..85401d69d0d63 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -244,6 +244,9 @@ def fit(self, X, y, sample_weight=None):
 
         return self
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class LinearSVR(LinearModel, RegressorMixin):
     """Linear Support Vector Regression.
@@ -425,6 +428,9 @@ def fit(self, X, y, sample_weight=None):
 
         return self
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class SVC(BaseSVC):
     """C-Support Vector Classification.
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index a07e6a0ca5d9a..9a2438d863706 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -526,6 +526,9 @@ def feature_importances_(self):
 
         return self.tree_.compute_feature_importances()
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 # =============================================================================
 # Public estimators

From 5f1ed2b9a13640a80c23e28c7de9958aef7b5d97 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 2 Apr 2019 14:13:22 -0400
Subject: [PATCH 3/5] reverted unwanted changes

---
 sklearn/impute.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 3263d28af3451..ea4e8663d0313 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -1139,15 +1139,11 @@ def _get_missing_features_info(self, X):
             # The imputer mask will be constructed with the same sparse format
             # as X.
-            print(mask)
             sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
                                   else sparse.csc_matrix)
             imputer_mask = sparse_constructor(
                 (mask, X.indices.copy(), X.indptr.copy()),
                 shape=X.shape, dtype=bool)
-            print(imputer_mask.todense())
-            # imputer_mask.eliminate_zeros()
-            print(imputer_mask.todense())
 
             missing_values_mask = imputer_mask.copy()
             missing_values_mask.eliminate_zeros()

From bc61c5f475171afb15c75b45ab471af210a69e20 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 3 Apr 2019 14:29:44 -0400
Subject: [PATCH 4/5] more on this

---
 sklearn/calibration.py                      | 11 +++++++++++
 sklearn/cluster/dbscan_.py                  |  3 +++
 sklearn/cluster/k_means_.py                 |  2 ++
 sklearn/compose/_target.py                  | 12 +++++++++++-
 sklearn/ensemble/bagging.py                 |  2 +-
 sklearn/ensemble/voting_classifier.py       | 17 +++++++++++++----
 sklearn/ensemble/weight_boosting.py         |  1 -
 sklearn/isotonic.py                         |  3 ++-
 sklearn/kernel_ridge.py                     |  3 +++
 sklearn/linear_model/base.py                |  3 +++
 sklearn/linear_model/bayes.py               |  3 +++
 sklearn/linear_model/huber.py               |  3 +++
 sklearn/linear_model/logistic.py            |  6 ++++++
 sklearn/linear_model/perceptron.py          |  3 +++
 sklearn/linear_model/ransac.py              | 18 +++++++++++++-----
 sklearn/linear_model/ridge.py               |  6 ++++++
 sklearn/linear_model/stochastic_gradient.py |  6 ++++++
 sklearn/multioutput.py                      | 11 +++++++----
 sklearn/naive_bayes.py                      |  3 +++
 sklearn/neighbors/kde.py                    |  3 +++
 sklearn/utils/estimator_checks.py           | 10 +++++++++-
 21 files changed, 111 insertions(+), 18 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index f84cbb328370f..fd04eea0d847a 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -199,6 +199,17 @@ def fit(self, X, y, sample_weight=None):
 
         return self
 
+    def _more_tags(self):
+        if self.base_estimator is None:
+            # base_estimator can be None in which case we use LinearSVC
+            # which accepts sample_weight
+            supports_sample_weight = True
+        else:
+            supports_sample_weight = (
+                self.base_estimator._get_tags()['supports_sample_weight'])
+
+        return {'supports_sample_weight': supports_sample_weight}
+
     def predict_proba(self, X):
         """Posterior probabilities of classification
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index f21beb3f91453..afcd98e85a60c 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -367,3 +367,6 @@ def fit_predict(self, X, y=None, sample_weight=None):
         """
         self.fit(X, sample_weight=sample_weight)
         return self.labels_
+
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
\ No newline at end of file
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index 042e6990b5df1..7aae4551b258c 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -1103,6 +1103,8 @@ def score(self, X, y=None, sample_weight=None):
         return -_labels_inertia(X, sample_weight, x_squared_norms,
                                 self.cluster_centers_)[1]
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
 
 def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
                      old_center_buffer, compute_squared_diff,
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index 5213605defd30..7c0bd9a72cbaf 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -235,4 +235,14 @@ def predict(self, X):
         return pred_trans
 
     def _more_tags(self):
-        return {'poor_score': True, 'no_validation': True}
+        if self.regressor is None:
+            # regressor can be None in which case we use LinearRegression
+            # which accepts sample_weight
+            supports_sample_weight = True
+        else:
+            supports_sample_weight = (
+                self.regressor._get_tags()['supports_sample_weight'])
+
+        return {'poor_score': True, 'no_validation': True,
+                'supports_sample_weight': supports_sample_weight}
+
diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py
index bd6ceda41f210..9c00a54f78f5c 100644
--- a/sklearn/ensemble/bagging.py
+++ b/sklearn/ensemble/bagging.py
@@ -20,7 +20,7 @@
 from ..utils.metaestimators import if_delegate_has_method
 from ..utils.multiclass import check_classification_targets
 from ..utils.random import sample_without_replacement
-from ..utils.validation import has_fit_parameter, check_is_fitted
+from ..utils.validation import check_is_fitted
 
 
 __all__ = ["BaggingClassifier",
diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py
index 63e0ee94a97b5..2d41ba5677173 100644
--- a/sklearn/ensemble/voting_classifier.py
+++ b/sklearn/ensemble/voting_classifier.py
@@ -18,7 +18,7 @@
 from ..base import clone
 from ..preprocessing import LabelEncoder
 from ..utils._joblib import Parallel, delayed
-from ..utils.validation import has_fit_parameter, check_is_fitted
+from ..utils.validation import check_is_fitted
 from ..utils.metaestimators import _BaseComposition
 from ..utils import Bunch
 
@@ -176,10 +176,11 @@ def fit(self, X, y, sample_weight=None):
                              % (len(self.weights), len(self.estimators)))
 
         if sample_weight is not None:
-            for name, step in self.estimators:
-                if not has_fit_parameter(step, 'sample_weight'):
+            for name, est in self.estimators:
+                if not est._get_tags()['supports_sample_weight']:
                     raise ValueError('Underlying estimator \'%s\' does not'
-                                     ' support sample weights.' % name)
+                                     ' support sample weights.' %
+                                     est.__class__.__name__)
 
         names, clfs = zip(*self.estimators)
         self._validate_names(names)
@@ -343,3 +344,11 @@ def get_params(self, deep=True):
     def _predict(self, X):
         """Collect results from clf.predict calls. """
""" return np.asarray([clf.predict(X) for clf in self.estimators_]).T + + def _more_tags(self): + supports_sample_weight = all( + est._get_tags()['supports_sample_weight'] + for est in self.estimators + ) + + return {'supports_sample_weight': supports_sample_weight} diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index e259db04b6451..bf563254d48bd 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -37,7 +37,6 @@ from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score from ..utils.validation import check_is_fitted -from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples __all__ = [ diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 3b8f74a946699..216d18c845665 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -407,4 +407,5 @@ def __setstate__(self, state): self._build_f(self._necessary_X_, self._necessary_y_) def _more_tags(self): - return {'X_types': ['1darray']} + return {'X_types': ['1darray'], + 'supports_sample_weight': True} diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index aeb5fd45f413f..bd657d52da8fd 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -192,3 +192,6 @@ def predict(self, X): check_is_fitted(self, ["X_fit_", "dual_coef_"]) K = self._get_kernel(X, self.X_fit_) return np.dot(K, self.dual_coef_) + + def _more_tags(self): + return {'supports_sample_weight': True} diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 54083fee1e904..c210d3ae3e1dc 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -508,6 +508,9 @@ def rmatvec(b): self._set_intercept(X_offset, y_offset, X_scale) return self + def _more_tags(self): + return {'supports_sample_weight': True} + def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy, check_input=True): diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py index 1ff59aa313b99..0d75a50667c79 100644 --- a/sklearn/linear_model/bayes.py +++ b/sklearn/linear_model/bayes.py @@ -359,6 +359,9 @@ def _log_marginal_likelihood(self, n_samples, n_features, eigen_vals, return score + def _more_tags(self): + return {'supports_sample_weight': True} + ############################################################################### # ARD (Automatic Relevance Determination) regression diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 285913684832f..7b7443f81397e 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -305,3 +305,6 @@ def fit(self, X, y, sample_weight=None): y - safe_sparse_dot(X, self.coef_) - self.intercept_) self.outliers_ = residual > self.scale_ * self.epsilon return self + + def _more_tags(self): + return {'supports_sample_weight': True} \ No newline at end of file diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index be664d5b5c087..af3e4240558ed 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1680,6 +1680,9 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) + def _more_tags(self): + return {'supports_sample_weight': True} + class LogisticRegressionCV(LogisticRegression, BaseEstimator, LinearClassifierMixin): @@ -2260,3 +2263,6 @@ def score(self, X, y, sample_weight=None): scoring = get_scorer(scoring) return scoring(self, X, y, sample_weight=sample_weight) + + def _more_tags(self): + return 
diff --git a/sklearn/linear_model/perceptron.py b/sklearn/linear_model/perceptron.py
index 2bf7899069864..393c414d6d60d 100644
--- a/sklearn/linear_model/perceptron.py
+++ b/sklearn/linear_model/perceptron.py
@@ -153,3 +153,6 @@ def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, power_t=0.5,
             warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs)
+
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py
index 0205b75df55cf..4e3bdf3997cb9 100644
--- a/sklearn/linear_model/ransac.py
+++ b/sklearn/linear_model/ransac.py
@@ -13,7 +13,6 @@
 from ..utils.random import sample_without_replacement
 from ..utils.validation import check_is_fitted
 from .base import LinearRegression
-from ..utils.validation import has_fit_parameter
 from ..exceptions import ConvergenceWarning
 
 _EPSILON = np.spacing(1)
@@ -316,11 +315,9 @@ def fit(self, X, y, sample_weight=None):
             except ValueError:
                 pass
 
-        estimator_fit_has_sample_weight = has_fit_parameter(base_estimator,
-                                                            "sample_weight")
+        supports_sample_weight = self._get_tags()['supports_sample_weight']
         estimator_name = type(base_estimator).__name__
-        if (sample_weight is not None and not
-                estimator_fit_has_sample_weight):
+        if sample_weight is not None and not supports_sample_weight:
             raise ValueError("%s does not support sample_weight. Samples"
                              " weights are only used for the calibration"
                              " itself." % estimator_name)
@@ -492,3 +489,14 @@ def score(self, X, y):
         check_is_fitted(self, 'estimator_')
 
         return self.estimator_.score(X, y)
+
+    def _more_tags(self):
+        if self.base_estimator is None:
+            # base_estimator can be None in which case we use LinearRegression
+            # which accepts sample_weight
+            supports_sample_weight = True
+        else:
+            supports_sample_weight = (
+                self.base_estimator._get_tags()['supports_sample_weight'])
+
+        return {'supports_sample_weight': supports_sample_weight}
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index e1fc9b42438e4..35bc43892d3e0 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -572,6 +572,9 @@ def fit(self, X, y, sample_weight=None):
 
         return self
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class Ridge(_BaseRidge, RegressorMixin):
     """Linear least squares with l2 regularization.
@@ -1223,6 +1226,9 @@ def fit(self, X, y, sample_weight=None):
 
         return self
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class RidgeCV(_BaseRidgeCV, RegressorMixin):
     """Ridge regression with built-in cross-validation.
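
Note (illustration, not part of the patch): RANSACRegressor follows the same
delegate-or-default pattern as Bagging and CalibratedClassifierCV above: a
None base_estimator means the LinearRegression default, which supports
sample_weight, otherwise the wrapped estimator's own tag is consulted:

    from sklearn.linear_model import RANSACRegressor
    from sklearn.tree import DecisionTreeRegressor

    RANSACRegressor()._get_tags()['supports_sample_weight']  # True (default)
    RANSACRegressor(DecisionTreeRegressor())._get_tags()[
        'supports_sample_weight']  # True, via the tree.py change in PATCH 2/5
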
diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
index 3e33e59588117..b71eeaee4cade 100644
--- a/sklearn/linear_model/stochastic_gradient.py
+++ b/sklearn/linear_model/stochastic_gradient.py
@@ -1048,6 +1048,9 @@ def predict_log_proba(self):
     def _predict_log_proba(self, X):
         return np.log(self.predict_proba(X))
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class BaseSGDRegressor(BaseSGD, RegressorMixin):
 
@@ -1526,3 +1529,6 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change,
             warm_start=warm_start, average=average)
+
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py
index 0da22e5e570d7..7d93d1facc9ed 100644
--- a/sklearn/multioutput.py
+++ b/sklearn/multioutput.py
@@ -23,7 +23,7 @@
 from .utils import check_array, check_X_y, check_random_state
 from .utils.fixes import parallel_helper
 from .utils.metaestimators import if_delegate_has_method
-from .utils.validation import check_is_fitted, has_fit_parameter
+from .utils.validation import check_is_fitted
 from .utils.multiclass import check_classification_targets
 from .utils._joblib import Parallel, delayed
 
@@ -106,7 +106,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
                              "multi-output regression but has only one.")
 
         if (sample_weight is not None and
-                not has_fit_parameter(self.estimator, 'sample_weight')):
+                not self.estimator._get_tags()['supports_sample_weight']):
             raise ValueError("Underlying estimator does not support"
                              " sample weights.")
 
@@ -159,7 +159,7 @@ def fit(self, X, y, sample_weight=None):
                              "multi-output regression but has only one.")
 
         if (sample_weight is not None and
-                not has_fit_parameter(self.estimator, 'sample_weight')):
+                not self.estimator._get_tags()['supports_sample_weight']):
             raise ValueError("Underlying estimator does not support"
                              " sample weights.")
 
@@ -197,7 +197,10 @@ def predict(self, X):
         return np.asarray(y).T
 
     def _more_tags(self):
-        return {'multioutput_only': True}
+        supports_sample_weight = (
+            self.estimator._get_tags()['supports_sample_weight'])
+        return {'multioutput_only': True,
+                'supports_sample_weight': supports_sample_weight}
 
 
 class MultiOutputRegressor(MultiOutputEstimator, RegressorMixin):
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 5d18327c8a261..1f549d8c07804 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -102,6 +102,9 @@ def predict_proba(self, X):
         """
         return np.exp(self.predict_log_proba(X))
 
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
+
 
 class GaussianNB(BaseNB):
     """
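
Note (illustration, not part of the patch): MultiOutputRegressor above
simply inherits the tag from its wrapped estimator, which is what the
fit/partial_fit guards rely on:

    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.linear_model import Ridge
    from sklearn.gaussian_process import GaussianProcessRegressor

    MultiOutputRegressor(Ridge())._get_tags()['supports_sample_weight']
    # True; with GaussianProcessRegressor (no sample_weight in fit) the same
    # expression is False, and passing sample_weight to the wrapper raises
    # the ValueError shown above.
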
diff --git a/sklearn/neighbors/kde.py b/sklearn/neighbors/kde.py
index be5002e579423..8c7539ae8b77c 100644
--- a/sklearn/neighbors/kde.py
+++ b/sklearn/neighbors/kde.py
@@ -244,3 +244,6 @@ def sample(self, n_samples=1, random_state=None):
         correction = (gammainc(0.5 * dim, 0.5 * s_sq) ** (1. / dim)
                       * self.bandwidth / np.sqrt(s_sq))
         return data[i] + X * correction[:, np.newaxis]
+
+    def _more_tags(self):
+        return {'supports_sample_weight': True}
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 6c693f1afba3e..acf6472bcb663 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -81,7 +81,6 @@ def _yield_checks(name, estimator):
     # yield check_estimators_dtypes
     # yield check_fit_score_takes_y
     if tags['supports_sample_weight']:
-        print(name)
         yield check_sample_weights_pandas_series
         yield check_sample_weights_list
         yield check_sample_weights_invariance
@@ -267,6 +266,7 @@ def _yield_all_checks(name, estimator):
     # yield check_dict_unchanged
     # yield check_dont_overwrite_parameters
     # yield check_fit_idempotent
+    yield check_supports_sample_weight_tag
 
 
 def check_estimator(Estimator):
@@ -2467,3 +2467,11 @@ def check_fit_idempotent(name, estimator_orig):
         if hasattr(estimator, method):
             new_result = getattr(estimator, method)(X_test)
             assert_allclose_dense_sparse(result[method], new_result)
+
+
+def check_supports_sample_weight_tag(name, estimator_orig):
+    # Make sure that the supports_sample_weight tag is correct
+
+    estimator = clone(estimator_orig)
+
+    assert (has_fit_parameter(estimator, 'sample_weight') ==
+            estimator._get_tags()['supports_sample_weight'])

From 7318f48f400b38be870eb512ed67660ff26a6eea Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 3 Apr 2019 15:04:06 -0400
Subject: [PATCH 5/5] uncommented rest of the checks

---
 sklearn/utils/estimator_checks.py | 96 +++++++++++++++----------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index acf6472bcb663..7841f09b81c6d 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -78,39 +78,39 @@ def _safe_tags(estimator, key=None):
 
 def _yield_checks(name, estimator):
     tags = _safe_tags(estimator)
-    # yield check_estimators_dtypes
-    # yield check_fit_score_takes_y
+    yield check_estimators_dtypes
+    yield check_fit_score_takes_y
     if tags['supports_sample_weight']:
         yield check_sample_weights_pandas_series
         yield check_sample_weights_list
         yield check_sample_weights_invariance
-    # yield check_estimators_fit_returns_self
-    # yield partial(check_estimators_fit_returns_self, readonly_memmap=True)
+    yield check_estimators_fit_returns_self
+    yield partial(check_estimators_fit_returns_self, readonly_memmap=True)
 
-    # # Check that all estimator yield informative messages when
-    # # trained on empty datasets
-    # if not tags["no_validation"]:
-    #     yield check_complex_data
-    #     yield check_dtype_object
-    #     yield check_estimators_empty_data_messages
+    # Check that all estimator yield informative messages when
+    # trained on empty datasets
+    if not tags["no_validation"]:
+        yield check_complex_data
+        yield check_dtype_object
+        yield check_estimators_empty_data_messages
 
-    # if name not in CROSS_DECOMPOSITION:
-    #     # cross-decomposition's "transform" returns X and Y
-    #     yield check_pipeline_consistency
+    if name not in CROSS_DECOMPOSITION:
+        # cross-decomposition's "transform" returns X and Y
+        yield check_pipeline_consistency
 
-    # if not tags["allow_nan"] and not tags["no_validation"]:
-    #     # Test that all estimators check their input for NaN's and infs
-    #     yield check_estimators_nan_inf
+    if not tags["allow_nan"] and not tags["no_validation"]:
+        # Test that all estimators check their input for NaN's and infs
+        yield check_estimators_nan_inf
 
-    # yield check_estimators_overwrite_params
-    # if hasattr(estimator, 'sparsify'):
-    #     yield check_sparsify_coefficients
+    yield check_estimators_overwrite_params
+    if hasattr(estimator, 'sparsify'):
+        yield check_sparsify_coefficients
 
-    # yield check_estimator_sparse_data
+    yield check_estimator_sparse_data
 
-    # # Test that estimators can be pickled, and once pickled
-    # # give the same answer as before.
-    # yield check_estimators_pickle
+    # Test that estimators can be pickled, and once pickled
+    # give the same answer as before.
+    yield check_estimators_pickle
 
 
 def _yield_classifier_checks(name, classifier):
@@ -241,31 +241,31 @@ def _yield_all_checks(name, estimator):
     for check in _yield_checks(name, estimator):
         yield check
-    # if is_classifier(estimator):
-    #     for check in _yield_classifier_checks(name, estimator):
-    #         yield check
-    # if is_regressor(estimator):
-    #     for check in _yield_regressor_checks(name, estimator):
-    #         yield check
-    # if hasattr(estimator, 'transform'):
-    #     for check in _yield_transformer_checks(name, estimator):
-    #         yield check
-    # if isinstance(estimator, ClusterMixin):
-    #     for check in _yield_clustering_checks(name, estimator):
-    #         yield check
-    # if is_outlier_detector(estimator):
-    #     for check in _yield_outliers_checks(name, estimator):
-    #         yield check
-    # yield check_fit2d_predict1d
-    # yield check_methods_subset_invariance
-    # yield check_fit2d_1sample
-    # yield check_fit2d_1feature
-    # yield check_fit1d
-    # yield check_get_params_invariance
-    # yield check_set_params
-    # yield check_dict_unchanged
-    # yield check_dont_overwrite_parameters
-    # yield check_fit_idempotent
+    if is_classifier(estimator):
+        for check in _yield_classifier_checks(name, estimator):
+            yield check
+    if is_regressor(estimator):
+        for check in _yield_regressor_checks(name, estimator):
+            yield check
+    if hasattr(estimator, 'transform'):
+        for check in _yield_transformer_checks(name, estimator):
+            yield check
+    if isinstance(estimator, ClusterMixin):
+        for check in _yield_clustering_checks(name, estimator):
+            yield check
+    if is_outlier_detector(estimator):
+        for check in _yield_outliers_checks(name, estimator):
+            yield check
+    yield check_fit2d_predict1d
+    yield check_methods_subset_invariance
+    yield check_fit2d_1sample
+    yield check_fit2d_1feature
+    yield check_fit1d
+    yield check_get_params_invariance
+    yield check_set_params
+    yield check_dict_unchanged
+    yield check_dont_overwrite_parameters
+    yield check_fit_idempotent
     yield check_supports_sample_weight_tag
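
Note (illustration, not part of the series): after PATCH 5/5 the full check
battery is restored, with the three check_sample_weights_* checks gated on
the tag and check_supports_sample_weight_tag run for every estimator. A
sketch of how the series is exercised end to end:

    from sklearn.linear_model import Ridge
    from sklearn.utils.estimator_checks import check_estimator

    # Runs the sample weight checks because Ridge._more_tags (PATCH 4/5)
    # sets the tag; check_supports_sample_weight_tag then asserts that the
    # tag agrees with has_fit_parameter(estimator, 'sample_weight').
    check_estimator(Ridge)
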