From 593d3de12437665dfcb4bf343296dde8654b9e13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 13 Nov 2019 16:21:49 +0100 Subject: [PATCH 001/448] DOC add min version for conda and example of flag set by conda (#15619) --- doc/developers/advanced_installation.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index f5b5f18521e34..bcb305116676d 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -225,9 +225,9 @@ to enable OpenMP support: macOS compilers from conda-forge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you use the conda package manager, you can install the ``compilers`` -meta-package from the conda-forge channel, which provides OpenMP-enabled C/C++ -compilers based on the llvm toolchain. +If you use the conda package manager (version >= 4.7), you can install the +``compilers`` meta-package from the conda-forge channel, which provides +OpenMP-enabled C/C++ compilers based on the llvm toolchain. First install the macOS command line tools:: @@ -264,7 +264,8 @@ variables:: echo $LDFLAGS They point to files and folders from your ``sklearn-dev`` conda environment -(in particular in the bin/, include/ and lib/ subfolders). +(in particular in the bin/, include/ and lib/ subfolders). For instance +``-L/path/to/conda/envs/sklearn-dev/lib`` should appear in ``LDFLAGS``. In the log, you should see the compiled extension being built with the clang and clang++ compilers installed by conda with the ``-fopenmp`` command line From bbd1964c532d9cded51b14f42362b55d1fd8cd6f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 13 Nov 2019 17:43:05 +0100 Subject: [PATCH 002/448] DOC Fix 'Title underline too short' sphinx warning. (#15620) --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 46aef104418ed..da1263228c3fa 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -744,7 +744,7 @@ Changelog :pr:`14336` by :user:`Gregory Dexter `. :mod:`sklearn.model_selection` -.................. +.............................. - |Fix| :class:`model_selection.GridSearchCV` and `model_selection.RandomizedSearchCV` now supports the From 859953023f16c3736a8364615fdf37ab9a322a5c Mon Sep 17 00:00:00 2001 From: Mina Naghshhnejad Date: Wed, 13 Nov 2019 12:10:38 -0800 Subject: [PATCH 003/448] DOC CountVectorizer documentation update (#15498) --- sklearn/feature_extraction/text.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index dd293531184c3..1da4c96a95ea8 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -876,14 +876,15 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): ngram_range : tuple (min_n, max_n), default=(1, 1) The lower and upper boundary of the range of n-values for different - n-grams to be extracted. All values of n such that min_n <= n <= max_n - will be used. For example an ``ngram_range`` of ``(1, 1)`` means only - unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means - only bigrams. + word n-grams or char n-grams to be extracted. All values of n such + such that min_n <= n <= max_n will be used. 
For example an + ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means + unigrams and bigrams, and ``(2, 2)`` means only bigrams. Only applies if ``analyzer is not callable``. analyzer : string, {'word', 'char', 'char_wb'} or callable - Whether the feature should be made of word or character n-grams. + Whether the feature should be made of word n-gram or character + n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. @@ -969,6 +970,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [0 2 0 1 0 1 1 0 1] [1 0 0 1 1 0 1 1 1] [0 1 1 1 0 0 1 0 1]] + >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) + >>> X2 = vectorizer2.fit_transform(corpus) + >>> print(vectorizer2.get_feature_names()) + ['and this', 'document is', 'first document', 'is the', 'is this', + 'second document', 'the first', 'the second', 'the third', 'third one', + 'this document', 'this is', 'this the'] + >>> print(X2.toarray()) + [[0 0 1 1 0 0 1 0 0 0 0 1 0] + [0 1 0 1 0 1 0 1 0 0 1 0 0] + [1 0 0 1 0 0 0 0 1 1 0 1 0] + [0 0 1 0 1 0 1 0 0 0 0 0 1]] See also -------- From 25e72d341e37fe294a28b29540fa0768c13f8b04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Nov 2019 04:52:05 -0500 Subject: [PATCH 004/448] DOC fix early stopping doc in HGBT (#15623) --- doc/modules/ensemble.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ec7b337a20593..c2897ed518509 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -895,12 +895,13 @@ generally recommended to use as many bins as possible, which is the default. The ``l2_regularization`` parameter is a regularizer on the loss function and corresponds to :math:`\lambda` in equation (2) of [XGBoost]_. -Note that **early-stopping is enabled by default**. The early-stopping -behaviour is controlled via the ``scoring``, ``validation_fraction``, -``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop -using an arbitrary :term:`scorer`, or just the training or validation loss. By -default, early-stopping is performed using the default :term:`scorer` of -the estimator on a validation set. +The early-stopping behaviour is controlled via the ``scoring``, +``validation_fraction``, ``n_iter_no_change``, and ``tol`` parameters. It is +possible to early-stop using an arbitrary :term:`scorer`, or just the +training or validation loss. By default, early-stopping is performed using +the default :term:`scorer` of the estimator on a validation set but it is +also possible to perform early-stopping based on the loss value, which is +significantly faster. 
Missing values support ---------------------- From e650a207efc9dd33556b1b9678b043f73a18aecb Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 Nov 2019 05:43:54 -0800 Subject: [PATCH 005/448] ENH Adds plot_confusion matrix (#15083) --- doc/modules/classes.rst | 2 + doc/modules/model_evaluation.rst | 6 +- doc/visualizations.rst | 2 + .../plot_digits_classification.py | 27 ++- .../model_selection/plot_confusion_matrix.py | 78 +----- sklearn/metrics/__init__.py | 5 + sklearn/metrics/_plot/confusion_matrix.py | 211 ++++++++++++++++ .../_plot/tests/test_plot_confusion_matrix.py | 228 ++++++++++++++++++ 8 files changed, 479 insertions(+), 80 deletions(-) create mode 100644 sklearn/metrics/_plot/confusion_matrix.py create mode 100644 sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 53ce0c94ece9f..f7d3fe6791407 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1082,6 +1082,7 @@ See the :ref:`visualizations` section of the user guide for further details. :toctree: generated/ :template: function.rst + metrics.plot_confusion_matrix metrics.plot_precision_recall_curve metrics.plot_roc_curve @@ -1089,6 +1090,7 @@ See the :ref:`visualizations` section of the user guide for further details. :toctree: generated/ :template: class.rst + metrics.ConfusionMatrixDisplay metrics.PrecisionRecallDisplay metrics.RocCurveDisplay diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 3f5999346401a..c4e0bf2b90795 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -573,8 +573,10 @@ predicted to be in group :math:`j`. Here is an example:: [0, 0, 1], [1, 0, 2]]) -Here is a visual representation of such a confusion matrix (this figure comes -from the :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` example): +:func:`plot_confusion_matrix` can be used to visually represent a confusion +matrix as shown in the +:ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` +example, which creates the following figure: .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_confusion_matrix_001.png :target: ../auto_examples/model_selection/plot_confusion_matrix.html diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 4b6f7ea34febb..47d826602b62f 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -72,6 +72,7 @@ Functions .. autosummary:: inspection.plot_partial_dependence + metrics.plot_confusion_matrix metrics.plot_precision_recall_curve metrics.plot_roc_curve @@ -84,5 +85,6 @@ Display Objects .. autosummary:: inspection.PartialDependenceDisplay + metrics.ConfusionMatrixDisplay metrics.PrecisionRecallDisplay metrics.RocCurveDisplay diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py index 21c4c81d15a62..6c7feb0f42065 100644 --- a/examples/classification/plot_digits_classification.py +++ b/examples/classification/plot_digits_classification.py @@ -31,12 +31,12 @@ # matplotlib.pyplot.imread. Note that each image must have the same size. For these # images, we know which digit they represent: it is given in the 'target' of # the dataset. 
+_, axes = plt.subplots(2, 4) images_and_labels = list(zip(digits.images, digits.target)) -for index, (image, label) in enumerate(images_and_labels[:4]): - plt.subplot(2, 4, index + 1) - plt.axis('off') - plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') - plt.title('Training: %i' % label) +for ax, (image, label) in zip(axes[0, :], images_and_labels[:4]): + ax.set_axis_off() + ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + ax.set_title('Training: %i' % label) # To apply a classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: @@ -56,15 +56,16 @@ # Now predict the value of the digit on the second half: predicted = classifier.predict(X_test) +images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted)) +for ax, (image, prediction) in zip(axes[1, :], images_and_predictions[:4]): + ax.set_axis_off() + ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') + ax.set_title('Prediction: %i' % prediction) + print("Classification report for classifier %s:\n%s\n" % (classifier, metrics.classification_report(y_test, predicted))) -print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted)) - -images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted)) -for index, (image, prediction) in enumerate(images_and_predictions[:4]): - plt.subplot(2, 4, index + 5) - plt.axis('off') - plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') - plt.title('Prediction: %i' % prediction) +disp = metrics.plot_confusion_matrix(classifier, X_test, y_test) +disp.figure_.suptitle("Confusion Matrix") +print("Confusion matrix:\n%s" % disp.confusion_matrix) plt.show() diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py index 8e4aa73149505..5bed1a2ccec38 100644 --- a/examples/model_selection/plot_confusion_matrix.py +++ b/examples/model_selection/plot_confusion_matrix.py @@ -31,8 +31,7 @@ from sklearn import svm, datasets from sklearn.model_selection import train_test_split -from sklearn.metrics import confusion_matrix -from sklearn.utils.multiclass import unique_labels +from sklearn.metrics import plot_confusion_matrix # import some data to play with iris = datasets.load_iris() @@ -45,72 +44,21 @@ # Run classifier, using a model that is too regularized (C too low) to see # the impact on the results -classifier = svm.SVC(kernel='linear', C=0.01) -y_pred = classifier.fit(X_train, y_train).predict(X_test) - - -def plot_confusion_matrix(y_true, y_pred, classes, - normalize=False, - title=None, - cmap=plt.cm.Blues): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. - """ - if not title: - if normalize: - title = 'Normalized confusion matrix' - else: - title = 'Confusion matrix, without normalization' - - # Compute confusion matrix - cm = confusion_matrix(y_true, y_pred) - # Only use the labels that appear in the data - classes = classes[unique_labels(y_true, y_pred)] - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - fig, ax = plt.subplots() - im = ax.imshow(cm, interpolation='nearest', cmap=cmap) - ax.figure.colorbar(im, ax=ax) - # We want to show all ticks... - ax.set(xticks=np.arange(cm.shape[1]), - yticks=np.arange(cm.shape[0]), - # ... 
and label them with the respective list entries - xticklabels=classes, yticklabels=classes, - title=title, - ylabel='True label', - xlabel='Predicted label') - - # Rotate the tick labels and set their alignment. - plt.setp(ax.get_xticklabels(), rotation=45, ha="right", - rotation_mode="anchor") - - # Loop over data dimensions and create text annotations. - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. - for i in range(cm.shape[0]): - for j in range(cm.shape[1]): - ax.text(j, i, format(cm[i, j], fmt), - ha="center", va="center", - color="white" if cm[i, j] > thresh else "black") - fig.tight_layout() - return ax - +classifier = svm.SVC(kernel='linear', C=0.01).fit(X_train, y_train) np.set_printoptions(precision=2) # Plot non-normalized confusion matrix -plot_confusion_matrix(y_test, y_pred, classes=class_names, - title='Confusion matrix, without normalization') - -# Plot normalized confusion matrix -plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True, - title='Normalized confusion matrix') +titles_options = [("Confusion matrix, without normalization", None), + ("Normalized confusion matrix", 'true')] +for title, normalize in titles_options: + disp = plot_confusion_matrix(classifier, X_test, y_test, + display_labels=class_names, + cmap=plt.cm.Blues, + normalize=normalize) + disp.ax_.set_title(title) + + print(title) + print(disp.confusion_matrix) plt.show() diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index ac6162b924a90..69f34c492e3a8 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -82,6 +82,9 @@ from ._plot.precision_recall_curve import plot_precision_recall_curve from ._plot.precision_recall_curve import PrecisionRecallDisplay +from ._plot.confusion_matrix import plot_confusion_matrix +from ._plot.confusion_matrix import ConfusionMatrixDisplay + __all__ = [ 'accuracy_score', @@ -97,6 +100,7 @@ 'cluster', 'cohen_kappa_score', 'completeness_score', + 'ConfusionMatrixDisplay', 'confusion_matrix', 'consensus_score', 'coverage_error', @@ -137,6 +141,7 @@ 'pairwise_distances_argmin_min', 'pairwise_distances_chunked', 'pairwise_kernels', + 'plot_confusion_matrix', 'plot_precision_recall_curve', 'plot_roc_curve', 'PrecisionRecallDisplay', diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py new file mode 100644 index 0000000000000..f2b9dd5dd00bd --- /dev/null +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -0,0 +1,211 @@ +from itertools import product + +import numpy as np + +from .. import confusion_matrix +from ...utils import check_matplotlib_support +from ...base import is_classifier + + +class ConfusionMatrixDisplay: + """Confusion Matrix visualization. + + It is recommend to use :func:`~sklearn.metrics.plot_confusion_matrix` to + create a :class:`ConfusionMatrixDisplay`. All parameters are stored as + attributes. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + confusion_matrix : ndarray of shape (n_classes, n_classes) + Confusion matrix. + + display_labels : ndarray of shape (n_classes,) + Display labels for plot. + + Attributes + ---------- + im_ : matplotlib AxesImage + Image representing the confusion matrix. + + text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \ + or None + Array of matplotlib axes. `None` if `include_values` is false. + + ax_ : matplotlib Axes + Axes with confusion matrix. + + figure_ : matplotlib Figure + Figure containing the confusion matrix. 
+ """ + def __init__(self, confusion_matrix, display_labels): + self.confusion_matrix = confusion_matrix + self.display_labels = display_labels + + def plot(self, include_values=True, cmap='viridis', + xticks_rotation='horizontal', values_format=None, ax=None): + """Plot visualization. + + Parameters + ---------- + include_values : bool, default=True + Includes values in confusion matrix. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='vertical' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, + the format specification is '.2f' for a normalized matrix, and + 'd' for a unnormalized matrix. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + """ + check_matplotlib_support("ConfusionMatrixDisplay.plot") + import matplotlib.pyplot as plt + + if ax is None: + fig, ax = plt.subplots() + else: + fig = ax.figure + + cm = self.confusion_matrix + n_classes = cm.shape[0] + self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap) + self.text_ = None + + cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256) + + if include_values: + self.text_ = np.empty_like(cm, dtype=object) + if values_format is None: + values_format = '.2g' + + # print text with appropriate color depending on background + thresh = (cm.max() - cm.min()) / 2. + for i, j in product(range(n_classes), range(n_classes)): + color = cmap_max if cm[i, j] < thresh else cmap_min + self.text_[i, j] = ax.text(j, i, + format(cm[i, j], values_format), + ha="center", va="center", + color=color) + + fig.colorbar(self.im_, ax=ax) + ax.set(xticks=np.arange(n_classes), + yticks=np.arange(n_classes), + xticklabels=self.display_labels, + yticklabels=self.display_labels, + ylabel="True label", + xlabel="Predicted label") + + ax.set_ylim((n_classes - 0.5, -0.5)) + plt.setp(ax.get_xticklabels(), rotation=xticks_rotation) + + self.figure_ = fig + self.ax_ = ax + return self + + +def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, + labels=None, display_labels=None, + include_values=True, normalize=None, + xticks_rotation='horizontal', + values_format=None, + cmap='viridis', ax=None): + """Plot Confusion Matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator instance + Trained classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the matrix. This may be used to reorder or + select a subset of labels. If `None` is given, those that appear at + least once in `y_true` or `y_pred` are used in sorted order. + + display_labels : array-like of shape (n_classes,), default=None + Target names used for plotting. By default, `labels` will be used if + it is defined, otherwise the unique labels of `y_true` and `y_pred` + will be used. + + include_values : bool, default=True + Includes values in confusion matrix. + + normalize : {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicited (columns) + conditions or all the population. 
If None, confusion matrix will not be + normalized. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='vertical' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, + the format specification is '.2f' for a normalized matrix, and + 'd' for a unnormalized matrix. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + """ + check_matplotlib_support("plot_confusion_matrix") + + if not is_classifier(estimator): + raise ValueError("plot_confusion_matrix only supports classifiers") + + if normalize not in {'true', 'pred', 'all', None}: + raise ValueError("normalize must be one of {'true', 'pred', " + "'all', None}") + + y_pred = estimator.predict(X) + cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, + labels=labels) + + if normalize == 'true': + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == 'pred': + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == 'all': + cm = cm / cm.sum() + + if display_labels is None: + if labels is None: + display_labels = estimator.classes_ + else: + display_labels = labels + + disp = ConfusionMatrixDisplay(confusion_matrix=cm, + display_labels=display_labels) + return disp.plot(include_values=include_values, + cmap=cmap, ax=ax, xticks_rotation=xticks_rotation) diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py new file mode 100644 index 0000000000000..deb506143bdfb --- /dev/null +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -0,0 +1,228 @@ +import pytest +import numpy as np +from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal + +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR + +from sklearn.metrics import confusion_matrix +from sklearn.metrics import plot_confusion_matrix +from sklearn.metrics import ConfusionMatrixDisplay + + +@pytest.fixture(scope="module") +def n_classes(): + return 5 + + +@pytest.fixture(scope="module") +def data(n_classes): + X, y = make_classification(n_samples=100, n_informative=5, + n_classes=n_classes, random_state=0) + return X, y + + +@pytest.fixture(scope="module") +def fitted_clf(data): + return SVC(kernel='linear', C=0.01).fit(*data) + + +@pytest.fixture(scope="module") +def y_pred(data, fitted_clf): + X, _ = data + return fitted_clf.predict(X) + + +def test_error_on_regressor(pyplot, data): + X, y = data + est = SVR().fit(X, y) + + msg = "plot_confusion_matrix only supports classifiers" + with pytest.raises(ValueError, match=msg): + plot_confusion_matrix(est, X, y) + + +def test_error_on_invalid_option(pyplot, fitted_clf, data): + X, y = data + msg = (r"normalize must be one of \{'true', 'pred', 'all', " + r"None\}") + + with pytest.raises(ValueError, match=msg): + plot_confusion_matrix(fitted_clf, X, y, normalize='invalid') + + +@pytest.mark.parametrize("with_labels", [True, False]) +@pytest.mark.parametrize("with_display_labels", [True, False]) +def 
test_plot_confusion_matrix_custom_labels(pyplot, data, y_pred, fitted_clf, + n_classes, with_labels, + with_display_labels): + X, y = data + ax = pyplot.gca() + labels = [2, 1, 0, 3, 4] if with_labels else None + display_labels = ['b', 'd', 'a', 'e', 'f'] if with_display_labels else None + + cm = confusion_matrix(y, y_pred, labels=labels) + disp = plot_confusion_matrix(fitted_clf, X, y, + ax=ax, display_labels=display_labels, + labels=labels) + + assert_allclose(disp.confusion_matrix, cm) + + if with_display_labels: + expected_display_labels = display_labels + elif with_labels: + expected_display_labels = labels + else: + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) + for name in expected_display_labels] + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + +@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None]) +@pytest.mark.parametrize("include_values", [True, False]) +def test_plot_confusion_matrix(pyplot, data, y_pred, n_classes, fitted_clf, + normalize, include_values): + X, y = data + ax = pyplot.gca() + cmap = 'plasma' + cm = confusion_matrix(y, y_pred) + disp = plot_confusion_matrix(fitted_clf, X, y, + normalize=normalize, + cmap=cmap, ax=ax, + include_values=include_values) + + assert disp.ax_ == ax + + if normalize == 'true': + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == 'pred': + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == 'all': + cm = cm / cm.sum() + + assert_allclose(disp.confusion_matrix, cm) + import matplotlib as mpl + assert isinstance(disp.im_, mpl.image.AxesImage) + assert disp.im_.get_cmap().name == cmap + assert isinstance(disp.ax_, pyplot.Axes) + assert isinstance(disp.figure_, pyplot.Figure) + + assert disp.ax_.get_ylabel() == "True label" + assert disp.ax_.get_xlabel() == "Predicted label" + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) + for name in expected_display_labels] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + if include_values: + assert disp.text_.shape == (n_classes, n_classes) + fmt = '.2g' + expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")]) + text_text = np.array([ + t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + else: + assert disp.text_ is None + + +def test_confusion_matrix_display(pyplot, data, fitted_clf, y_pred, n_classes): + X, y = data + + cm = confusion_matrix(y, y_pred) + disp = plot_confusion_matrix(fitted_clf, X, y, normalize=None, + include_values=True, cmap='viridis', + xticks_rotation=45.0) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) + + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 45.0) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + 
disp.plot(cmap='plasma') + assert disp.im_.get_cmap().name == 'plasma' + + disp.plot(include_values=False) + assert disp.text_ is None + + disp.plot(xticks_rotation=90.0) + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 90.0) + + disp.plot(values_format='e') + expected_text = np.array([format(v, 'e') for v in cm.ravel(order="C")]) + text_text = np.array([ + t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + + +def test_confusion_matrix_contrast(pyplot): + # make sure text color is appropriate depending on background + + cm = np.eye(2) / 2 + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.gray) + # diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # oof-diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + + disp.plot(cmap=pyplot.cm.gray_r) + # diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # oof-diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + + +@pytest.mark.parametrize( + "clf", [LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline(make_column_transformer((StandardScaler(), [0, 1])), + LogisticRegression())]) +def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes): + X, y = data + with pytest.raises(NotFittedError): + plot_confusion_matrix(clf, X, y) + clf.fit(X, y) + y_pred = clf.predict(X) + + disp = plot_confusion_matrix(clf, X, y) + cm = confusion_matrix(y, y_pred) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) From f1d883252b17d0d38322bcecc5a8be81779c76b3 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 Nov 2019 08:44:20 -0800 Subject: [PATCH 006/448] DOC Adds note regarding plot partial dependence with the same axes (#15599) --- sklearn/inspection/_partial_dependence.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index ca0b18b5b6173..6482535b535eb 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -425,6 +425,22 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. + .. note:: + + :func:`plot_partial_dependence` does not support using the same axes + with multiple calls. To plot the the partial dependence for multiple + estimators, please pass the axes created by the first call to the + second call:: + + >>> from sklearn.inspection import plot_partial_dependence + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.linear_model import LinearRegression + >>> X, y = make_friedman1() + >>> est = LinearRegression().fit(X, y) + >>> disp1 = plot_partial_dependence(est, X) # doctest: +SKIP + >>> disp2 = plot_partial_dependence(est, X, + ... ax=disp1.axes_) # doctest: +SKIP + Read more in the :ref:`User Guide `. 
Parameters From 343214085acbf593f1a1b3b46fd3b182be6ecd81 Mon Sep 17 00:00:00 2001 From: smarie Date: Thu, 14 Nov 2019 18:29:23 +0100 Subject: [PATCH 007/448] ENH KernelPCA raises error in case of numerical/conditioning issues (#12145) --- doc/whats_new/v0.22.rst | 7 + sklearn/decomposition/_kernel_pca.py | 7 +- .../decomposition/tests/test_kernel_pca.py | 18 ++ sklearn/exceptions.py | 15 +- sklearn/utils/tests/test_validation.py | 79 ++++++++- sklearn/utils/validation.py | 166 +++++++++++++++++- 6 files changed, 288 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index da1263228c3fa..6ea54ed8b7f44 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -192,6 +192,13 @@ Changelog :mod:`sklearn.cross_decomposition` .................................. +- |Enhancement| :class:`decomposition.KernelPCA` now properly checks the + eigenvalues found by the solver for numerical or conditioning issues. This + ensures consistency of results across solvers (different choices for + ``eigen_solver``), including approximate solvers such as ``'randomized'`` and + ``'lobpcg'`` (see :issue:`12068`). + :pr:`12145` by :user:`Sylvain Marié ` + - |Fix| Fixed a bug where :class:`cross_decomposition.PLSCanonical` and :class:`cross_decomposition.PLSRegression` were raising an error when fitted with a target matrix `Y` in which the first column was constant. diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 1429106495a6e..169b0942e74bd 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -9,7 +9,8 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip -from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import (check_is_fitted, check_array, + _check_psd_eigenvalues) from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer @@ -211,6 +212,10 @@ def _fit_transform(self, K): maxiter=self.max_iter, v0=v0) + # make sure that the eigenvalues are ok and fix numerical issues + self.lambdas_ = _check_psd_eigenvalues(self.lambdas_, + enable_warnings=False) + # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, np.empty_like(self.alphas_).T) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 9b6bd5d4de436..39fc16b5ff5fb 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -11,6 +11,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.metrics.pairwise import rbf_kernel +from sklearn.utils.validation import _check_psd_eigenvalues def test_kernel_pca(): @@ -270,3 +271,20 @@ def test_nested_circles(): # The data is perfectly linearly separable in that space train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) assert train_score == 1.0 + + +def test_kernel_conditioning(): + """ Test that ``_check_psd_eigenvalues`` is correctly called + Non-regression test for issue #12140 (PR #12145)""" + + # create a pathological X leading to small non-zero eigenvalue + X = [[5, 1], + [5+1e-8, 1e-8], + [5+1e-8, 0]] + kpca = KernelPCA(kernel="linear", n_components=2, + fit_inverse_transform=True) + kpca.fit(X) + + # check that the small non-zero eigenvalue was correctly set to zero + assert kpca.lambdas_.min() == 0 + assert 
np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_)) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 95a876b88be4d..590fa416dd807 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -12,7 +12,8 @@ 'FitFailedWarning', 'NonBLASDotWarning', 'SkipTestWarning', - 'UndefinedMetricWarning'] + 'UndefinedMetricWarning', + 'PositiveSpectrumWarning'] class NotFittedError(ValueError, AttributeError): @@ -171,3 +172,15 @@ class UndefinedMetricWarning(UserWarning): .. versionchanged:: 0.18 Moved from sklearn.base. """ + + +class PositiveSpectrumWarning(UserWarning): + """Warning raised when the eigenvalues of a PSD matrix have issues + + This warning is typically raised by ``_check_psd_eigenvalues`` when the + eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix + (kernel) present significant negative eigenvalues, or bad conditioning i.e. + very small non-zero eigenvalues compared to the largest eigenvalue. + + .. versionadded:: 0.22 + """ diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 1d71a3d7dd1e4..56efb98a8b2d8 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -41,13 +41,15 @@ check_non_negative, _num_samples, check_scalar, + _check_psd_eigenvalues, _deprecate_positional_args, _check_sample_weight, _allclose_dense_sparse, FLOAT_DTYPES) + import sklearn -from sklearn.exceptions import NotFittedError +from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning from sklearn.exceptions import DataConversionWarning from sklearn.utils._testing import assert_raise_message @@ -937,6 +939,81 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, assert type(raised_error.value) == type(err_msg) +_psd_cases_valid = { + 'nominal': ((1, 2), np.array([1, 2]), None, ""), + 'nominal_np_array': (np.array([1, 2]), np.array([1, 2]), None, ""), + 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), + PositiveSpectrumWarning, + "There are imaginary parts in eigenvalues " + "\\(1e\\-05 of the maximum real part"), + 'insignificant neg': ((5, -5e-5), np.array([5, 0]), + PositiveSpectrumWarning, ""), + 'insignificant neg float32': (np.array([1, -1e-6], dtype=np.float32), + np.array([1, 0], dtype=np.float32), + PositiveSpectrumWarning, + "There are negative eigenvalues \\(1e\\-06 " + "of the maximum positive"), + 'insignificant neg float64': (np.array([1, -1e-10], dtype=np.float64), + np.array([1, 0], dtype=np.float64), + PositiveSpectrumWarning, + "There are negative eigenvalues \\(1e\\-10 " + "of the maximum positive"), + 'insignificant pos': ((5, 4e-12), np.array([5, 0]), + PositiveSpectrumWarning, + "the largest eigenvalue is more than 1e\\+12 " + "times the smallest"), +} + + +@pytest.mark.parametrize("lambdas, expected_lambdas, w_type, w_msg", + list(_psd_cases_valid.values()), + ids=list(_psd_cases_valid.keys())) +@pytest.mark.parametrize("enable_warnings", [True, False]) +def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, + enable_warnings): + # Test that ``_check_psd_eigenvalues`` returns the right output for valid + # input, possibly raising the right warning + + if not enable_warnings: + w_type = None + w_msg = "" + + with pytest.warns(w_type, match=w_msg) as w: + assert_array_equal( + _check_psd_eigenvalues(lambdas, enable_warnings=enable_warnings), + expected_lambdas + ) + if w_type is None: + assert not w + + +_psd_cases_invalid = { + 'significant_imag': ((5, 5j), ValueError, + "There are 
significant imaginary parts in eigenv"), + 'all negative': ((-5, -1), ValueError, + "All eigenvalues are negative \\(maximum is -1"), + 'significant neg': ((5, -1), ValueError, + "There are significant negative eigenvalues"), + 'significant neg float32': (np.array([3e-4, -2e-6], dtype=np.float32), + ValueError, + "There are significant negative eigenvalues"), + 'significant neg float64': (np.array([1e-5, -2e-10], dtype=np.float64), + ValueError, + "There are significant negative eigenvalues"), +} + + +@pytest.mark.parametrize("lambdas, err_type, err_msg", + list(_psd_cases_invalid.values()), + ids=list(_psd_cases_invalid.keys())) +def test_check_psd_eigenvalues_invalid(lambdas, err_type, err_msg): + # Test that ``_check_psd_eigenvalues`` raises the right error for invalid + # input + + with pytest.raises(err_type, match=err_msg): + _check_psd_eigenvalues(lambdas) + + def test_check_sample_weight(): # check array order sample_weight = np.ones(10)[::2] diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index dad56850f2235..424cf4b5180a3 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -6,6 +6,7 @@ # Lars Buitinck # Alexandre Gramfort # Nicolas Tresegnie +# Sylvain Marie # License: BSD 3 clause from functools import wraps @@ -22,7 +23,7 @@ from .fixes import _object_dtype_isnan from .. import get_config as _get_config -from ..exceptions import NonBLASDotWarning +from ..exceptions import NonBLASDotWarning, PositiveSpectrumWarning from ..exceptions import NotFittedError from ..exceptions import DataConversionWarning @@ -1020,6 +1021,169 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) +def _check_psd_eigenvalues(lambdas, enable_warnings=False): + """Check the eigenvalues of a positive semidefinite (PSD) matrix. + + Checks the provided array of PSD matrix eigenvalues for numerical or + conditioning issues and returns a fixed validated version. This method + should typically be used if the PSD matrix is user-provided (e.g. a + Gram matrix) or computed using a user-provided dissimilarity metric + (e.g. kernel function), or if the decomposition process uses approximation + methods (randomized SVD, etc.). + + It checks for three things: + + - that there are no significant imaginary parts in eigenvalues (more than + 1e-5 times the maximum real part). If this check fails, it raises a + ``ValueError``. Otherwise all non-significant imaginary parts that may + remain are set to zero. This operation is traced with a + ``PositiveSpectrumWarning`` when ``enable_warnings=True``. + + - that eigenvalues are not all negative. If this check fails, it raises a + ``ValueError`` + + - that there are no significant negative eigenvalues with absolute value + more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest + positive eigenvalue in double (simple) precision. If this check fails, + it raises a ``ValueError``. Otherwise all negative eigenvalues that may + remain are set to zero. This operation is traced with a + ``PositiveSpectrumWarning`` when ``enable_warnings=True``. + + Finally, all the positive eigenvalues that are too small (with a value + smaller than the maximum eigenvalue divided by 1e12) are set to zero. + This operation is traced with a ``PositiveSpectrumWarning`` when + ``enable_warnings=True``. + + Parameters + ---------- + lambdas : array-like of shape (n_eigenvalues,) + Array of eigenvalues to check / fix. 
+ + enable_warnings : bool, default=False + When this is set to ``True``, a ``PositiveSpectrumWarning`` will be + raised when there are imaginary parts, negative eigenvalues, or + extremely small non-zero eigenvalues. Otherwise no warning will be + raised. In both cases, imaginary parts, negative eigenvalues, and + extremely small non-zero eigenvalues will be set to zero. + + Returns + ------- + lambdas_fixed : ndarray of shape (n_eigenvalues,) + A fixed validated copy of the array of eigenvalues. + + Examples + -------- + >>> _check_psd_eigenvalues([1, 2]) # nominal case + array([1, 2]) + >>> _check_psd_eigenvalues([5, 5j]) # significant imag part + Traceback (most recent call last): + ... + ValueError: There are significant imaginary parts in eigenvalues (1 + of the maximum real part). Either the matrix is not PSD, or there was + an issue while computing the eigendecomposition of the matrix. + >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part + array([5., 0.]) + >>> _check_psd_eigenvalues([-5, -1]) # all negative + Traceback (most recent call last): + ... + ValueError: All eigenvalues are negative (maximum is -1). Either the + matrix is not PSD, or there was an issue while computing the + eigendecomposition of the matrix. + >>> _check_psd_eigenvalues([5, -1]) # significant negative + Traceback (most recent call last): + ... + ValueError: There are significant negative eigenvalues (0.2 of the + maximum positive). Either the matrix is not PSD, or there was an issue + while computing the eigendecomposition of the matrix. + >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative + array([5., 0.]) + >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) + array([5., 0.]) + + """ + + lambdas = np.array(lambdas) + is_double_precision = lambdas.dtype == np.float64 + + # note: the minimum value available is + # - single-precision: np.finfo('float32').eps = 1.2e-07 + # - double-precision: np.finfo('float64').eps = 2.2e-16 + + # the various thresholds used for validation + # we may wish to change the value according to precision. + significant_imag_ratio = 1e-5 + significant_neg_ratio = 1e-5 if is_double_precision else 5e-3 + significant_neg_value = 1e-10 if is_double_precision else 1e-6 + small_pos_ratio = 1e-12 + + # Check that there are no significant imaginary parts + if not np.isreal(lambdas).all(): + max_imag_abs = np.abs(np.imag(lambdas)).max() + max_real_abs = np.abs(np.real(lambdas)).max() + if max_imag_abs > significant_imag_ratio * max_real_abs: + raise ValueError( + "There are significant imaginary parts in eigenvalues (%g " + "of the maximum real part). Either the matrix is not PSD, or " + "there was an issue while computing the eigendecomposition " + "of the matrix." + % (max_imag_abs / max_real_abs)) + + # warn about imaginary parts being removed + if enable_warnings: + warnings.warn("There are imaginary parts in eigenvalues (%g " + "of the maximum real part). Either the matrix is not" + " PSD, or there was an issue while computing the " + "eigendecomposition of the matrix. Only the real " + "parts will be kept." + % (max_imag_abs / max_real_abs), + PositiveSpectrumWarning) + + # Remove all imaginary parts (even if zero) + lambdas = np.real(lambdas) + + # Check that there are no significant negative eigenvalues + max_eig = lambdas.max() + if max_eig < 0: + raise ValueError("All eigenvalues are negative (maximum is %g). " + "Either the matrix is not PSD, or there was an " + "issue while computing the eigendecomposition of " + "the matrix." 
% max_eig) + + else: + min_eig = lambdas.min() + if (min_eig < -significant_neg_ratio * max_eig + and min_eig < -significant_neg_value): + raise ValueError("There are significant negative eigenvalues (%g" + " of the maximum positive). Either the matrix is " + "not PSD, or there was an issue while computing " + "the eigendecomposition of the matrix." + % (-min_eig / max_eig)) + elif min_eig < 0: + # Remove all negative values and warn about it + if enable_warnings: + warnings.warn("There are negative eigenvalues (%g of the " + "maximum positive). Either the matrix is not " + "PSD, or there was an issue while computing the" + " eigendecomposition of the matrix. Negative " + "eigenvalues will be replaced with 0." + % (-min_eig / max_eig), + PositiveSpectrumWarning) + lambdas[lambdas < 0] = 0 + + # Check for conditioning (small positive non-zeros) + too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) + if too_small_lambdas.any(): + if enable_warnings: + warnings.warn("Badly conditioned PSD matrix spectrum: the largest " + "eigenvalue is more than %g times the smallest. " + "Small eigenvalues will be replaced with 0." + "" % (1 / small_pos_ratio), + PositiveSpectrumWarning) + lambdas[too_small_lambdas] = 0 + + return lambdas + + def _check_sample_weight(sample_weight, X, dtype=None): """Validate sample weights. From 2885a06c94f2938ba722d3dee02e2b9d772b7104 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 Nov 2019 09:45:57 -0800 Subject: [PATCH 008/448] BUG Fixes partial dependence bug by setting axes to be invisible (#15572) --- sklearn/inspection/_partial_dependence.py | 5 ++--- .../inspection/tests/test_plot_partial_dependence.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 6482535b535eb..785195e2fd767 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -859,15 +859,14 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): n_features = len(self.features) if isinstance(ax, plt.Axes): - # If ax has visible==False, it has most likely been set to False + # If ax was set off, it has most likely been set to off # by a previous call to plot. - if not ax.get_visible(): + if not ax.axison: raise ValueError("The ax was already used in another plot " "function, please set ax=display.axes_ " "instead") ax.set_axis_off() - ax.set_visible(False) self.bounding_ax_ = ax self.figure_ = ax.figure diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 0adf066ec83cd..71f9ec078f655 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -229,7 +229,16 @@ def test_plot_partial_dependence_incorrent_num_axes(pyplot, clf_boston, def test_plot_partial_dependence_with_same_axes(pyplot, clf_boston, boston): - # The first call to `plot_*` will plot the axes + # The first call to plot_partial_dependence will create two new axes to + # place in the space of the passed in axes, which results in a total of + # three axes in the figure. + # Currently the API does not allow for the second call to + # plot_partial_dependence to use the same axes again, because it will + # create two new axes in the space resulting in five axes. 
To get the + # expected behavior one needs to pass the generated axes into the second + # call: + # disp1 = plot_partial_dependence(...) + # disp2 = plot_partial_dependence(..., ax=disp1.axes_) grid_resolution = 25 fig, ax = pyplot.subplots() From 7042019c8db4c547cb923db60c0bfd25a4c16827 Mon Sep 17 00:00:00 2001 From: Kevin Loftis Date: Thu, 14 Nov 2019 10:06:49 -0800 Subject: [PATCH 009/448] BLD Prevent hyphenation in parameter description (#15499) --- doc/themes/scikit-learn-modern/static/css/theme.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index c6738bb760d44..9fb35c73c27bd 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -823,6 +823,10 @@ div.body img { height: unset!important; /* Needed because sphinx sets the height */ } +div.body dd > p { + hyphens: none; +} + img.align-center, .figure.align-center, object.align-center { display: block; margin-left: auto; From 97958c174f8b95f7e86d6a0280847be61bfc8e04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Nov 2019 05:02:38 -0500 Subject: [PATCH 010/448] FIX fix bug when warm starting with early stopping in Hist GBDT (#15624) --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 ++ .../_hist_gradient_boosting/tests/test_warm_start.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 7310ab95b224f..78dab9419d0f7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -270,6 +270,8 @@ def fit(self, X, y): # Compute raw predictions raw_predictions = self._raw_predict(X_binned_train) + if self.do_early_stopping_ and self._use_validation_data: + raw_predictions_val = self._raw_predict(X_binned_val) if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index 5fcae12873a43..e5ec1371f3aa6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -93,14 +93,15 @@ def test_warm_start_max_depth(GradientBoosting, X, y): (HistGradientBoostingClassifier, X_classification, y_classification), (HistGradientBoostingRegressor, X_regression, y_regression) ]) -def test_warm_start_early_stopping(GradientBoosting, X, y): +@pytest.mark.parametrize('scoring', (None, 'loss')) +def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): # Make sure that early stopping occurs after a small number of iterations # when fitting a second time with warm starting. 
n_iter_no_change = 5 gb = GradientBoosting( n_iter_no_change=n_iter_no_change, max_iter=10000, - random_state=42, warm_start=True, tol=1e-3 + random_state=42, warm_start=True, tol=1e-3, scoring=scoring, ) gb.fit(X, y) n_iter_first_fit = gb.n_iter_ From 080d0e5595b47c6b4ac771c54510917925a9c267 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 15 Nov 2019 11:44:23 +0100 Subject: [PATCH 011/448] ENH add normalize parameter to confusion_matrix (#15625) --- doc/modules/model_evaluation.rst | 13 ++++++- doc/whats_new/v0.22.rst | 5 +++ sklearn/metrics/_classification.py | 33 +++++++++++++----- sklearn/metrics/_plot/confusion_matrix.py | 11 ++---- sklearn/metrics/tests/test_classification.py | 36 ++++++++++++++++++++ 5 files changed, 79 insertions(+), 19 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c4e0bf2b90795..1410887c4c51f 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -574,7 +574,7 @@ predicted to be in group :math:`j`. Here is an example:: [1, 0, 2]]) :func:`plot_confusion_matrix` can be used to visually represent a confusion -matrix as shown in the +matrix as shown in the :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` example, which creates the following figure: @@ -583,6 +583,17 @@ example, which creates the following figure: :scale: 75 :align: center +The parameter ``normalize`` allows to report ratios instead of counts. The +confusion matrix can be normalized in 3 different ways: ``'pred'``, ``'true'``, +and ``'all'`` which will divide the counts by the sum of each columns, rows, or +the entire matrix, respectively. + + >>> y_true = [0, 0, 0, 1, 1, 1, 1, 1] + >>> y_pred = [0, 1, 0, 1, 0, 1, 0, 1] + >>> confusion_matrix(y_true, y_pred, normalize='all') + array([[0.25 , 0.125], + [0.25 , 0.375]]) + For binary problems, we can get counts of true negatives, false positives, false negatives and true positives as follows:: diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 6ea54ed8b7f44..a7b3c027d85ba 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -631,6 +631,11 @@ Changelog used as the :term:`scoring` parameter of model-selection tools. :pr:`14417` by `Thomas Fan`_. +- |Enhancement| :func:`metrics.confusion_matrix` accepts a parameters + `normalize` allowing to normalize the confusion matrix by column, rows, or + overall. + :pr:`15625` by `Guillaume Lemaitre `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 04d0a009df4b0..8a975a6f59802 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -193,8 +193,9 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None): - """Compute confusion matrix to evaluate the accuracy of a classification +def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, + normalize=None): + """Compute confusion matrix to evaluate the accuracy of a classification. 
By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` is equal to the number of observations known to be in group :math:`i` and @@ -208,25 +209,30 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None): Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like of shape (n_samples,) Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : array, shape = [n_classes], optional + labels : array-like of shape (n_classes), default=None List of labels to index the matrix. This may be used to reorder or select a subset of labels. - If none is given, those that appear at least once + If ``None`` is given, those that appear at least once in ``y_true`` or ``y_pred`` are used in sorted order. sample_weight : array-like of shape (n_samples,), default=None Sample weights. + normalize : {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, confusion matrix will not be + normalized. + Returns ------- C : ndarray of shape (n_classes, n_classes) - Confusion matrix + Confusion matrix. References ---------- @@ -296,11 +302,20 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None): else: dtype = np.float64 - CM = coo_matrix((sample_weight, (y_true, y_pred)), + cm = coo_matrix((sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), dtype=dtype, ).toarray() - return CM + with np.errstate(all='ignore'): + if normalize == 'true': + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == 'pred': + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == 'all': + cm = cm / cm.sum() + cm = np.nan_to_num(cm) + + return cm def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index f2b9dd5dd00bd..821fde54cf946 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -155,7 +155,7 @@ def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, Includes values in confusion matrix. normalize : {'true', 'pred', 'all'}, default=None - Normalizes confusion matrix over the true (rows), predicited (columns) + Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. If None, confusion matrix will not be normalized. 
@@ -190,14 +190,7 @@ def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, y_pred = estimator.predict(X) cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, - labels=labels) - - if normalize == 'true': - cm = cm / cm.sum(axis=1, keepdims=True) - elif normalize == 'pred': - cm = cm / cm.sum(axis=0, keepdims=True) - elif normalize == 'all': - cm = cm / cm.sum() + labels=labels, normalize=normalize) if display_labels is None: if labels is None: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 6d981ee4da53c..05aab00a1ce4f 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1,6 +1,8 @@ from functools import partial from itertools import product +from itertools import chain +from itertools import permutations import warnings import re @@ -17,6 +19,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings @@ -509,6 +512,39 @@ def test_multilabel_confusion_matrix_errors(): [[1, 2, 0], [1, 0, 2]]) +@pytest.mark.parametrize( + "normalize, cm_dtype, expected_results", + [('true', 'f', 0.333333333), + ('pred', 'f', 0.333333333), + ('all', 'f', 0.1111111111), + (None, 'i', 2)] +) +def test_confusion_matrix_normalize(normalize, cm_dtype, expected_results): + y_test = [0, 1, 2] * 6 + y_pred = list(chain(*permutations([0, 1, 2]))) + cm = confusion_matrix(y_test, y_pred, normalize=normalize) + assert_allclose(cm, expected_results) + assert cm.dtype.kind == cm_dtype + + +def test_confusion_matrix_normalize_single_class(): + y_test = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 0, 0, 0, 0, 0, 0, 0] + + cm_true = confusion_matrix(y_test, y_pred, normalize='true') + assert cm_true.sum() == pytest.approx(2.0) + + # additionally check that no warnings are raised due to a division by zero + with pytest.warns(None) as rec: + cm_pred = confusion_matrix(y_test, y_pred, normalize='pred') + assert not rec + assert cm_pred.sum() == pytest.approx(1.0) + + with pytest.warns(None) as rec: + cm_pred = confusion_matrix(y_pred, y_test, normalize='true') + assert not rec + + def test_cohen_kappa(): # These label vectors reproduce the contingency matrix from Artstein and # Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]). From d5bd4e6a58a846992eb24524ad013dfa573b9342 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Nov 2019 05:46:19 -0500 Subject: [PATCH 012/448] DOC mention other plotting utilities in highlights (#15569) --- .../release_highlights/plot_release_highlights_0_22_0.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index 6589f3f3351fc..45db9166bba3a 100644 --- a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -26,7 +26,12 @@ # A new plotting API is available for creating visualizations. This new API # allows for quickly adjusting the visuals of a plot without involving any # recomputation. It is also possible to add different plots to the same -# figure. See more examples in the :ref:`User Guide `. 
+# figure. The following example illustrates :class:`~metrics.plot_roc_curve`, +# but other plots utilities are supported like +# :class:`~inspection.plot_partial_dependence`, +# :class:`~metrics.plot_precision_recall_curve`, and +# :class:`~metrics.plot_confusion_matrix`. Read more about this new API in the +# :ref:`User Guide `. from sklearn.model_selection import train_test_split from sklearn.svm import SVC From 3e4ad0567de5401ec42cc49af68cd07b83f0d716 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Nov 2019 11:06:57 -0500 Subject: [PATCH 013/448] MNT Fix whats new link on the home page (#15634) * fix whats new link * oops fixed bracket --- doc/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/templates/index.html b/doc/templates/index.html index 9a60b36ddeae2..b867d8c517cf5 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -8,7 +8,7 @@

scikit-learn

Machine Learning in Python

 Getting Started
-Whats New in {{ version }}
+What's New in {{ version }}
 GitHub
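Stepping back to the plotting-API highlight edited in PATCH 012 (#15569) above: each ``plot_*`` helper returns a Display object, so several plots can share one figure and an already computed curve can be re-drawn without being recomputed. A rough usage sketch, assuming the 0.22 plotting helpers (``plot_roc_curve``, ``plot_precision_recall_curve``) and a synthetic binary dataset:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
svc = SVC(random_state=42).fit(X_train, y_train)

# Draw both curves into one figure by passing existing Axes objects.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(10, 4))
roc_disp = plot_roc_curve(svc, X_test, y_test, ax=ax_roc)
pr_disp = plot_precision_recall_curve(svc, X_test, y_test, ax=ax_pr)

# The returned Display re-draws the stored curve on any Axes later on,
# without touching the estimator or the data again.
fig2, ax2 = plt.subplots()
roc_disp.plot(ax=ax2)
plt.show()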
From 947f37e3a134e4036c3aaf1549f38fa1ff27ca3d Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 15 Nov 2019 21:24:13 +0100 Subject: [PATCH 014/448] MNT bump version to 0.23.dev0 and add new whats_new (#15631) --- doc/whats_new.rst | 1 + doc/whats_new/v0.23.rst | 51 +++++++++++++++++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 doc/whats_new/v0.23.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index a9097d765886e..7b84374bd5146 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,6 +12,7 @@ on libraries.io to be notified when new versions are released. .. toctree:: :maxdepth: 1 + Version 0.23 Version 0.22 Version 0.21 Version 0.20 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst new file mode 100644 index 0000000000000..5cec206a0fe95 --- /dev/null +++ b/doc/whats_new/v0.23.rst @@ -0,0 +1,51 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_23: + +Version 0.23.0 +============== + +**In Development** + + +.. include:: changelog_legend.inc + +Put the changes in their relevant module. + + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- models come here + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +.. + Entries should be grouped by module (in alphabetic order) and prefixed with + one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, + |Fix| or |API| (see whats_new.rst for descriptions). + Entries should be ordered by those labels (e.g. |Fix| after |Efficiency|). + Changes not specific to a module should be listed under *Multiple Modules* + or *Miscellaneous*. + Entries should end with: + :pr:`123456` by :user:`Joe Bloggs `. + where 123456 is the *pull request* number, not the issue number. + +:mod:`sklearn.module` +..................... + +- |Fix| example fix in model XXX. :pr:`xxxx` or :issue:`xxxx` by + :user:`name ` diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 8b897eadc03e6..d49d93e224537 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -40,7 +40,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.22.dev0' +__version__ = '0.23.dev0' # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 004426ab535d7a98c8d4254b66dd113358a77771 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 15 Nov 2019 21:26:09 +0100 Subject: [PATCH 015/448] ENH Consistent checks for sample weights in linear models (#15530) --- sklearn/linear_model/_base.py | 22 ++--- sklearn/linear_model/_bayes.py | 8 +- sklearn/linear_model/_ransac.py | 3 +- sklearn/linear_model/_ridge.py | 164 ++++++++++++++++---------------- 4 files changed, 104 insertions(+), 93 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 457752d9a560d..e6270dce6d906 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -34,7 +34,7 @@ from ..utils.fixes import sparse_lsqr from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 from ..utils._seq_dataset import ArrayDataset64, CSRDataset64 -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..preprocessing import normalize as f_normalize # TODO: bayesian_ridge_regression and bayesian_regression_ard @@ -117,7 +117,6 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, This is here because nearly all linear models will want their data to be centered. This function also systematically makes y consistent with X.dtype """ - if isinstance(sample_weight, numbers.Number): sample_weight = None if sample_weight is not None: @@ -183,7 +182,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, def _rescale_data(X, y, sample_weight): """Rescale data so as to support sample_weight""" n_samples = X.shape[0] - sample_weight = np.array(sample_weight) + sample_weight = np.asarray(sample_weight) if sample_weight.ndim == 0: sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype) @@ -408,7 +407,7 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): Attributes ---------- - coef_ : array, shape (n_features, ) or (n_targets, n_features) + coef_ : array of shape (n_features, ) or (n_targets, n_features) Estimated coefficients for the linear regression problem. If multiple targets are passed during the fit (y 2D), this is a 2D array of shape (n_targets, n_features), while if only @@ -417,10 +416,10 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): rank_ : int Rank of matrix `X`. Only available when `X` is dense. - singular_ : array, shape (min(X, y),) + singular_ : array of shape (min(X, y),) Singular values of `X`. Only available when `X` is dense. - intercept_ : float | array, shape = (n_targets,) + intercept_ : float or array of shape of (n_targets,) Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. @@ -471,13 +470,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data - y : array_like, shape (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary - sample_weight : numpy array of shape [n_samples] + sample_weight : array-like of shape (n_samples,), default=None Individual weights for each sample .. 
versionadded:: 0.17 @@ -492,8 +491,9 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=True) - if sample_weight is not None and np.asarray(sample_weight).ndim > 1: - raise ValueError("Sample weights must be 1D array or scalar") + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=X.dtype) X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index c97f52ac6778e..333ae5494b4e9 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -14,6 +14,7 @@ from ..utils.extmath import fast_logdet from ..utils import check_X_y from ..utils.fixes import pinvh +from ..utils.validation import _check_sample_weight ############################################################################### @@ -169,7 +170,7 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : ndarray of shape (n_samples,n_features) + X : ndarray of shape (n_samples, n_features) Training data y : ndarray of shape (n_samples,) Target values. Will be cast to X's dtype if necessary @@ -190,6 +191,11 @@ def fit(self, X, y, sample_weight=None): ' Got {!r}.'.format(self.n_iter)) X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=X.dtype) + X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 5fe50b5a21acb..1350878b54154 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -320,7 +320,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError("%s does not support sample_weight. Samples" " weights are only used for the calibration" " itself." % estimator_name) - sample_weight = _check_sample_weight(sample_weight, X) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) n_inliers_best = 1 score_best = -np.inf diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index d217e0d832d2b..9e1dd7f22085d 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -245,15 +245,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Parameters ---------- - X : {array-like, sparse matrix, LinearOperator}, - shape = [n_samples, n_features] + X : {array-like, sparse matrix, LinearOperator} of shape \ + (n_samples, n_features) Training data y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values - alpha : {float, array-like}, - shape = [n_targets] if array-like + alpha : float or array-like of shape (n_targets,) Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. @@ -262,8 +261,9 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', assumed to be specific to the targets. Hence they must correspond in number. - sample_weight : float or numpy array of shape (n_samples,), default=None - Individual weights for each sample. 
If sample_weight is not None and + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. If sample_weight is not None and solver='auto', the solver will be set to 'cholesky'. .. versionadded:: 0.17 @@ -349,14 +349,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Returns ------- - coef : array, shape = [n_features] or [n_targets, n_features] + coef : array of shape (n_features,) or (n_targets, n_features) Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. - intercept : float or array, shape = [n_targets] + intercept : float or array of shape (n_targets,) The intercept of the model. Only returned if `return_intercept` is True and if X is a scipy sparse array. @@ -364,7 +364,6 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', ----- This function won't compute the intercept. """ - return _ridge_regression(X, y, alpha, sample_weight=sample_weight, solver=solver, @@ -566,9 +565,9 @@ def fit(self, X, y, sample_weight=None): else: solver = self.solver - if ((sample_weight is not None) and - np.asarray(sample_weight).ndim > 1): - raise ValueError("Sample weights must be 1D array or scalar") + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=X.dtype) # when X is sparse we only remove offset from y X, y, X_offset, y_offset, X_scale = self._preprocess_data( @@ -613,7 +612,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): the linear least squares function and regularization is given by the l2-norm. Also known as Ridge Regression or Tikhonov regularization. This estimator has built-in support for multi-variate regression - (i.e., when y is a 2d-array of shape [n_samples, n_targets]). + (i.e., when y is a 2d-array of shape (n_samples, n_targets)). Read more in the :ref:`User Guide `. @@ -701,14 +700,14 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Attributes ---------- - coef_ : array, shape (n_features,) or (n_targets, n_features) + coef_ : array of shape (n_features,) or (n_targets, n_features) Weight vector(s). - intercept_ : float | array, shape = (n_targets,) + intercept_ : float or array of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : array or None, shape (n_targets,) + n_iter_ : None or array of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. @@ -732,8 +731,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf = Ridge(alpha=1.0) >>> clf.fit(X, y) Ridge() - """ + def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): @@ -744,7 +743,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, random_state=random_state) def fit(self, X, y, sample_weight=None): - """Fit Ridge regression model + """Fit Ridge regression model. Parameters ---------- @@ -754,8 +753,9 @@ def fit(self, X, y, sample_weight=None): y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values - sample_weight : float or numpy array of shape [n_samples] - Individual weights for each sample + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. 
If given a float, every sample + will have the same weight. Returns ------- @@ -856,16 +856,16 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Attributes ---------- - coef_ : array, shape (1, n_features) or (n_classes, n_features) + coef_ : array of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float | array, shape = (n_targets,) + intercept_ : float or array of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : array or None, shape (n_targets,) + n_iter_ : None or array of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. @@ -903,7 +903,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, self.class_weight = class_weight def fit(self, X, y, sample_weight=None): - """Fit Ridge regression model. + """Fit Ridge classifier model. Parameters ---------- @@ -913,8 +913,9 @@ def fit(self, X, y, sample_weight=None): y : array-like of shape (n_samples,) Target values. - sample_weight : {float, array-like of shape (n_samples,)}, default=None - Sample weight. + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. .. versionadded:: 0.17 *sample_weight* support to Classifier. @@ -926,7 +927,9 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True) + X, y = check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True, + y_numeric=False) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -939,8 +942,6 @@ def fit(self, X, y, sample_weight=None): self.__class__.__name__)) if self.class_weight: - if sample_weight is None: - sample_weight = 1. # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) @@ -976,10 +977,10 @@ def _find_smallest_angle(query, vectors): Parameters ---------- - query : ndarray, shape (n_samples,) + query : ndarray of shape (n_samples,) Normalized query vector. - vectors : ndarray, shape (n_samples, n_features) + vectors : ndarray of shape (n_samples, n_features) Vectors to which we compare query, as columns. Must be normalized. """ abs_cosine = np.abs(query.dot(vectors)) @@ -1120,17 +1121,17 @@ def _compute_gram(self, X, sqrt_sw): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The preprocessed design matrix. - sqrt_sw : ndarray, shape (n_samples,) + sqrt_sw : ndarray of shape (n_samples,) square roots of sample weights Returns ------- - gram : ndarray, shape (n_samples, n_samples) + gram : ndarray of shape (n_samples, n_samples) The Gram matrix. - X_mean : ndarray, shape (n_feature,) + X_mean : ndarray of shape (n_feature,) The weighted mean of ``X`` for each feature. Notes @@ -1170,17 +1171,17 @@ def _compute_covariance(self, X, sqrt_sw): Parameters ---------- - X : sparse matrix, shape (n_samples, n_features) + X : sparse matrix of shape (n_samples, n_features) The preprocessed design matrix. 
- sqrt_sw : ndarray, shape (n_samples,) + sqrt_sw : ndarray of shape (n_samples,) square roots of sample weights Returns ------- - covariance : ndarray, shape (n_features, n_features) + covariance : ndarray of shape (n_features, n_features) The covariance matrix. - X_mean : ndarray, shape (n_feature,) + X_mean : ndarray of shape (n_feature,) The weighted mean of ``X`` for each feature. Notes @@ -1219,16 +1220,16 @@ def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): ---------- X : sparse matrix of shape (n_samples, n_features) - A : np.ndarray, shape = (n_features, n_features) + A : ndarray of shape (n_features, n_features) - X_mean : np.ndarray, shape = (n_features,) + X_mean : ndarray of shape (n_features,) - sqrt_sw : np.ndarray, shape = (n_features,) + sqrt_sw : ndarray of shape (n_features,) square roots of sample weights Returns ------- - diag : np.ndarray, shape = (n_samples,) + diag : np.ndarray, shape (n_samples,) The computed diagonal. """ intercept_col = scale = sqrt_sw @@ -1249,7 +1250,7 @@ def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): return diag def _eigen_decompose_gram(self, X, y, sqrt_sw): - """Eigendecomposition of X.X^T, used when n_samples <= n_features""" + """Eigendecomposition of X.X^T, used when n_samples <= n_features.""" # if X is dense it has already been centered in preprocessing K, X_mean = self._compute_gram(X, sqrt_sw) if self.fit_intercept: @@ -1263,7 +1264,7 @@ def _eigen_decompose_gram(self, X, y, sqrt_sw): return X_mean, eigvals, Q, QT_y def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y): - """Compute dual coefficients and diagonal of G^-1 + """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X.X^T (n_samples <= n_features). """ @@ -1329,7 +1330,7 @@ def _solve_eigen_covariance_no_intercept( def _solve_eigen_covariance_intercept( self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): - """Compute dual coefficients and diagonal of G^-1 + """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X^T.X (n_samples > n_features and X is sparse), @@ -1359,7 +1360,7 @@ def _solve_eigen_covariance_intercept( def _solve_eigen_covariance( self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): - """Compute dual coefficients and diagonal of G^-1 + """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X^T.X (n_samples > n_features and X is sparse). @@ -1386,7 +1387,7 @@ def _svd_decompose_design_matrix(self, X, y, sqrt_sw): def _solve_svd_design_matrix( self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y): - """Compute dual coefficients and diagonal of G^-1 + """Compute dual coefficients and diagonal of G^-1. Used when we have an SVD decomposition of X (n_samples > n_features and X is dense). @@ -1406,34 +1407,36 @@ def _solve_svd_design_matrix( return G_inverse_diag, c def fit(self, X, y, sample_weight=None): - """Fit Ridge regression model + """Fit Ridge regression model with gcv. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training data. Will be cast to float64 if necessary + Training data. Will be cast to float64 if necessary. y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values. Will be cast to float64 if necessary + Target values. Will be cast to float64 if necessary. - sample_weight : float or array-like of shape [n_samples] - Sample weight + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. 
If given a float, every sample + will have the same weight. Returns ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], - dtype=[np.float64], + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=[np.float64], multi_output=True, y_numeric=True) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=X.dtype) + if np.any(self.alphas <= 0): raise ValueError( "alphas must be positive. Got {} containing some " "negative or null value instead.".format(self.alphas)) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - n_samples, n_features = X.shape X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( @@ -1525,7 +1528,7 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), self.store_cv_values = store_cv_values def fit(self, X, y, sample_weight=None): - """Fit Ridge regression model + """Fit Ridge regression model with cv. Parameters ---------- @@ -1534,10 +1537,11 @@ def fit(self, X, y, sample_weight=None): if necessary. y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values. Will be cast to X's dtype if necessary + Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape [n_samples] - Sample weight + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. Returns ------- @@ -1595,7 +1599,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Parameters ---------- - alphas : numpy array of shape (n_alphas,), default=(0.1, 1.0, 10.0) + alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0) Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of @@ -1661,17 +1665,17 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array, shape = [n_samples, n_alphas] or \ - shape = [n_samples, n_targets, n_alphas], optional + cv_values_ : array of shape (n_samples, n_alphas) or \ + shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True``\ and ``cv=None``). After ``fit()`` has been called, this attribute \ will contain the mean squared errors (by default) or the values \ of the ``{loss,score}_func`` function (if provided in the constructor). - coef_ : array, shape = [n_features] or [n_targets, n_features] + coef_ : array of shape (n_features) or (n_targets, n_features) Weight vector(s). - intercept_ : float | array, shape = (n_targets,) + intercept_ : float or array of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1709,7 +1713,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Parameters ---------- - alphas : numpy array of shape (n_alphas,), default=(0.1, 1.0, 10.0) + alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0) Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of @@ -1763,19 +1767,19 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array, shape = [n_samples, n_targets, n_alphas], optional + cv_values_ : array of shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True`` and ``cv=None``). 
After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). This attribute exists only when ``store_cv_values`` is True. - coef_ : array, shape (1, n_features) or (n_targets, n_features) + coef_ : array of shape (1, n_features) or (n_targets, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float | array, shape = (n_targets,) + intercept_ : float or array of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1816,27 +1820,29 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, self.class_weight = class_weight def fit(self, X, y, sample_weight=None): - """Fit the ridge classifier. + """Fit Ridge classifier with cv. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. When using GCV, will be cast to float64 if necessary. - y : array-like, shape (n_samples,) - Target values. Will be cast to X's dtype if necessary + y : array-like of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. - sample_weight : {float, array-like of shape (n_samples,)}, default=None - Sample weight. + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. Returns ------- self : object """ - check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -1844,8 +1850,6 @@ def fit(self, X, y, sample_weight=None): y = column_or_1d(y, warn=True) if self.class_weight: - if sample_weight is None: - sample_weight = 1. # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) From 25a88b4a2b5fbcda23f7f569a1bbea15bcbbafd7 Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Fri, 15 Nov 2019 12:32:18 -0800 Subject: [PATCH 016/448] DOC docstrings validation in TfidfVectorizer (#15509) --- sklearn/feature_extraction/text.py | 127 +++++++++++++++++++---------- 1 file changed, 84 insertions(+), 43 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 1da4c96a95ea8..afc8ee4118cdc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -123,7 +123,7 @@ def strip_accents_unicode(s): s : string The string to strip - See also + See Also -------- strip_accents_ascii Remove accentuated char for any unicode symbol that has a direct @@ -150,7 +150,7 @@ def strip_accents_ascii(s): s : string The string to strip - See also + See Also -------- strip_accents_unicode Remove accentuated char for any unicode symbol. @@ -190,14 +190,19 @@ class _VectorizerMixin: _white_spaces = re.compile(r"\s\s+") def decode(self, doc): - """Decode the input into a string of unicode symbols + """Decode the input into a string of unicode symbols. The decoding strategy depends on the vectorizer parameters. 
Parameters ---------- - doc : string - The string to decode + doc : str + The string to decode. + + Returns + ------- + doc: str + A string of unicode symbols. """ if self.input == 'filename': with open(doc, 'rb') as fh: @@ -298,7 +303,13 @@ def _char_wb_ngrams(self, text_document): return ngrams def build_preprocessor(self): - """Return a function to preprocess the text before tokenization""" + """Return a function to preprocess the text before tokenization. + + Returns + ------- + preprocessor: callable + A function to preprocess the text before tokenization. + """ if self.preprocessor is not None: return self.preprocessor @@ -320,14 +331,26 @@ def build_preprocessor(self): ) def build_tokenizer(self): - """Return a function that splits a string into a sequence of tokens""" + """Return a function that splits a string into a sequence of tokens. + + Returns + ------- + tokenizer: callable + A function to split a string into a sequence of tokens. + """ if self.tokenizer is not None: return self.tokenizer token_pattern = re.compile(self.token_pattern) return token_pattern.findall def get_stop_words(self): - """Build or fetch the effective stop words list""" + """Build or fetch the effective stop words list. + + Returns + ------- + stop_words: list or None + A list of stop words. + """ return _check_stop_list(self.stop_words) def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): @@ -391,8 +414,13 @@ def _validate_custom_analyzer(self): def build_analyzer(self): """Return a callable that handles preprocessing, tokenization - and n-grams generation. + + Returns + ------- + analyzer: callable + A function to handle preprocessing, tokenization + and n-grams generation. """ if callable(self.analyzer): @@ -667,11 +695,12 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): >>> print(X.shape) (4, 16) - See also + See Also -------- CountVectorizer, TfidfVectorizer """ + def __init__(self, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -982,7 +1011,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [1 0 0 1 0 0 0 0 1 1 0 1 0] [0 0 1 0 1 0 1 0 0 0 0 0 1]] - See also + See Also -------- HashingVectorizer, TfidfVectorizer @@ -1249,6 +1278,7 @@ def inverse_transform(self, X): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document-term matrix. Returns ------- @@ -1274,7 +1304,13 @@ def inverse_transform(self, X): for i in range(n_samples)] def get_feature_names(self): - """Array mapping from feature integer indices to feature name""" + """Array mapping from feature integer indices to feature name. + + Returns + ------- + feature_names : list + A list of feature names. + """ self._check_vocabulary() @@ -1504,7 +1540,7 @@ class TfidfVectorizer(CountVectorizer): Parameters ---------- - input : string {'filename', 'file', 'content'} + input : str {'filename', 'file', 'content'} If 'filename', the sequence passed as an argument to fit is expected to be a list of filenames that need reading to fetch the raw content to analyze. @@ -1515,7 +1551,7 @@ class TfidfVectorizer(CountVectorizer): Otherwise the input is expected to be a sequence of items that can be of type string or byte. - encoding : string, 'utf-8' by default. + encoding : str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode. 
@@ -1536,7 +1572,7 @@ class TfidfVectorizer(CountVectorizer): Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. - lowercase : boolean (default=True) + lowercase : bool (default=True) Convert all characters to lowercase before tokenizing. preprocessor : callable or None (default=None) @@ -1549,7 +1585,7 @@ class TfidfVectorizer(CountVectorizer): preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``. - analyzer : string, {'word', 'char', 'char_wb'} or callable + analyzer : str, {'word', 'char', 'char_wb'} or callable Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. @@ -1563,7 +1599,7 @@ class TfidfVectorizer(CountVectorizer): first read from the file and then passed to the given callable analyzer. - stop_words : string {'english'}, list, or None (default=None) + stop_words : str {'english'}, list, or None (default=None) If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. @@ -1578,7 +1614,7 @@ class TfidfVectorizer(CountVectorizer): in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms. - token_pattern : string + token_pattern : str Regular expression denoting what constitutes a "token", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored @@ -1619,10 +1655,10 @@ class TfidfVectorizer(CountVectorizer): indices in the feature matrix, or an iterable over terms. If not given, a vocabulary is determined from the input documents. - binary : boolean (default=False) + binary : bool (default=False) If True, all non-zero term counts are set to 1. This does not mean outputs will have only 0/1 values, only that the tf term in tf-idf - is binary. (Set idf and normalization to False to get 0/1 outputs.) + is binary. (Set idf and normalization to False to get 0/1 outputs). dtype : type, optional (default=float64) Type of the matrix returned by fit_transform() or transform(). @@ -1633,17 +1669,17 @@ class TfidfVectorizer(CountVectorizer): similarity between two vectors is their dot product when l2 norm has been applied. * 'l1': Sum of absolute values of vector elements is 1. - See :func:`preprocessing.normalize` + See :func:`preprocessing.normalize`. - use_idf : boolean (default=True) + use_idf : bool (default=True) Enable inverse-document-frequency reweighting. - smooth_idf : boolean (default=True) + smooth_idf : bool (default=True) Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. - sublinear_tf : boolean (default=False) + sublinear_tf : bool (default=False) Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). Attributes @@ -1651,7 +1687,7 @@ class TfidfVectorizer(CountVectorizer): vocabulary_ : dict A mapping of terms to feature indices. - fixed_vocabulary_: boolean + fixed_vocabulary_: bool True if a fixed vocabulary of term to indices mapping is provided by the user @@ -1668,6 +1704,19 @@ class TfidfVectorizer(CountVectorizer): This is only available if no vocabulary was given. + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. 
+ + TfidfTransformer : Performs the TF-IDF transformation from a provided + matrix of counts. + + Notes + ----- + The ``stop_words_`` attribute can get large and increase the model size + when pickling. This attribute is provided only for introspection and can + be safely removed using delattr or set to None before pickling. + Examples -------- >>> from sklearn.feature_extraction.text import TfidfVectorizer @@ -1683,19 +1732,6 @@ class TfidfVectorizer(CountVectorizer): ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.shape) (4, 9) - - See also - -------- - CountVectorizer : Transforms text into a sparse matrix of n-gram counts. - - TfidfTransformer : Performs the TF-IDF transformation from a provided - matrix of counts. - - Notes - ----- - The ``stop_words_`` attribute can get large and increase the model size - when pickling. This attribute is provided only for introspection and can - be safely removed using delattr or set to None before pickling. """ def __init__(self, input='content', encoding='utf-8', @@ -1782,11 +1818,14 @@ def fit(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which yields either str, unicode or file objects. + y : None + This parameter is not needed to compute tfidf. Returns ------- - self : TfidfVectorizer + self : object + Fitted vectorizer. """ self._check_params() self._warn_for_unused_params() @@ -1803,7 +1842,9 @@ def fit_transform(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which yields either str, unicode or file objects. + y : None + This parameter is ignored. Returns ------- @@ -1826,9 +1867,9 @@ def transform(self, raw_documents, copy="deprecated"): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which yields either str, unicode or file objects. - copy : boolean, default True + copy : bool, default True Whether to copy X and operate on the copy or perform in-place operations. From ff878d60aab59dadf3bc1ee2e914fd038662a71a Mon Sep 17 00:00:00 2001 From: Paula Date: Fri, 15 Nov 2019 13:01:55 -0800 Subject: [PATCH 017/448] DOC add versionadded to OneHotEcoder, LabelEncoder (#15506) and OrdinalEncoder --- sklearn/preprocessing/_encoders.py | 4 ++++ sklearn/preprocessing/_label.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 73865d22605f8..b9c6d3adf8393 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -171,6 +171,8 @@ class OneHotEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. + .. versionchanged:: 0.20 + Parameters ---------- categories : 'auto' or a list of array-like, default='auto' @@ -551,6 +553,8 @@ class OrdinalEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. + .. versionchanged:: 0.20.1 + Parameters ---------- categories : 'auto' or a list of array-like, default='auto' diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 9fbc959969e33..c644aa919f5cf 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -176,6 +176,8 @@ class LabelEncoder(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. 
versionadded:: 0.12 + Attributes ---------- classes_ : array of shape (n_class,) From 8360786895e12a63afe07aa66ad8a3fbf438b729 Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Fri, 15 Nov 2019 16:07:36 -0800 Subject: [PATCH 018/448] ENH Allowing Gaussian process kernels on structured data (#15557) --- doc/whats_new/v0.22.rst | 9 + .../plot_gpr_on_structured_data.py | 174 ++++++++++++++++++ sklearn/base.py | 5 +- sklearn/gaussian_process/_gpc.py | 59 ++++-- sklearn/gaussian_process/_gpr.py | 36 +++- sklearn/gaussian_process/kernels.py | 152 ++++++++++----- .../tests/_mini_sequence_kernel.py | 51 +++++ sklearn/gaussian_process/tests/test_gpc.py | 17 +- sklearn/gaussian_process/tests/test_gpr.py | 23 ++- .../gaussian_process/tests/test_kernels.py | 41 +++-- 10 files changed, 480 insertions(+), 87 deletions(-) create mode 100644 examples/gaussian_process/plot_gpr_on_structured_data.py create mode 100644 sklearn/gaussian_process/tests/_mini_sequence_kernel.py diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index a7b3c027d85ba..512e0574896ea 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -426,6 +426,15 @@ Changelog estimator's constructor but not stored as attributes on the instance. :pr:`14464` by `Joel Nothman`_. +- |Feature| Gaussian process models on structured data: :class:`gaussian_process.GaussianProcessRegressor` + and :class:`gaussian_process.GaussianProcessClassifier` can now accept a list + of generic objects (e.g. strings, trees, graphs, etc.) as the ``X`` argument + to their training/prediction methods. + A user-defined kernel should be provided for computing the kernel matrix among + the generic objects, and should inherit from :class:`gaussian_process.kernels.GenericKernelMixin` + to notify the GPR/GPC model that it handles non-vectorial samples. + :pr:`15557` by :user:`Yu-Hang Tang `. + :mod:`sklearn.impute` ..................... diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py new file mode 100644 index 0000000000000..5b22c788ab3bf --- /dev/null +++ b/examples/gaussian_process/plot_gpr_on_structured_data.py @@ -0,0 +1,174 @@ +""" +========================================================================== +Gaussian processes on discrete data structures +========================================================================== + +This example illustrates the use of Gaussian processes for regression and +classification tasks on data that are not in fixed-length feature vector form. +This is achieved through the use of kernel functions that operates directly +on discrete structures such as variable-length sequences, trees, and graphs. + +Specifically, here the input variables are some gene sequences stored as +variable-length strings consisting of letters 'A', 'T', 'C', and 'G', +while the output variables are floating point numbers and True/False labels +in the regression and classification tasks, respectively. + +A kernel between the gene sequences is defined using R-convolution [1]_ by +integrating a binary letter-wise kernel over all pairs of letters among a pair +of strings. + +This example will generate three figures. + +In the first figure, we visualize the value of the kernel, i.e. the similarity +of the sequences, using a colormap. Brighter color here indicates higher +similarity. + +In the second figure, we show some regression result on a dataset of 6 +sequences. 
Here we use the 1st, 2nd, 4th, and 5th sequences as the training set +to make predictions on the 3rd and 6th sequences. + +In the third figure, we demonstrate a classification model by training on 6 +sequences and make predictions on another 5 sequences. The ground truth here is +simply whether there is at least one 'A' in the sequence. Here the model makes +four correct classifications and fails on one. + +.. [1] Haussler, D. (1999). Convolution kernels on discrete structures +(Vol. 646). Technical report, Department of Computer Science, University of +California at Santa Cruz. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.gaussian_process.kernels import Kernel, Hyperparameter +from sklearn.gaussian_process.kernels import GenericKernelMixin +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.base import clone + + +class SequenceKernel(GenericKernelMixin, Kernel): + ''' + A minimal (but valid) convolutional kernel for sequences of variable + lengths.''' + def __init__(self, + baseline_similarity=0.5, + baseline_similarity_bounds=(1e-5, 1)): + self.baseline_similarity = baseline_similarity + self.baseline_similarity_bounds = baseline_similarity_bounds + + @property + def hyperparameter_baseline_similarity(self): + return Hyperparameter("baseline_similarity", + "numeric", + self.baseline_similarity_bounds) + + def _f(self, s1, s2): + ''' + kernel value between a pair of sequences + ''' + return sum([1.0 if c1 == c2 else self.baseline_similarity + for c1 in s1 + for c2 in s2]) + + def _g(self, s1, s2): + ''' + kernel derivative between a pair of sequences + ''' + return sum([0.0 if c1 == c2 else 1.0 + for c1 in s1 + for c2 in s2]) + + def __call__(self, X, Y=None, eval_gradient=False): + if Y is None: + Y = X + + if eval_gradient: + return (np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[[self._g(x, y)] for y in Y] for x in X])) + else: + return np.array([[self._f(x, y) for y in Y] for x in X]) + + def diag(self, X): + return np.array([self._f(x, x) for x in X]) + + def is_stationary(self): + return False + + def clone_with_theta(self, theta): + cloned = clone(self) + cloned.theta = theta + return cloned + + +kernel = SequenceKernel() + +''' +Sequence similarity matrix under the kernel +=========================================== +''' + +X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA']) + +K = kernel(X) +D = kernel.diag(X) + +plt.figure(figsize=(8, 5)) +plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5))) +plt.xticks(np.arange(len(X)), X) +plt.yticks(np.arange(len(X)), X) +plt.title('Sequence similarity under the kernel') + +''' +Regression +========== +''' + +X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA']) +Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0]) + +training_idx = [0, 1, 3, 4] +gp = GaussianProcessRegressor(kernel=kernel) +gp.fit(X[training_idx], Y[training_idx]) + +plt.figure(figsize=(8, 5)) +plt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction') +plt.bar(training_idx, Y[training_idx], width=0.2, color='r', + alpha=1, label='training') +plt.xticks(np.arange(len(X)), X) +plt.title('Regression on sequences') +plt.legend() + +''' +Classification +============== +''' + +X_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT']) +# whether there are 'A's in the sequence +Y_train = np.array([True, True, True, False, False, False]) + +gp = GaussianProcessClassifier(kernel) +gp.fit(X_train, 
Y_train) + +X_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C'] +Y_test = [True, True, False, False, False] + +plt.figure(figsize=(8, 5)) +plt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train], + s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0), + label='training') +plt.scatter(len(X_train) + np.arange(len(X_test)), + [1.0 if c else -1.0 for c in Y_test], + s=100, marker='o', edgecolor='none', facecolor='r', label='truth') +plt.scatter(len(X_train) + np.arange(len(X_test)), + [1.0 if c else -1.0 for c in gp.predict(X_test)], + s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2, + label='prediction') +plt.xticks(np.arange(len(X_train) + len(X_test)), + np.concatenate((X_train, X_test))) +plt.yticks([-1, 1], [False, True]) +plt.title('Classification on sequences') +plt.legend() + +plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 7ededd7a70548..6eb2c238b0a44 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -383,8 +383,9 @@ def score(self, X, y, sample_weight=None): ---------- X : array-like of shape (n_samples, n_features) Test samples. For some estimators this may be a - precomputed kernel matrix instead, shape = (n_samples, - n_samples_fitted], where n_samples_fitted is the number of + precomputed kernel matrix or a list of generic objects instead, + shape = (n_samples, n_samples_fitted), + where n_samples_fitted is the number of samples used in the fitting for the estimator. y : array-like of shape (n_samples,) or (n_samples, n_outputs) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 072cf80dba250..dc4eb6520c0b8 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -115,8 +115,10 @@ def optimizer(obj_func, initial_theta, bounds): Attributes ---------- - X_train_ : array-like of shape (n_samples, n_features) - Feature values in training data (also required for prediction) + X_train_ : sequence of length n_samples + Feature vectors or other representations of training data (also + required for prediction). Could either be array-like with shape = + (n_samples, n_features) or a list of objects. y_train_ : array-like of shape (n_samples,) Target values in training data (also required for prediction) @@ -160,8 +162,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data + X : sequence of length n_samples + Feature vectors or other representations of training data. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. y : array-like of shape (n_samples,) Target values, must be binary @@ -248,7 +252,10 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : sequence of length n_samples + Query points where the GP is evaluated for classification. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. Returns ------- @@ -270,7 +277,10 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : sequence of length n_samples + Query points where the GP is evaluated for classification. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. Returns ------- @@ -602,8 +612,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data + X : sequence of length n_samples + Feature vectors or other representations of training data. 
+ Could either be array-like with shape = (n_samples, n_features) + or a list of objects. y : array-like of shape (n_samples,) Target values, must be binary @@ -612,7 +624,12 @@ def fit(self, X, y): ------- self : returns an instance of self. """ - X, y = check_X_y(X, y, multi_output=False) + if self.kernel is None or self.kernel.requires_vector_input: + X, y = check_X_y(X, y, multi_output=False, + ensure_2d=True, dtype="numeric") + else: + X, y = check_X_y(X, y, multi_output=False, + ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, @@ -656,7 +673,10 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : sequence of length n_samples + Query points where the GP is evaluated for classification. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. Returns ------- @@ -664,7 +684,12 @@ def predict(self, X): Predicted target values for X, values are from ``classes_`` """ check_is_fitted(self) - X = check_array(X) + + if self.kernel is None or self.kernel.requires_vector_input: + X = check_array(X, ensure_2d=True, dtype="numeric") + else: + X = check_array(X, ensure_2d=False, dtype=None) + return self.base_estimator_.predict(X) def predict_proba(self, X): @@ -672,7 +697,10 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : sequence of length n_samples + Query points where the GP is evaluated for classification. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. Returns ------- @@ -686,7 +714,12 @@ def predict_proba(self, X): raise ValueError("one_vs_one multi-class mode does not support " "predicting probability estimates. Use " "one_vs_rest mode instead.") - X = check_array(X) + + if self.kernel is None or self.kernel.requires_vector_input: + X = check_array(X, ensure_2d=True, dtype="numeric") + else: + X = check_array(X, ensure_2d=False, dtype=None) + return self.base_estimator_.predict_proba(X) @property diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index a2be69abff794..db850b3e442f8 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -114,8 +114,10 @@ def optimizer(obj_func, initial_theta, bounds): Attributes ---------- - X_train_ : array-like of shape (n_samples, n_features) - Feature values in training data (also required for prediction) + X_train_ : sequence of length n_samples + Feature vectors or other representations of training data (also + required for prediction). Could either be array-like with shape = + (n_samples, n_features) or a list of objects. y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets) Target values in training data (also required for prediction) @@ -164,8 +166,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data + X : sequence of length n_samples + Feature vectors or other representations of training data. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. 
y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values @@ -182,7 +186,12 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + if self.kernel_.requires_vector_input: + X, y = check_X_y(X, y, multi_output=True, y_numeric=True, + ensure_2d=True, dtype="numeric") + else: + X, y = check_X_y(X, y, multi_output=True, y_numeric=True, + ensure_2d=False, dtype=None) # Normalize target value if self.normalize_y: @@ -273,8 +282,10 @@ def predict(self, X, return_std=False, return_cov=False): Parameters ---------- - X : array-like of shape (n_samples, n_features) - Query points where the GP is evaluated + X : sequence of length n_samples + Query points where the GP is evaluated. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. return_std : bool, default: False If True, the standard-deviation of the predictive distribution at @@ -302,7 +313,10 @@ def predict(self, X, return_std=False, return_cov=False): "Not returning standard deviation of predictions when " "returning full covariance.") - X = check_array(X) + if self.kernel is None or self.kernel.requires_vector_input: + X = check_array(X, ensure_2d=True, dtype="numeric") + else: + X = check_array(X, ensure_2d=False, dtype=None) if not hasattr(self, "X_train_"): # Unfitted;predict based on GP prior if self.kernel is None: @@ -357,8 +371,10 @@ def sample_y(self, X, n_samples=1, random_state=0): Parameters ---------- - X : array-like of shape (n_samples_X, n_features) - Query points where the GP samples are evaluated + X : sequence of length n_samples + Query points where the GP is evaluated. + Could either be array-like with shape = (n_samples, n_features) + or a list of objects. n_samples : int, default: 1 The number of samples drawn from the Gaussian process diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index 1634113a009f3..51abe3ad24235 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -31,6 +31,7 @@ from ..metrics.pairwise import pairwise_kernels from ..base import clone +from ..utils.validation import _num_samples def _check_length_scale(X, length_scale): @@ -352,7 +353,7 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples Left argument of the returned kernel k(X, Y) Returns @@ -365,6 +366,13 @@ def diag(self, X): def is_stationary(self): """Returns whether the kernel is stationary. """ + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on fixed-length feature + vectors or generic objects. Defaults to True for backward + compatibility.""" + return True + class NormalizedKernelMixin: """Mixin for kernels which are normalized: k(X, X)=1. @@ -381,7 +389,7 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples Left argument of the returned kernel k(X, Y) Returns @@ -403,6 +411,19 @@ def is_stationary(self): return True +class GenericKernelMixin: + """Mixin for kernels which operate on generic objects such as variable- + length sequences, trees, and graphs. + + .. versionadded:: 0.22 + """ + + @property + def requires_vector_input(self): + """Whether the kernel works only on fixed-length feature vectors.""" + return False + + class CompoundKernel(Kernel): """Kernel which is composed of a set of other kernels. 
@@ -481,12 +502,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -524,6 +548,12 @@ def is_stationary(self): """Returns whether the kernel is stationary. """ return np.all([kernel.is_stationary() for kernel in self.kernels]) + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures. """ + return np.any([kernel.requires_vector_input + for kernel in self.kernels]) + def diag(self, X): """Returns the diagonal of the kernel k(X, X). @@ -533,8 +563,9 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. Returns ------- @@ -646,6 +677,12 @@ def is_stationary(self): """Returns whether the kernel is stationary. """ return self.k1.is_stationary() and self.k2.is_stationary() + @property + def requires_vector_input(self): + """Returns whether the kernel is stationary. """ + return (self.k1.requires_vector_input or + self.k2.requires_vector_input) + class Sum(KernelOperator): """Sum-kernel k1 + k2 of two kernels k1 and k2. @@ -670,12 +707,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -707,8 +747,9 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. Returns ------- @@ -744,12 +785,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. 
Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -782,8 +826,9 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. Returns ------- @@ -896,12 +941,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -935,8 +983,9 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. Returns ------- @@ -952,8 +1001,14 @@ def is_stationary(self): """Returns whether the kernel is stationary. """ return self.kernel.is_stationary() + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures. """ + return self.kernel.requires_vector_input + -class ConstantKernel(StationaryKernelMixin, Kernel): +class ConstantKernel(StationaryKernelMixin, GenericKernelMixin, + Kernel): """Constant kernel. Can be used as part of a product-kernel where it scales the magnitude of @@ -988,12 +1043,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -1009,21 +1067,20 @@ def __call__(self, X, Y=None, eval_gradient=False): hyperparameter of the kernel. Only returned when eval_gradient is True. 
""" - X = np.atleast_2d(X) if Y is None: Y = X elif eval_gradient: raise ValueError("Gradient can only be evaluated when Y is None.") - K = np.full((X.shape[0], Y.shape[0]), self.constant_value, + K = np.full((_num_samples(X), _num_samples(Y)), self.constant_value, dtype=np.array(self.constant_value).dtype) if eval_gradient: if not self.hyperparameter_constant_value.fixed: - return (K, np.full((X.shape[0], X.shape[0], 1), + return (K, np.full((_num_samples(X), _num_samples(X), 1), self.constant_value, dtype=np.array(self.constant_value).dtype)) else: - return K, np.empty((X.shape[0], X.shape[0], 0)) + return K, np.empty((_num_samples(X), _num_samples(X), 0)) else: return K @@ -1036,22 +1093,24 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. Returns ------- K_diag : array, shape (n_samples_X,) Diagonal of kernel k(X, X) """ - return np.full(X.shape[0], self.constant_value, + return np.full(_num_samples(X), self.constant_value, dtype=np.array(self.constant_value).dtype) def __repr__(self): return "{0:.3g}**2".format(np.sqrt(self.constant_value)) -class WhiteKernel(StationaryKernelMixin, Kernel): +class WhiteKernel(StationaryKernelMixin, GenericKernelMixin, + Kernel): """White kernel. The main use-case of this kernel is as part of a sum-kernel where it @@ -1085,12 +1144,15 @@ def __call__(self, X, Y=None, eval_gradient=False): Parameters ---------- - X : array, shape (n_samples_X, n_features) + X : sequence of length n_samples_X Left argument of the returned kernel k(X, Y) + Could either be array-like with shape = (n_samples_X, n_features) + or a list of objects. - Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Y : sequence of length n_samples_Y Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. + is evaluated instead. Y could either be array-like with + shape = (n_samples_Y, n_features) or a list of objects. eval_gradient : bool (optional, default=False) Determines whether the gradient with respect to the kernel @@ -1106,22 +1168,21 @@ def __call__(self, X, Y=None, eval_gradient=False): hyperparameter of the kernel. Only returned when eval_gradient is True. """ - X = np.atleast_2d(X) if Y is not None and eval_gradient: raise ValueError("Gradient can only be evaluated when Y is None.") if Y is None: - K = self.noise_level * np.eye(X.shape[0]) + K = self.noise_level * np.eye(_num_samples(X)) if eval_gradient: if not self.hyperparameter_noise_level.fixed: return (K, self.noise_level - * np.eye(X.shape[0])[:, :, np.newaxis]) + * np.eye(_num_samples(X))[:, :, np.newaxis]) else: - return K, np.empty((X.shape[0], X.shape[0], 0)) + return K, np.empty((_num_samples(X), _num_samples(X), 0)) else: return K else: - return np.zeros((X.shape[0], Y.shape[0])) + return np.zeros((_num_samples(X), _num_samples(Y))) def diag(self, X): """Returns the diagonal of the kernel k(X, X). @@ -1132,15 +1193,16 @@ def diag(self, X): Parameters ---------- - X : array, shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) + X : sequence of length n_samples_X + Argument to the kernel. Could either be array-like with + shape = (n_samples_X, n_features) or a list of objects. 
Returns ------- K_diag : array, shape (n_samples_X,) Diagonal of kernel k(X, X) """ - return np.full(X.shape[0], self.noise_level, + return np.full(_num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype) def __repr__(self): diff --git a/sklearn/gaussian_process/tests/_mini_sequence_kernel.py b/sklearn/gaussian_process/tests/_mini_sequence_kernel.py new file mode 100644 index 0000000000000..c260a361e1e71 --- /dev/null +++ b/sklearn/gaussian_process/tests/_mini_sequence_kernel.py @@ -0,0 +1,51 @@ +from sklearn.gaussian_process.kernels import Kernel, Hyperparameter +from sklearn.gaussian_process.kernels import GenericKernelMixin +from sklearn.gaussian_process.kernels import StationaryKernelMixin +import numpy as np +from sklearn.base import clone + + +class MiniSeqKernel(GenericKernelMixin, + StationaryKernelMixin, + Kernel): + ''' + A minimal (but valid) convolutional kernel for sequences of variable + length. + ''' + def __init__(self, + baseline_similarity=0.5, + baseline_similarity_bounds=(1e-5, 1)): + self.baseline_similarity = baseline_similarity + self.baseline_similarity_bounds = baseline_similarity_bounds + + @property + def hyperparameter_baseline_similarity(self): + return Hyperparameter("baseline_similarity", + "numeric", + self.baseline_similarity_bounds) + + def _f(self, s1, s2): + return sum([1.0 if c1 == c2 else self.baseline_similarity + for c1 in s1 + for c2 in s2]) + + def _g(self, s1, s2): + return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2]) + + def __call__(self, X, Y=None, eval_gradient=False): + if Y is None: + Y = X + + if eval_gradient: + return (np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[[self._g(x, y)] for y in Y] for x in X])) + else: + return np.array([[self._f(x, y) for y in Y] for x in X]) + + def diag(self, X): + return np.array([self._f(x, x) for x in X]) + + def clone_with_theta(self, theta): + cloned = clone(self) + cloned.theta = theta + return cloned diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index aec5cc147223f..72d550231f4ea 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -11,12 +11,15 @@ from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel from sklearn.utils._testing import assert_almost_equal, assert_array_equal def f(x): return np.sin(x) + + X = np.atleast_2d(np.linspace(0, 10, 30)).T X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T y = np.array(f(X).ravel() > 0, dtype=int) @@ -44,12 +47,22 @@ def test_predict_consistent(kernel): gpc.predict_proba(X)[:, 1] >= 0.5) +def test_predict_consistent_structured(): + # Check binary predict decision has also predicted probability above 0.5. + X = ['A', 'AB', 'B'] + y = np.array([True, False, True]) + kernel = MiniSeqKernel(baseline_similarity_bounds='fixed') + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), + gpc.predict_proba(X)[:, 1] >= 0.5) + + @pytest.mark.parametrize('kernel', non_fixed_kernels) def test_lml_improving(kernel): # Test that hyperparameter-tuning improves log-marginal likelihood. 
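(Illustrative aside, not part of the patch series.) The ``MiniSeqKernel`` test helper defined above derives from the new ``GenericKernelMixin``, so the Gaussian process estimators can now be fitted directly on variable-length sequences instead of a 2-d feature array. A minimal sketch mirroring the structured-data tests added in this series:

    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel

    kernel = MiniSeqKernel(baseline_similarity_bounds='fixed')
    X = ['A', 'B', 'C']        # raw strings, not fixed-length feature vectors
    y = [1.0, 2.0, 3.0]
    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
    gpr.predict(['B', 'C'])    # approximately [2., 3.]: the GP interpolates training points

The same machinery drives the new ``requires_vector_input`` property: ``WhiteKernel`` and ``ConstantKernel`` now report ``False``, a plain ``RBF`` reports ``True``, and a ``Sum``, ``Product`` or ``Exponentiation`` requires vectors as soon as any of its operands does.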
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) assert (gpc.log_marginal_likelihood(gpc.kernel_.theta) > - gpc.log_marginal_likelihood(kernel.theta)) + gpc.log_marginal_likelihood(kernel.theta)) @pytest.mark.parametrize('kernel', kernels) @@ -139,7 +152,7 @@ def optimizer(obj_func, initial_theta, bounds): gpc.fit(X, y_mc) # Checks that optimizer improved marginal likelihood assert (gpc.log_marginal_likelihood(gpc.kernel_.theta) > - gpc.log_marginal_likelihood(kernel.theta)) + gpc.log_marginal_likelihood(kernel.theta)) @pytest.mark.parametrize('kernel', kernels) diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 64b177ce17c48..eb4bc6dec1761 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -13,6 +13,7 @@ from sklearn.gaussian_process.kernels \ import RBF, ConstantKernel as C, WhiteKernel from sklearn.gaussian_process.kernels import DotProduct +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel from sklearn.utils._testing \ import (assert_array_less, @@ -53,12 +54,26 @@ def test_gpr_interpolation(kernel): assert_almost_equal(np.diag(y_cov), 0.) +def test_gpr_interpolation_structured(): + # Test the interpolating property for different kernels. + kernel = MiniSeqKernel(baseline_similarity_bounds='fixed') + X = ['A', 'B', 'C'] + y = np.array([1, 2, 3]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(kernel(X, eval_gradient=True)[1].ravel(), + (1 - np.eye(len(X))).ravel()) + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.) + + @pytest.mark.parametrize('kernel', non_fixed_kernels) def test_lml_improving(kernel): # Test that hyperparameter-tuning improves log-marginal likelihood. gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) assert (gpr.log_marginal_likelihood(gpr.kernel_.theta) > - gpr.log_marginal_likelihood(kernel.theta)) + gpr.log_marginal_likelihood(kernel.theta)) @pytest.mark.parametrize('kernel', kernels) @@ -66,7 +81,7 @@ def test_lml_precomputed(kernel): # Test that lml of optimized kernel is stored correctly. 
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) assert (gpr.log_marginal_likelihood(gpr.kernel_.theta) == - gpr.log_marginal_likelihood()) + gpr.log_marginal_likelihood()) @pytest.mark.parametrize('kernel', kernels) @@ -179,7 +194,7 @@ def test_anisotropic_kernel(): kernel = RBF([1.0, 1.0]) gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) assert (np.exp(gpr.kernel_.theta[1]) > - np.exp(gpr.kernel_.theta[0]) * 5) + np.exp(gpr.kernel_.theta[0]) * 5) def test_random_starts(): @@ -297,7 +312,7 @@ def optimizer(obj_func, initial_theta, bounds): gpr.fit(X, y) # Checks that optimizer improved marginal likelihood assert (gpr.log_marginal_likelihood(gpr.kernel_.theta) > - gpr.log_marginal_likelihood(gpr.kernel.theta)) + gpr.log_marginal_likelihood(gpr.kernel.theta)) def test_gpr_correct_error_message(): diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index ecb636d13103b..6aaadd48ef317 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -14,22 +14,22 @@ from sklearn.gaussian_process.kernels \ import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator, - Exponentiation, Kernel) + Exponentiation, Kernel, CompoundKernel) from sklearn.base import clone from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, - assert_array_almost_equal, - assert_raise_message) + assert_array_almost_equal, + assert_raise_message) X = np.random.RandomState(0).normal(0, 1, (5, 2)) Y = np.random.RandomState(0).normal(0, 1, (6, 2)) -kernel_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0) +kernel_rbf_plus_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0) kernels = [RBF(length_scale=2.0), RBF(length_scale_bounds=(0.5, 2.0)), ConstantKernel(constant_value=10.0), 2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"), - 2.0 * RBF(length_scale=0.5), kernel_white, + 2.0 * RBF(length_scale=0.5), kernel_rbf_plus_white, 2.0 * RBF(length_scale=[0.5, 2.0]), 2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"), 2.0 * Matern(length_scale=0.5, nu=0.5), @@ -92,8 +92,7 @@ def test_kernel_theta(kernel): # Check that values returned in theta are consistent with # hyperparameter values (being their logarithms) for i, hyperparameter in enumerate(kernel.hyperparameters): - assert (theta[i] == - np.log(getattr(kernel, hyperparameter.name))) + assert (theta[i] == np.log(getattr(kernel, hyperparameter.name))) # Fixed kernel parameters must be excluded from theta and gradient. for i, hyperparameter in enumerate(kernel.hyperparameters): @@ -129,7 +128,7 @@ def test_kernel_theta(kernel): @pytest.mark.parametrize('kernel', [kernel for kernel in kernels # Identity is not satisfied on diagonal - if kernel != kernel_white]) + if kernel != kernel_rbf_plus_white]) def test_auto_vs_cross(kernel): # Auto-correlation and cross-correlation should be consistent. 
K_auto = kernel(X) @@ -186,6 +185,27 @@ def test_kernel_stationary(kernel): assert_almost_equal(K[0, 0], np.diag(K)) +@pytest.mark.parametrize('kernel', kernels) +def test_kernel_input_type(kernel): + # Test whether kernels is for vectors or structured data + if isinstance(kernel, Exponentiation): + assert(kernel.requires_vector_input == + kernel.kernel.requires_vector_input) + if isinstance(kernel, KernelOperator): + assert(kernel.requires_vector_input == + (kernel.k1.requires_vector_input or + kernel.k2.requires_vector_input)) + + +def test_compound_kernel_input_type(): + kernel = CompoundKernel([WhiteKernel(noise_level=3.0)]) + assert not kernel.requires_vector_input + + kernel = CompoundKernel([WhiteKernel(noise_level=3.0), + RBF(length_scale=2.0)]) + assert kernel.requires_vector_input + + def check_hyperparameters_equal(kernel1, kernel2): # Check that hyperparameters of two kernels are equal for attr in set(dir(kernel1) + dir(kernel2)): @@ -236,8 +256,7 @@ def test_kernel_clone_after_set_params(kernel): params['length_scale_bounds'] = bounds * 2 kernel_cloned.set_params(**params) kernel_cloned_clone = clone(kernel_cloned) - assert (kernel_cloned_clone.get_params() == - kernel_cloned.get_params()) + assert (kernel_cloned_clone.get_params() == kernel_cloned.get_params()) assert id(kernel_cloned_clone) != id(kernel_cloned) check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone) @@ -266,7 +285,7 @@ def test_kernel_versus_pairwise(kernel): # Check that GP kernels can also be used as pairwise kernels. # Test auto-kernel - if kernel != kernel_white: + if kernel != kernel_rbf_plus_white: # For WhiteKernel: k(X) != k(X,X). This is assumed by # pairwise_kernels K1 = kernel(X) From cd81de1ccc9773af119dc290c25cef34029a7c47 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 16 Nov 2019 13:17:56 +0100 Subject: [PATCH 019/448] MAINT Improve method detection in numpydoc validation script (#15532) --- maint_tools/test_docstrings.py | 36 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index 6e1376fcab040..c19edd3a26c4c 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -44,15 +44,22 @@ def get_all_methods(): estimators = all_estimators() - for name, estimator in estimators: + for name, Estimator in estimators: if name.startswith("_"): # skip private classes continue - methods = [el for el in dir(estimator) if not el.startswith("_")] + methods = [] + for name in dir(Estimator): + if name.startswith("_"): + continue + method_obj = getattr(Estimator, name) + if (hasattr(method_obj, '__call__') + or isinstance(method_obj, property)): + methods.append(name) methods.append(None) for method in sorted(methods, key=lambda x: str(x)): - yield estimator, method + yield Estimator, method def filter_errors(errors, method): @@ -112,7 +119,16 @@ def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: raise NotImplementedError if estimator is not None: - obj_signature = signature(getattr(estimator, method)) + obj = getattr(estimator, method) + try: + obj_signature = signature(obj) + except TypeError: + # In particular we can't parse the signature of properties + obj_signature = ( + "\nParsing of the method signature failed, " + "possibly because this is a property." + ) + obj_name = estimator.__name__ + "." 
+ method else: obj_signature = "" @@ -120,7 +136,7 @@ def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: msg = "\n\n" + "\n\n".join( [ - res["file"], + str(res["file"]), obj_name + str(obj_signature), res["docstring"], "# Errors", @@ -133,10 +149,10 @@ def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: return msg -@pytest.mark.parametrize("estimator, method", get_all_methods()) -def test_docstring(estimator, method, request): - base_import_path = estimator.__module__ - import_path = [base_import_path, estimator.__name__] +@pytest.mark.parametrize("Estimator, method", get_all_methods()) +def test_docstring(Estimator, method, request): + base_import_path = Estimator.__module__ + import_path = [base_import_path, Estimator.__name__] if method is not None: import_path.append(method) @@ -154,7 +170,7 @@ def test_docstring(estimator, method, request): res["errors"] = list(filter_errors(res["errors"], method)) if res["errors"]: - msg = repr_errors(res, estimator, method) + msg = repr_errors(res, Estimator, method) raise ValueError(msg) From d85d1e9ab12e4508e85c4f938c652b9f71cbe242 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 16 Nov 2019 12:03:10 -0800 Subject: [PATCH 020/448] ENH Sets xlabel only if it is not defined in PDP (#15610) --- sklearn/inspection/_partial_dependence.py | 5 ++++- .../tests/test_plot_partial_dependence.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 785195e2fd767..be26223633841 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -933,9 +933,12 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): ylim = axi.get_ylim() axi.vlines(self.deciles[fx[0]], 0, 0.05, transform=trans, color='k') - axi.set_xlabel(self.feature_names[fx[0]]) axi.set_ylim(ylim) + # Set xlabel if it is not already set + if not axi.get_xlabel(): + axi.set_xlabel(self.feature_names[fx[0]]) + if len(values) == 1: if n_cols is None or i % n_cols == 0: axi.set_ylabel('Partial dependence') diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 71f9ec078f655..fd2342d1b9b05 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -256,6 +256,24 @@ def test_plot_partial_dependence_with_same_axes(pyplot, clf_boston, boston): feature_names=boston.feature_names, ax=ax) +def test_plot_partial_dependence_feature_name_reuse(pyplot, clf_boston, + boston): + # second call to plot does not change the feature names from the first + # call + + feature_names = boston.feature_names + disp = plot_partial_dependence(clf_boston, boston.data, + [0, 1], + grid_resolution=10, + feature_names=feature_names) + + plot_partial_dependence(clf_boston, boston.data, [0, 1], + grid_resolution=10, ax=disp.axes_) + + for i, ax in enumerate(disp.axes_.ravel()): + assert ax.get_xlabel() == feature_names[i] + + def test_plot_partial_dependence_multiclass(pyplot): grid_resolution = 25 clf_int = GradientBoostingClassifier(n_estimators=10, random_state=1) From 5483b0ecd1be727415607d9fe31dc95ffa3698eb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Nov 2019 21:13:05 -0500 Subject: [PATCH 021/448] DOC Use anonymous reference in readme for rst standard to be happy (#15643) --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/README.rst b/README.rst index 6e491557ab541..98c39b4482fd2 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,7 @@ SciPy and is distributed under the 3-Clause BSD license. The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `_ page +the `About us `__ page for a list of core contributors. It is currently maintained by a team of volunteers. @@ -138,7 +138,7 @@ Project History The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `_ page +the `About us `__ page for a list of core contributors. The project is currently maintained by a team of volunteers. From ee09095cea3a49c639047b502675fda1439e02b9 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Sun, 17 Nov 2019 05:51:20 +0100 Subject: [PATCH 022/448] DOC minor fixes in whats_new for the release (#15632) --- doc/whats_new/v0.22.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 512e0574896ea..8f4dd76c08677 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -7,7 +7,7 @@ Version 0.22.0 ============== -**In Development** +**November 29 2019** For a short description of the main highlights of the release, please refer to @@ -516,20 +516,20 @@ Changelog - |Fix| :class:`linear_model.LassoCV` no longer forces ``precompute=False`` when fitting the final model. :pr:`14591` by `Andreas Müller`_. -- |FIX| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` +- |Fix| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` now correctly scores when `cv=None`. :pr:`14864` by :user:`Venkatachalam N `. -- |FIX| Fixed a bug in :class:`linear_model.LogisticRegressionCV` where the +- |Fix| Fixed a bug in :class:`linear_model.LogisticRegressionCV` where the ``scores_``, ``n_iter_`` and ``coefs_paths_`` attribute would have a wrong ordering with ``penalty='elastic-net'``. :pr:`15044` by `Nicolas Hug`_ -- |FIX| :class:`linear_model.MultiTaskLassoCV` and +- |Fix| :class:`linear_model.MultiTaskLassoCV` and :class:`linear_model.MultiTaskElasticNetCV` with X of dtype int and `fit_intercept=True`. :pr:`15086` by :user:`Alex Gramfort `. -- |FIX| The liblinear solver now supports ``sample_weight``. +- |Fix| The liblinear solver now supports ``sample_weight``. :pr:`15038` by :user:`Guillaume Lemaitre `. :mod:`sklearn.manifold` @@ -804,7 +804,7 @@ Changelog - |Fix| fixed a bug in :class:`BaseLibSVM._sparse_fit` where n_SV=0 raised a ZeroDivisionError. :pr:`14894` by :user:`Danna Naser `. -- |FIX| The liblinear solver now supports ``sample_weight``. +- |Fix| The liblinear solver now supports ``sample_weight``. :pr:`15038` by :user:`Guillaume Lemaitre `. 
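(Illustrative aside, not part of the patch series.) The last changelog entries above note that the liblinear solver now supports ``sample_weight``; in user code the fix amounts to per-sample weights being honoured when they are passed to ``fit``:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    weights = np.array([1.0, 1.0, 3.0, 3.0])    # emphasise the positive examples

    clf = LogisticRegression(solver='liblinear')
    clf.fit(X, y, sample_weight=weights)        # weights now taken into account by liblinear

The toy data and weights here are invented for illustration; only the ``sample_weight`` keyword itself comes from the changelog entry.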
From d84785c5e1a1bc43407ec14613398b1316133062 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 16 Nov 2019 22:10:14 -0800 Subject: [PATCH 023/448] [MRG] ENH: Add scaling to convergence warning for LBFGS (#15571) --- sklearn/linear_model/tests/test_logistic.py | 9 +++++++-- sklearn/utils/optimize.py | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 894040c2053bd..a98bb642e9be0 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -390,8 +390,13 @@ def test_logistic_regression_path_convergence_fail(): X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) y = [1] * 100 + [-1] * 100 Cs = [1e3] - assert_warns(ConvergenceWarning, _logistic_regression_path, - X, y, Cs=Cs, tol=0., max_iter=1, random_state=0, verbose=1) + + msg = (r"lbfgs failed to converge.+Increase the number of iterations or " + r"scale the data") + + with pytest.warns(ConvergenceWarning, match=msg): + _logistic_regression_path( + X, y, Cs=Cs, tol=0., max_iter=1, random_state=0, verbose=0) def test_liblinear_dual_random_state(): diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index f83d2ffc375ae..b03034e233617 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -234,7 +234,9 @@ def _check_optimize_result(solver, result, max_iter=None): if solver == "lbfgs": if result.status != 0: warnings.warn("{} failed to converge (status={}): {}. " - "Increase the number of iterations." + "Increase the number of iterations or scale the " + "data as shown in https://scikit-learn.org/stable/" + "modules/preprocessing.html" .format(solver, result.status, result.message), ConvergenceWarning, stacklevel=2) if max_iter is not None: From 93661c7fdda1bc26197f9028655859393adea504 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Sun, 17 Nov 2019 02:17:13 -0700 Subject: [PATCH 024/448] included default values in SparseCoder class doc (#15600) --- sklearn/decomposition/_dict_learning.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 0eb69f5b5a74c..29839157ca33f 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -952,7 +952,7 @@ class SparseCoder(SparseCodingMixin, BaseEstimator): normalized to unit norm. transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ - 'threshold'} + 'threshold'}, default='omp' Algorithm used to transform the data: lars: uses the least angle regression method (linear_model.lars_path) lasso_lars: uses Lars to compute the Lasso solution @@ -963,12 +963,12 @@ class SparseCoder(SparseCodingMixin, BaseEstimator): threshold: squashes to zero all coefficients less than alpha from the projection ``dictionary * X'`` - transform_n_nonzero_coefs : int, ``0.1 * n_features`` by default + transform_n_nonzero_coefs : int, default=0.1*n_features Number of nonzero coefficients to target in each column of the solution. This is only used by `algorithm='lars'` and `algorithm='omp'` and is overridden by `alpha` in the `omp` case. - transform_alpha : float, 1. by default + transform_alpha : float, default=1. If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the penalty applied to the L1 norm. 
If `algorithm='threshold'`, `alpha` is the absolute value of the @@ -977,23 +977,23 @@ class SparseCoder(SparseCodingMixin, BaseEstimator): the reconstruction error targeted. In this case, it overrides `n_nonzero_coefs`. - split_sign : bool, False by default + split_sign : bool, default=False Whether to split the sparse feature vector into the concatenation of its negative part and its positive part. This can improve the performance of downstream classifiers. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None Number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - positive_code : bool + positive_code : bool, default=False Whether to enforce positivity when finding the code. .. versionadded:: 0.20 - transform_max_iter : int, optional (default=1000) + transform_max_iter : int, default=1000 Maximum number of iterations to perform if `algorithm='lasso_cd'` or `lasso_lars`. From 54e78b7b07691729d5414f0c8b800f94519f15a8 Mon Sep 17 00:00:00 2001 From: Abbie Popa Date: Sun, 17 Nov 2019 02:25:54 -0800 Subject: [PATCH 025/448] DOC Birch numpydoc docstring validation (#15511) --- sklearn/cluster/_birch.py | 51 ++++++++++++++++++---------- sklearn/ensemble/_weight_boosting.py | 8 ++--- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 6ac9ec334a734..349ec19c6ff9c 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -388,25 +388,12 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): if partial_fit is used instead of fit, they are assigned to the last batch of data. - Examples + See Also -------- - >>> from sklearn.cluster import Birch - >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]] - >>> brc = Birch(n_clusters=None) - >>> brc.fit(X) - Birch(n_clusters=None) - >>> brc.predict(X) - array([0, 0, 0, 1, 1, 1]) - References - ---------- - * Tian Zhang, Raghu Ramakrishnan, Maron Livny - BIRCH: An efficient data clustering method for large databases. - https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf - - * Roberto Perdisci - JBirch - Java implementation of BIRCH clustering algorithm - https://code.google.com/archive/p/jbirch + MiniBatchKMeans + Alternative implementation that does incremental updates + of the centers' positions using mini-batches. Notes ----- @@ -421,6 +408,26 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): to it and the linear sum, squared sum and the number of samples of that subcluster are updated. This is done recursively till the properties of the leaf node are updated. + + References + ---------- + * Tian Zhang, Raghu Ramakrishnan, Maron Livny + BIRCH: An efficient data clustering method for large databases. + https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf + + * Roberto Perdisci + JBirch - Java implementation of BIRCH clustering algorithm + https://code.google.com/archive/p/jbirch + + Examples + -------- + >>> from sklearn.cluster import Birch + >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]] + >>> brc = Birch(n_clusters=None) + >>> brc.fit(X) + Birch(n_clusters=None) + >>> brc.predict(X) + array([0, 0, 0, 1, 1, 1]) """ def __init__(self, threshold=0.5, branching_factor=50, n_clusters=3, @@ -441,7 +448,12 @@ def fit(self, X, y=None): Input data. y : Ignored + Not used, present here for API consistency by convention. 
+ Returns + ------- + self + Fitted estimator. """ self.fit_, self.partial_fit_ = True, False return self._fit(X) @@ -524,7 +536,12 @@ def partial_fit(self, X=None, y=None): step is done. y : Ignored + Not used, present here for API consistency by convention. + Returns + ------- + self + Fitted estimator. """ self.partial_fit_, self.fit_ = True, False if X is None: diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index d76e29fb37239..ebd9635cd52b3 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -224,7 +224,7 @@ def staged_score(self, X, y, sample_weight=None): Sample weights. Yields - ------- + ------ z : float """ X = self._validate_data(X) @@ -428,7 +428,7 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object - A fitted estimator. + Fitted estimator. """ # Check that algorithm is supported if self.algorithm not in ('SAMME', 'SAMME.R'): @@ -648,7 +648,7 @@ def staged_predict(self, X): DOK, or LIL. COO, DOK, and LIL are converted to CSR. Yields - ------- + ------ y : generator of array, shape = [n_samples] The predicted classes. """ @@ -719,7 +719,7 @@ def staged_decision_function(self, X): DOK, or LIL. COO, DOK, and LIL are converted to CSR. Yields - ------- + ------ score : generator of array, shape = [n_samples, k] The decision function of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. From 98cb91b545f9d630b2d37de996cccbe35a33695b Mon Sep 17 00:00:00 2001 From: "m.fab" Date: Sun, 17 Nov 2019 10:29:38 -0800 Subject: [PATCH 026/448] DOC numpydoc validation for VotingClassifier (#15497) --- maint_tools/test_docstrings.py | 9 +++++++++ sklearn/base.py | 7 ++++++- sklearn/ensemble/_base.py | 21 +++++++++++---------- sklearn/ensemble/_voting.py | 28 +++++++++++++--------------- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index c19edd3a26c4c..9c6c41f0b53dd 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -39,6 +39,15 @@ "SGDClassifier.score", "SGDClassifier.sparsify", "SGDClassifier.densify", + "VotingClassifier.fit", + "VotingClassifier.transform", + "VotingClassifier.predict", + "VotingClassifier.score", + "VotingClassifier.predict_proba", + "VotingClassifier.set_params", + "VotingClassifier.get_params", + "VotingClassifier.named_estimators", + "VotingClassifier$", ] diff --git a/sklearn/base.py b/sklearn/base.py index 6eb2c238b0a44..050bb4e2a522b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1,4 +1,8 @@ -"""Base classes for all estimators.""" +""" +Base classes for all estimators. + +Used for VotingClassifier +""" # Author: Gael Varoquaux # License: BSD 3 clause @@ -334,6 +338,7 @@ def _get_tags(self): class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" + _estimator_type = "classifier" def score(self, X, y, sample_weight=None): diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 7f9036785b1a3..5db30b9bbc600 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -1,6 +1,4 @@ -""" -Base class for ensemble-based estimators. 
-""" +"""Base class for ensemble-based estimators.""" # Authors: Gilles Louppe # License: BSD 3 clause @@ -42,14 +40,13 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None): def _set_random_states(estimator, random_state=None): - """Sets fixed random_state parameters for an estimator + """Set fixed random_state parameters for an estimator. Finds all parameters ending ``random_state`` and sets them to integers derived from ``random_state``. Parameters ---------- - estimator : estimator supporting get/set_params Estimator with potential randomness managed by random_state parameters. @@ -106,6 +103,7 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): estimators_ : list of estimators The collection of fitted base estimators. """ + # overwrite _required_parameters from MetaEstimatorMixin _required_parameters = [] @@ -122,8 +120,10 @@ def __init__(self, base_estimator, n_estimators=10, # self.estimators_ needs to be filled by the derived classes in fit. def _validate_estimator(self, default=None): - """Check the estimator and the n_estimator attribute, set the - `base_estimator_` attribute.""" + """Check the estimator and the n_estimator attribute. + + Sets the base_estimator_` attributes. + """ if not isinstance(self.n_estimators, numbers.Integral): raise ValueError("n_estimators must be an integer, " "got {0}.".format(type(self.n_estimators))) @@ -159,15 +159,15 @@ def _make_estimator(self, append=True, random_state=None): return estimator def __len__(self): - """Returns the number of estimators in the ensemble.""" + """Return the number of estimators in the ensemble.""" return len(self.estimators_) def __getitem__(self, index): - """Returns the index'th estimator in the ensemble.""" + """Return the index'th estimator in the ensemble.""" return self.estimators_[index] def __iter__(self): - """Returns iterator over estimators in the ensemble.""" + """Return iterator over estimators in the ensemble.""" return iter(self.estimators_) @@ -204,6 +204,7 @@ class _BaseHeterogeneousEnsemble(MetaEstimatorMixin, _BaseComposition, training data. If an estimator has been set to `'drop'`, it will not appear in `estimators_`. """ + _required_parameters = ['estimators'] @property diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index bddf5f00b10af..3d6d8016cf6ed 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -41,21 +41,19 @@ class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @property def _weights_not_none(self): - """Get the weights of not `None` estimators""" + """Get the weights of not `None` estimators.""" if self.weights is None: return None return [w for est, w in zip(self.estimators, self.weights) if est[1] not in (None, 'drop')] def _predict(self, X): - """Collect results from clf.predict calls. """ + """Collect results from clf.predict calls.""" return np.asarray([est.predict(X) for est in self.estimators_]).T @abstractmethod def fit(self, X, y, sample_weight=None): - """ - common fit operations. - """ + """Get common fit operations.""" names, clfs = self._validate_estimators() if (self.weights is not None and @@ -90,7 +88,7 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): Parameters ---------- - estimators : list of (string, estimator) tuples + estimators : list of (str, estimator) tuples Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones of those original estimators that will be stored in the class attribute ``self.estimators_``. 
An estimator can be set to ``'drop'`` @@ -138,6 +136,10 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): classes_ : array-like, shape (n_predictions,) The classes labels. + See Also + -------- + VotingRegressor: Prediction voting regressor. + Examples -------- >>> import numpy as np @@ -172,10 +174,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): [1 1 1 2 2 2] >>> print(eclf3.transform(X).shape) (6, 6) - - See also - -------- - VotingRegressor: Prediction voting regressor. """ def __init__(self, estimators, voting='hard', weights=None, n_jobs=None, @@ -187,7 +185,7 @@ def __init__(self, estimators, voting='hard', weights=None, n_jobs=None, self.flatten_transform = flatten_transform def fit(self, X, y, sample_weight=None): - """ Fit the estimators. + """Fit the estimators. Parameters ---------- @@ -206,6 +204,7 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + """ check_classification_targets(y) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: @@ -223,7 +222,7 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, transformed_y, sample_weight) def predict(self, X): - """ Predict class labels for X. + """Predict class labels for X. Parameters ---------- @@ -235,7 +234,6 @@ def predict(self, X): maj : array-like, shape (n_samples,) Predicted class labels. """ - check_is_fitted(self) if self.voting == 'soft': maj = np.argmax(self.predict_proba(X), axis=1) @@ -252,11 +250,11 @@ def predict(self, X): return maj def _collect_probas(self, X): - """Collect results from clf.predict calls. """ + """Collect results from clf.predict calls.""" return np.asarray([clf.predict_proba(X) for clf in self.estimators_]) def _predict_proba(self, X): - """Predict class probabilities for X in 'soft' voting """ + """Predict class probabilities for X in 'soft' voting.""" check_is_fitted(self) avg = np.average(self._collect_probas(X), axis=0, weights=self._weights_not_none) From c311efd501538ceccec0d3def70f601331e090ca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Nov 2019 13:39:43 +0100 Subject: [PATCH 027/448] FIX accept pandas Index for feature_names in PDP (#15639) --- sklearn/inspection/_partial_dependence.py | 3 +- .../tests/test_plot_partial_dependence.py | 20 +++++++++---- sklearn/utils/_testing.py | 22 ++++++++++++++ sklearn/utils/tests/test_testing.py | 20 ++++++++++++- sklearn/utils/tests/test_utils.py | 29 ++++--------------- 5 files changed, 64 insertions(+), 30 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index be26223633841..40a7f073ca818 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -623,7 +623,8 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, else: # define a list of numbered indices for a numpy array feature_names = [str(i) for i in range(n_features)] - elif isinstance(feature_names, np.ndarray): + elif hasattr(feature_names, "tolist"): + # convert numpy array or pandas index to a list feature_names = feature_names.tolist() if len(set(feature_names)) != len(feature_names): raise ValueError('feature_names should not contain duplicates.') diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index fd2342d1b9b05..6a819aeeea157 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -10,6 +10,8 @@ from 
sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import GradientBoostingClassifier from sklearn.linear_model import LinearRegression +from sklearn.utils._testing import _convert_container + from sklearn.inspection import plot_partial_dependence @@ -86,12 +88,15 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_boston, boston): @pytest.mark.parametrize( - "input_type, use_feature_names", - [('dataframe', False), ('dataframe', True), - ('list', True), ('array', True)] + "input_type, feature_names_type", + [('dataframe', None), + ('dataframe', 'list'), ('list', 'list'), ('array', 'list'), + ('dataframe', 'array'), ('list', 'array'), ('array', 'array'), + ('dataframe', 'series'), ('list', 'series'), ('array', 'series'), + ('dataframe', 'index'), ('list', 'index'), ('array', 'index')] ) def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston, - input_type, use_feature_names): + input_type, feature_names_type): if input_type == 'dataframe': pd = pytest.importorskip("pandas") X = pd.DataFrame(boston.data, columns=boston.feature_names) @@ -99,7 +104,12 @@ def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston, X = boston.data.tolist() else: X = boston.data - feature_names = boston.feature_names if use_feature_names else None + + if feature_names_type is None: + feature_names = None + else: + feature_names = _convert_container(boston.feature_names, + feature_names_type) grid_resolution = 25 # check with str features and array feature names and single column diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 4aaf7c1fd388c..806f302b78288 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -912,3 +912,25 @@ def assert_run_python_script(source_code, timeout=60): % e.output.decode('utf-8')) finally: os.unlink(source_file) + + +def _convert_container(container, constructor_name, columns_name=None): + if constructor_name == 'list': + return list(container) + elif constructor_name == 'tuple': + return tuple(container) + elif constructor_name == 'array': + return np.asarray(container) + elif constructor_name == 'sparse': + return sp.sparse.csr_matrix(container) + elif constructor_name == 'dataframe': + pd = pytest.importorskip('pandas') + return pd.DataFrame(container, columns=columns_name) + elif constructor_name == 'series': + pd = pytest.importorskip('pandas') + return pd.Series(container) + elif constructor_name == 'index': + pd = pytest.importorskip('pandas') + return pd.Index(container) + elif constructor_name == 'slice': + return slice(container[0], container[1]) diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 64001bef9348b..46d75ede2cace 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -32,7 +32,8 @@ assert_raises_regex, TempMemmap, create_memmap_backed_data, - _delete_folder) + _delete_folder, + _convert_container) from sklearn.utils._testing import SkipTest from sklearn.tree import DecisionTreeClassifier @@ -674,3 +675,20 @@ def test_deprecated_helpers(callable, args): '0.24. 
Please use "assert" instead') with pytest.warns(FutureWarning, match=msg): callable(*args) + + +@pytest.mark.parametrize( + "constructor_name, container_type", + [('list', list), + ('tuple', tuple), + ('array', np.ndarray), + ('sparse', sparse.csr_matrix), + ('dataframe', pytest.importorskip('pandas').DataFrame), + ('series', pytest.importorskip('pandas').Series), + ('index', pytest.importorskip('pandas').Index), + ('slice', slice)] +) +def test_convert_container(constructor_name, container_type): + container = [0, 1] + assert isinstance(_convert_container(container, constructor_name), + container_type) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 2cf1e59a73f29..8031245105571 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -9,10 +9,12 @@ import scipy.sparse as sp from sklearn.utils._testing import (assert_raises, - assert_array_equal, - assert_allclose_dense_sparse, - assert_raises_regex, - assert_warns_message, assert_no_warnings) + assert_array_equal, + assert_allclose_dense_sparse, + assert_raises_regex, + assert_warns_message, + assert_no_warnings, + _convert_container) from sklearn.utils import check_random_state from sklearn.utils import _determine_key_type from sklearn.utils import deprecated @@ -236,25 +238,6 @@ def test_determine_key_type_slice_error(): _determine_key_type(slice(0, 2, 1), accept_slice=False) -def _convert_container(container, constructor_name, columns_name=None): - if constructor_name == 'list': - return list(container) - elif constructor_name == 'tuple': - return tuple(container) - elif constructor_name == 'array': - return np.asarray(container) - elif constructor_name == 'sparse': - return sp.csr_matrix(container) - elif constructor_name == 'dataframe': - pd = pytest.importorskip('pandas') - return pd.DataFrame(container, columns=columns_name) - elif constructor_name == 'series': - pd = pytest.importorskip('pandas') - return pd.Series(container) - elif constructor_name == 'slice': - return slice(container[0], container[1]) - - @pytest.mark.parametrize( "array_type", ["list", "array", "sparse", "dataframe"] ) From 816c98090ef2d8926e7935f63004da66bb9ef747 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 18 Nov 2019 10:17:17 -0600 Subject: [PATCH 028/448] MNT Reorder parameters in plot_confusion_matrix (#15645) * MNT Reorder parameters in plot_confusion_matrix * MNT Reorder parameters in plot_confusion_matrix * address comment --- sklearn/metrics/_plot/confusion_matrix.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 821fde54cf946..be59c8dd9a847 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -117,9 +117,9 @@ def plot(self, include_values=True, cmap='viridis', return self -def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, - labels=None, display_labels=None, - include_values=True, normalize=None, +def plot_confusion_matrix(estimator, X, y_true, labels=None, + sample_weight=None, normalize=None, + display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None): @@ -138,14 +138,19 @@ def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, y : array-like of shape (n_samples,) Target values. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. 
- labels : array-like of shape (n_classes,), default=None List of labels to index the matrix. This may be used to reorder or select a subset of labels. If `None` is given, those that appear at least once in `y_true` or `y_pred` are used in sorted order. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + normalize : {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, confusion matrix will not be + normalized. + display_labels : array-like of shape (n_classes,), default=None Target names used for plotting. By default, `labels` will be used if it is defined, otherwise the unique labels of `y_true` and `y_pred` @@ -154,11 +159,6 @@ def plot_confusion_matrix(estimator, X, y_true, sample_weight=None, include_values : bool, default=True Includes values in confusion matrix. - normalize : {'true', 'pred', 'all'}, default=None - Normalizes confusion matrix over the true (rows), predicted (columns) - conditions or all the population. If None, confusion matrix will not be - normalized. - xticks_rotation : {'vertical', 'horizontal'} or float, \ default='vertical' Rotation of xtick labels. From 9dcdb8789aed7414902a31d751c936d7d414b557 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 18 Nov 2019 12:15:51 -0500 Subject: [PATCH 029/448] DOC Adds confusion matrix to whats new (#15656) --- doc/whats_new/v0.22.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 8f4dd76c08677..fc9fbad967e8c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -589,6 +589,9 @@ Changelog - |Feature| :func:`metrics.plot_precision_recall_curve` has been added to plot precision recall curves. :pr:`14936` by `Thomas Fan`_. +- |Feature| :func:`metrics.plot_confusion_matrix` has been added to plot + confusion matrices. :pr:`15083` by `Thomas Fan`_. + - |Feature| Added multiclass support to :func:`metrics.roc_auc_score` with corresponding scorers `'roc_auc_ovr'`, `'roc_auc_ovo'`, `'roc_auc_ovr_weighted'`, and `'roc_auc_ovo_weighted'`. From f7ed72aa44db7b851946e8010383b1947f585f86 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 19 Nov 2019 07:12:47 +1100 Subject: [PATCH 030/448] ENH reduce memory consumption in nan_euclidean_distances (#15615) --- sklearn/metrics/pairwise.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b3ebec934e3a0..cd1e65c282ebf 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -406,20 +406,23 @@ def nan_euclidean_distances(X, Y=None, squared=False, distances -= np.dot(XX, missing_Y.T) distances -= np.dot(missing_X, YY.T) - present_coords_cnt = np.dot(1 - missing_X, 1 - missing_Y.T) - present_mask = (present_coords_cnt != 0) - distances[present_mask] *= (X.shape[1] / present_coords_cnt[present_mask]) - if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. 
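(Illustrative aside, not part of the patch series.) The rewritten block in this hunk computes the count of mutually present coordinates once and rescales the squared distances by ``n_features / present_count`` before taking the square root, which is what the public function returns:

    import numpy as np
    from sklearn.metrics.pairwise import nan_euclidean_distances

    X = np.array([[1.0, np.nan, 3.0],
                  [2.0, 2.0, np.nan]])
    # only the first coordinate is present in both rows, so the squared distance
    # (2 - 1) ** 2 = 1 is rescaled by 3 / 1 before the square root
    nan_euclidean_distances(X)    # approximately [[0., 1.732], [1.732, 0.]]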
np.fill_diagonal(distances, 0.0) + present_X = 1 - missing_X + present_Y = present_X if Y is X else ~missing_Y + present_count = np.dot(present_X, present_Y.T) + distances[present_count == 0] = np.nan + # avoid divide by zero + np.maximum(1, present_count, out=present_count) + distances /= present_count + distances *= X.shape[1] + if not squared: np.sqrt(distances, out=distances) - # coordinates with no common coordinates have a nan distance - distances[~present_mask] = np.nan return distances From 60248946bd96f26a9534bac06546b2bd829ec63a Mon Sep 17 00:00:00 2001 From: th0rwas <55891641+th0rwas@users.noreply.github.com> Date: Tue, 19 Nov 2019 05:24:09 +0100 Subject: [PATCH 031/448] EXA Adds example for tree.ExtraTreeRegressor (#15213) --- sklearn/tree/_classes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e0ccb723d2e36..5bc66d59acc07 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1631,6 +1631,21 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. + + Examples + -------- + >>> from sklearn.datasets import load_boston + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import BaggingRegressor + >>> from sklearn.tree import ExtraTreeRegressor + >>> X, y = load_boston(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> extra_tree = ExtraTreeRegressor(random_state=0) + >>> reg = BaggingRegressor(extra_tree, random_state=0).fit( + ... X_train, y_train) + >>> reg.score(X_test, y_test) + 0.7823... """ def __init__(self, criterion="mse", From e254b30b6ea2fbdc3f05abeee71b4fc9a4b2e713 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 19 Nov 2019 03:56:37 -0500 Subject: [PATCH 032/448] MAINT run latest CI on Python 3.8 (#15637) --- azure-pipelines.yml | 2 +- sklearn/ensemble/tests/test_partial_dependence.py | 6 ++++++ sklearn/inspection/tests/test_plot_partial_dependence.py | 6 ++++++ sklearn/linear_model/tests/test_bayes.py | 2 +- sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py | 5 +++++ sklearn/metrics/_plot/tests/test_plot_precision_recall.py | 6 ++++++ sklearn/metrics/_plot/tests/test_plot_roc_curve.py | 6 ++++++ sklearn/neighbors/_base.py | 2 +- 8 files changed, 32 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9fdead91dd309..ceeb76b3f1446 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -68,7 +68,7 @@ jobs: pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' # FIXME: pinned until SciPy wheels are available for Pyhon 3.8 - PYTHON_VERSION: '3.7' + PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index a56523a29836c..84ff0c004b68b 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -15,6 +15,12 @@ from sklearn.utils._testing import ignore_warnings +# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved +pytestmark = pytest.mark.filterwarnings( + "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" + "matplotlib.*") + + # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] diff --git 
a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 6a819aeeea157..222ab0fc45ccd 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -15,6 +15,12 @@ from sklearn.inspection import plot_partial_dependence +# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved +pytestmark = pytest.mark.filterwarnings( + "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" + "matplotlib.*") + + @pytest.fixture(scope="module") def boston(): return load_boston() diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index 9cbd8f9970d9d..e1922a010514f 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py @@ -209,7 +209,7 @@ def test_ard_accuracy_on_easy_problem(): X = np.random.RandomState(seed=seed).normal(size=(250, 3)) y = X[:, 1] - regressor = ARDRegression() + regressor = ARDRegression(n_iter=600) regressor.fit(X, y) abs_coef_error = np.abs(1 - regressor.coef_[1]) diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index deb506143bdfb..dbd515f3527cc 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -16,6 +16,11 @@ from sklearn.metrics import ConfusionMatrixDisplay +# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved +pytestmark = pytest.mark.filterwarnings( + "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" + "matplotlib.*") + @pytest.fixture(scope="module") def n_classes(): return 5 diff --git a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py index 03c6778254e18..1012e13027f5a 100644 --- a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py +++ b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py @@ -15,6 +15,12 @@ from sklearn.compose import make_column_transformer +# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved +pytestmark = pytest.mark.filterwarnings( + "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" + "matplotlib.*") + + def test_errors(pyplot): X, y_multiclass = make_classification(n_classes=3, n_samples=50, n_informative=3, diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 609a422c3d13a..65438d4610258 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py +++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -14,6 +14,12 @@ from sklearn.compose import make_column_transformer +# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved +pytestmark = pytest.mark.filterwarnings( + "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" + "matplotlib.*") + + @pytest.fixture(scope="module") def data(): return load_iris(return_X_y=True) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ef97b9df93718..1ff45332b1e70 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -262,7 +262,7 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): """ assert graph.format == 'csr' - no_filter_needed = graph.data.max() <= radius + no_filter_needed = bool(graph.data.max() <= radius) if no_filter_needed: data, 
indices, indptr = graph.data, graph.indices, graph.indptr From b6f124c848ba83d5031013ca3a942711b626ef3e Mon Sep 17 00:00:00 2001 From: Nicholas Won Date: Tue, 19 Nov 2019 20:36:20 +0900 Subject: [PATCH 033/448] MAINT Improve variable order in BaseDecisionTree (#15664) --- sklearn/tree/_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 5bc66d59acc07..ea43716e20ae6 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -104,8 +104,8 @@ def __init__(self, self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features - self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.class_weight = class_weight From a723bea9c93d500376abe1c227ce8a8247fd813a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 19 Nov 2019 08:31:52 -0500 Subject: [PATCH 034/448] [MRG] Enables CircleCI to fail when sphinx warns (#15633) --- build_tools/circle/build_doc.sh | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index a76f9a8a890c4..c3481bd6c58c1 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -58,6 +58,44 @@ get_build_type() { return fi changed_examples=$(echo "$filenames" | grep -E "^examples/(.*/)*plot_") + + # The following is used to extract the list of filenames of example python + # files that sphinx-gallery needs to run to generate png files used as + # figures or images in the .rst files from the documentation. + # If the contributor changes a .rst file in a PR we need to run all + # the examples mentioned in that file to get sphinx build the + # documentation without generating spurious warnings related to missing + # png files. + + if [[ -n "$filenames" ]] + then + # get rst files + rst_files="$(echo "$filenames" | grep -E "rst$")" + + # get lines with figure or images + img_fig_lines="$(echo "$rst_files" | xargs grep -shE "(figure|image)::")" + + # get only auto_examples + auto_example_files="$(echo "$img_fig_lines" | grep auto_examples | awk -F "/" '{print $NF}')" + + # remove "sphx_glr_" from path and accept replace _(\d\d\d|thumb).png with .py + scripts_names="$(echo "$auto_example_files" | sed 's/sphx_glr_//' | sed -e 's/_([[:digit:]][[:digit:]][[:digit:]]|thumb).png/.py/')" + + # get unique values + examples_in_rst="$(echo "$scripts_names" | uniq )" + fi + + # executed only if there are examples in the modified rst files + if [[ -n "$examples_in_rst" ]] + then + if [[ -n "$changed_examples" ]] + then + changed_examples="$changed_examples|$examples_in_rst" + else + changed_examples="$examples_in_rst" + fi + fi + if [[ -n "$changed_examples" ]] then echo BUILD: detected examples/ filename modified in $git_range: $changed_examples @@ -204,5 +242,12 @@ then echo "$warnings" | sed 's/\/home\/circleci\/project\//
  • /g' echo '' ) > 'doc/_build/html/stable/_changed.html' + + if [ "$warnings" != "/home/circleci/project/ no warnings" ] + then + echo "Sphinx generated warnings when building the documentation related to files modified in this PR." + echo "Please check doc/_build/html/stable/_changed.html" + exit 1 + fi fi From 663d052d3c7da2630357a92c3f5c59128b51480e Mon Sep 17 00:00:00 2001 From: Marie Douriez Date: Tue, 19 Nov 2019 06:54:41 -0800 Subject: [PATCH 035/448] [MRG] documentation for random_state in forest.py (#15516) * documentation for random_state in forests * move note to parameter * same for RandomForestRegressor * add doc for ExtraTreesRegressor and ExtraTreesClassifier * skip line * lint * move note back to where it was * add Glossary in RandomForestRegressor * adding description for RandomTreesEmbedding * small fix * correct description for RandomTreesEmbedding --- sklearn/ensemble/_forest.py | 53 ++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 2dd600dc8f984..e7a0b8e56dde8 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -962,10 +962,11 @@ class RandomForestClassifier(ForestClassifier): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1278,10 +1279,11 @@ class RandomForestRegressor(ForestRegressor): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1540,7 +1542,7 @@ class ExtraTreesClassifier(ForestClassifier): bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. If False, the - whole datset is used to build each tree. + whole dataset is used to build each tree. oob_score : bool, optional (default=False) Whether to use out-of-bag samples to estimate @@ -1554,10 +1556,14 @@ class ExtraTreesClassifier(ForestClassifier): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
+ Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1845,7 +1851,7 @@ class ExtraTreesRegressor(ForestRegressor): bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. If False, the - whole datset is used to build each tree. + whole dataset is used to build each tree. oob_score : bool, optional (default=False) Whether to use out-of-bag samples to estimate the R^2 on unseen data. @@ -1858,10 +1864,14 @@ class ExtraTreesRegressor(ForestRegressor): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -2088,10 +2098,9 @@ class RandomTreesEmbedding(BaseForest): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the generation of the random `y` used to fit the trees + and the draw of the splits for each feature at the trees' nodes. + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. From 43d8dee07b0c5e23237526d23a368fcfd66cc059 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 19 Nov 2019 23:48:26 +0100 Subject: [PATCH 036/448] MNT improve the convergence warning message for LogisticRegression (#15665) --- sklearn/linear_model/_logistic.py | 10 +++++++++- sklearn/linear_model/tests/test_logistic.py | 16 +++++++++++----- sklearn/utils/optimize.py | 19 ++++++++++++------- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7921150e0fa01..de7ac323833e8 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -38,6 +38,12 @@ from ..metrics import get_scorer +_LOGISTIC_SOLVER_CONVERGENCE_MSG = ( + "Please also refer to the documentation for alternative solver options:\n" + " https://scikit-learn.org/stable/modules/linear_model.html" + "#logistic-regression") + + # .. some helper functions for logistic_regression_path .. def _intercept_dot(w, X, y): """Computes y * np.dot(X, w). @@ -928,7 +934,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, args=(X, target, 1. 
/ C, sample_weight), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) - n_iter_i = _check_optimize_result(solver, opt_res, max_iter) + n_iter_i = _check_optimize_result( + solver, opt_res, max_iter, + extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG) w0, loss = opt_res.x, opt_res.fun elif solver == 'newton-cg': args = (X, target, 1. / C, sample_weight) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index a98bb642e9be0..4886870806531 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -31,7 +31,6 @@ from sklearn.utils._testing import skip_if_no_parallel from sklearn.exceptions import ConvergenceWarning -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.linear_model._logistic import ( LogisticRegression, logistic_regression_path, @@ -391,13 +390,20 @@ def test_logistic_regression_path_convergence_fail(): y = [1] * 100 + [-1] * 100 Cs = [1e3] - msg = (r"lbfgs failed to converge.+Increase the number of iterations or " - r"scale the data") - - with pytest.warns(ConvergenceWarning, match=msg): + # Check that the convergence message points to both a model agnostic + # advice (scaling the data) and to the logistic regression specific + # documentation that includes hints on the solver configuration. + with pytest.warns(ConvergenceWarning) as record: _logistic_regression_path( X, y, Cs=Cs, tol=0., max_iter=1, random_state=0, verbose=0) + assert len(record) == 1 + warn_msg = record[0].message.args[0] + assert "lbfgs failed to converge" in warn_msg + assert "Increase the number of iterations" in warn_msg + assert "scale the data" in warn_msg + assert "linear_model.html#logistic-regression" in warn_msg + def test_liblinear_dual_random_state(): # random_state is relevant for liblinear solver only if dual=True diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index b03034e233617..3534d85f1edef 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -213,7 +213,8 @@ def _newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4, return xk, k -def _check_optimize_result(solver, result, max_iter=None): +def _check_optimize_result(solver, result, max_iter=None, + extra_warning_msg=None): """Check the OptimizeResult for successful convergence Parameters @@ -233,12 +234,16 @@ def _check_optimize_result(solver, result, max_iter=None): # handle both scipy and scikit-learn solver names if solver == "lbfgs": if result.status != 0: - warnings.warn("{} failed to converge (status={}): {}. " - "Increase the number of iterations or scale the " - "data as shown in https://scikit-learn.org/stable/" - "modules/preprocessing.html" - .format(solver, result.status, result.message), - ConvergenceWarning, stacklevel=2) + warning_msg = ( + "{} failed to converge (status={}):\n{}.\n\n" + "Increase the number of iterations (max_iter) " + "or scale the data as shown in:\n" + " https://scikit-learn.org/stable/modules/" + "preprocessing.html." + ).format(solver, result.status, result.message.decode("latin1")) + if extra_warning_msg is not None: + warning_msg += "\n" + extra_warning_msg + warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2) if max_iter is not None: # In scipy <= 1.0.0, nit may exceed maxiter for lbfgs. 
# See https://github.com/scipy/scipy/issues/7854 From 6231d5ae5106ade493497e03082296658cc3c501 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 19 Nov 2019 16:56:34 -0600 Subject: [PATCH 037/448] MAINT Add _distributor_init.py (#15570) --- sklearn/__init__.py | 3 +++ sklearn/_distributor_init.py | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 sklearn/_distributor_init.py diff --git a/sklearn/__init__.py b/sklearn/__init__.py index d49d93e224537..970d37480a370 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -90,6 +90,9 @@ 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] + # Allow distributors to run custom init code + from . import _distributor_init # noqa: F401 + def setup_module(module): """Fixture for the tests to assure globally controllable seeding of RNGs""" diff --git a/sklearn/_distributor_init.py b/sklearn/_distributor_init.py new file mode 100644 index 0000000000000..a0142ac80878f --- /dev/null +++ b/sklearn/_distributor_init.py @@ -0,0 +1,10 @@ +""" Distributor init file + +Distributors: you can add custom code here to support particular distributions +of scikit-learn. + +For example, this is a good place to put any checks for hardware requirements. + +The scikit-learn standard source distribution will not put code in this file, +so you can safely replace this file with your own version. +""" From 12196dab9b1b42f7526eedb6b7b18e04cd3286d2 Mon Sep 17 00:00:00 2001 From: Kesshi Jordan Date: Wed, 20 Nov 2019 00:12:53 -0800 Subject: [PATCH 038/448] MNT replaced check_consistent_length, etc with _check_sample_weight in BaseGradientBoosting (#15478) --- sklearn/ensemble/_gb.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 6488d5dd0e776..9db0a0322045d 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -56,7 +56,7 @@ from ..utils import deprecated from ..utils.fixes import logsumexp from ..utils.stats import _weighted_percentile -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError @@ -1442,13 +1442,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None - if sample_weight_is_none: - sample_weight = np.ones(n_samples, dtype=np.float32) - else: - sample_weight = column_or_1d(sample_weight, warn=True) - sample_weight_is_none = False - check_consistent_length(X, y, sample_weight) + sample_weight = _check_sample_weight(sample_weight, X) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) y = column_or_1d(y, warn=True) From bb05e6beb95bdc5eca285ca24425faa23036adcb Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 20 Nov 2019 10:32:38 +0100 Subject: [PATCH 039/448] DOC fix plot_ in readme (#15674) --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 98c39b4482fd2..41197e178904a 100644 --- a/README.rst +++ b/README.rst @@ -55,7 +55,7 @@ scikit-learn requires: **Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** scikit-learn 0.21 and later require Python 3.5 or newer. -Scikit-learn plotting capabilities (i.e., functions start with "plot_" +Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= 1.5.1). 
For running the examples Matplotlib >= 1.5.1 is required. A few examples require scikit-image >= 0.12.3, a few examples require pandas >= 0.18.0. From e81884f9190be9d0d510abb5e83006bbb4956466 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 20 Nov 2019 12:00:50 +0100 Subject: [PATCH 040/448] TST Fix leak in tmp folders in LFW test when pillow is missing (#15676) --- sklearn/datasets/tests/test_lfw.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py index 3aa65a68bcdbf..19cda818d8d55 100644 --- a/sklearn/datasets/tests/test_lfw.py +++ b/sklearn/datasets/tests/test_lfw.py @@ -24,10 +24,10 @@ from sklearn.datasets.tests.test_common import check_return_X_y -SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_") -SCIKIT_LEARN_EMPTY_DATA = tempfile.mkdtemp(prefix="scikit_learn_empty_test_") +SCIKIT_LEARN_DATA = None +SCIKIT_LEARN_EMPTY_DATA = None +LFW_HOME = None -LFW_HOME = os.path.join(SCIKIT_LEARN_DATA, 'lfw_home') FAKE_NAMES = [ 'Abdelatif_Smith', 'Abhati_Kepler', @@ -44,6 +44,14 @@ def setup_module(): if not pillow_installed: raise SkipTest("PIL not installed.") + global SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA, LFW_HOME + + SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_") + LFW_HOME = os.path.join(SCIKIT_LEARN_DATA, 'lfw_home') + + SCIKIT_LEARN_EMPTY_DATA = tempfile.mkdtemp( + prefix="scikit_learn_empty_test_") + if not os.path.exists(LFW_HOME): os.makedirs(LFW_HOME) From ce1b171e395194e374f3060b4394e600421bc097 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 20 Nov 2019 14:04:48 +0100 Subject: [PATCH 041/448] MNT pin pytest in failing job on azure (#15677) --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ceeb76b3f1446..2ee751bb6473f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -57,7 +57,8 @@ jobs: SCIPY_VERSION: '0.17.0' PANDAS_VERSION: '*' CYTHON_VERSION: '*' - PYTEST_VERSION: '*' + # temporary pin pytest due to unknown failure with pytest 5.3 + PYTEST_VERSION: '5.2' PILLOW_VERSION: '4.0.0' MATPLOTLIB_VERSION: '1.5.1' # later version of joblib are not packaged in conda for Python 3.5 From 0831dfbe9595e54587f92dcafd30d030dcfcc631 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 20 Nov 2019 09:03:34 -0500 Subject: [PATCH 042/448] [MRG] BUG Fixes test_scale_and_stability in windows (#15661) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/cross_decomposition/_pls.py | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index fc9fbad967e8c..3349714e015ad 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -101,6 +101,7 @@ random sampling procedures. - :class:`linear_model.Ridge` when `X` is sparse. |Fix| - :class:`model_selection.StratifiedKFold` and any use of `cv=int` with a classifier. |Fix| +- :class:`cross_decomposition.CCA` when using scipy >= 1.3 |Fix| Details are listed in the changelog below. @@ -209,6 +210,9 @@ Changelog ``inverse_transform`` to transform data to the original space`. :pr:`15304` by :user:`Jaime Ferrando Huertas `. +- |Fix| :class:`cross_decomposition.CCA` now produces the same results with + scipy 1.3 and previous scipy versions. :pr:`15661` by `Thomas Fan`_. + :mod:`sklearn.datasets` ....................... 
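For context on the ``cond`` values introduced in the ``_pls.py`` diff below, here is a small illustrative sketch (not part of the patch) of how a fixed singular-value cutoff can be passed to scipy's ``pinv2`` so that the pseudo-inverse does not depend on the scipy version; the random ``X`` is assumed for illustration, while the float32/float64 factors are the ones used in the patch::

    import numpy as np
    from scipy.linalg import pinv2  # present in scipy releases of this era

    X = np.random.RandomState(0).rand(20, 5)

    # Reproduce the scipy < 1.3 default: a cutoff proportional to the machine
    # epsilon of the dtype (1e3 * eps for float32, 1e6 * eps for float64),
    # rather than the scipy >= 1.3 default that scales with the largest
    # singular value.
    factor = {'f': 1e3, 'd': 1e6}
    cond = factor[X.dtype.char.lower()] * np.finfo(X.dtype).eps
    X_pinv = pinv2(X, check_finite=False, cond=cond)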
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index a429872020ad0..72ee5d4af6ba6 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -40,6 +40,18 @@ def _nipals_twoblocks_inner_loop(X, Y, mode="A", max_iter=500, tol=1e-06, ite = 1 X_pinv = Y_pinv = None eps = np.finfo(X.dtype).eps + + if mode == "B": + # Uses condition from scipy<1.3 in pinv2 which was changed in + # https://github.com/scipy/scipy/pull/10067. In scipy 1.3, the + # condition was changed to depend on the largest singular value + X_t = X.dtype.char.lower() + Y_t = Y.dtype.char.lower() + factor = {'f': 1E3, 'd': 1E6} + + cond_X = factor[X_t] * eps + cond_Y = factor[Y_t] * eps + # Inner loop of the Wold algo. while True: # 1.1 Update u: the X weights @@ -47,7 +59,7 @@ def _nipals_twoblocks_inner_loop(X, Y, mode="A", max_iter=500, tol=1e-06, if X_pinv is None: # We use slower pinv2 (same as np.linalg.pinv) for stability # reasons - X_pinv = pinv2(X, check_finite=False) + X_pinv = pinv2(X, check_finite=False, cond=cond_X) x_weights = np.dot(X_pinv, y_score) else: # mode A # Mode A regress each X column on y_score @@ -64,7 +76,8 @@ def _nipals_twoblocks_inner_loop(X, Y, mode="A", max_iter=500, tol=1e-06, # 2.1 Update y_weights if mode == "B": if Y_pinv is None: - Y_pinv = pinv2(Y, check_finite=False) # compute once pinv(Y) + # compute once pinv(Y) + Y_pinv = pinv2(Y, check_finite=False, cond=cond_Y) y_weights = np.dot(Y_pinv, x_score) else: # Mode A regress each Y column on x_score From 641b8631884e2cb19a27fd30440a41a1437e4e40 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 20 Nov 2019 16:06:41 +0100 Subject: [PATCH 043/448] MNT fix filtering of examples file to run by sphinx (#15680) --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index c3481bd6c58c1..59c0fe659a2ad 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -79,7 +79,7 @@ get_build_type() { auto_example_files="$(echo "$img_fig_lines" | grep auto_examples | awk -F "/" '{print $NF}')" # remove "sphx_glr_" from path and accept replace _(\d\d\d|thumb).png with .py - scripts_names="$(echo "$auto_example_files" | sed 's/sphx_glr_//' | sed -e 's/_([[:digit:]][[:digit:]][[:digit:]]|thumb).png/.py/')" + scripts_names="$(echo "$auto_example_files" | sed 's/sphx_glr_//' | sed -E 's/_([[:digit:]][[:digit:]][[:digit:]]|thumb).png/.py/')" # get unique values examples_in_rst="$(echo "$scripts_names" | uniq )" From 10b8bf7c54c5bf3281ebc393e3df614c5fb63a8f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 20 Nov 2019 18:06:38 +0100 Subject: [PATCH 044/448] [MRG] Improve error message with implicit pos_label in _binary_clf_curve (#15562) --- sklearn/metrics/_ranking.py | 23 +++++++++----- sklearn/metrics/tests/test_ranking.py | 45 +++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index c271781638668..c88fe685e97c9 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -525,14 +525,23 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): sample_weight = column_or_1d(sample_weight) # ensure binary classification if pos_label is not specified + # classes.dtype.kind in ('O', 'U', 'S') is required to avoid + # triggering a FutureWarning by calling np.array_equal(a, b) + # when elements in the two 
arrays are not comparable. classes = np.unique(y_true) - if (pos_label is None and - not (np.array_equal(classes, [0, 1]) or - np.array_equal(classes, [-1, 1]) or - np.array_equal(classes, [0]) or - np.array_equal(classes, [-1]) or - np.array_equal(classes, [1]))): - raise ValueError("Data is not binary and pos_label is not specified") + if (pos_label is None and ( + classes.dtype.kind in ('O', 'U', 'S') or + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1])))): + classes_repr = ", ".join(repr(c) for c in classes) + raise ValueError("y_true takes value in {{{classes_repr}}} and " + "pos_label is not specified: either make y_true " + "take integer value in {{0, 1}} or {{-1, 1}} or " + "pass pos_label explicitly.".format( + classes_repr=classes_repr)) elif pos_label is None: pos_label = 1. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0275a26055915..ae0296718f43a 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -662,14 +662,53 @@ def test_auc_score_non_binary_class(): roc_auc_score(y_true, y_pred) -def test_binary_clf_curve(): +def test_binary_clf_curve_multiclass_error(): rng = check_random_state(404) y_true = rng.randint(0, 3, size=10) y_pred = rng.rand(10) msg = "multiclass format is not supported" + with pytest.raises(ValueError, match=msg): precision_recall_curve(y_true, y_pred) + with pytest.raises(ValueError, match=msg): + roc_curve(y_true, y_pred) + + +@pytest.mark.parametrize("curve_func", [ + precision_recall_curve, + roc_curve, +]) +def test_binary_clf_curve_implicit_pos_label(curve_func): + # Check that using string class labels raises an informative + # error for any supported string dtype: + msg = ("y_true takes value in {'a', 'b'} and pos_label is " + "not specified: either make y_true take integer " + "value in {0, 1} or {-1, 1} or pass pos_label " + "explicitly.") + with pytest.raises(ValueError, match=msg): + roc_curve(np.array(["a", "b"], dtype=' Date: Wed, 20 Nov 2019 09:57:09 -0800 Subject: [PATCH 045/448] DOC add transform y section to faq.rst (#15484) --- doc/faq.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/faq.rst b/doc/faq.rst index 1b216dd3d1dab..490ee2d16dfca 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -385,3 +385,23 @@ efficient to process for most operations. Extensive work would also be needed to support Pandas categorical types. Restricting input to homogeneous types therefore reduces maintenance cost and encourages usage of efficient data structures. + +Do you plan to implement transform for target y in a pipeline? +---------------------------------------------------------------------------- +Currently transform only works for features X in a pipeline. +There's a long-standing discussion about +not being able to transform y in a pipeline. +Follow on github issue +`#4143`_. +Meanwhile check out +:class:`sklearn.compose.TransformedTargetRegressor`, +`pipegraph`_, +`imbalanced-learn`_. +Note that Scikit-learn solved for the case where y +has an invertible transformation applied before training +and inverted after prediction. Scikit-learn intends to solve for +use cases where y should be transformed at training time +and not at test time, for resampling and similar uses, +like at imbalanced learn. 
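As a concrete illustration of the invertible-target case described in this FAQ entry, a minimal sketch (not part of the patch) using ``TransformedTargetRegressor`` to apply a log transform to ``y`` before fitting and invert it at prediction time; the toy data and the ``log1p``/``expm1`` pair are chosen only for illustration::

    import numpy as np
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.linear_model import LinearRegression

    X = np.arange(100).reshape(-1, 1)
    y = np.exp(X.ravel() / 50.0)  # positive target, easier to fit on a log scale

    # func is applied to y before fitting; inverse_func maps predictions back.
    reg = TransformedTargetRegressor(regressor=LinearRegression(),
                                     func=np.log1p, inverse_func=np.expm1)
    reg.fit(X, y)
    print(reg.predict(X[:3]))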
+In general, these use cases can be solved +with a custom meta estimator rather than a Pipeline From ccac637855411169a256eb095302cff446b342be Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 20 Nov 2019 14:23:34 -0500 Subject: [PATCH 046/448] FIX Clip distances below 0 (#15683) --- sklearn/metrics/pairwise.py | 2 ++ sklearn/metrics/tests/test_pairwise.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index cd1e65c282ebf..05b5b01774773 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -406,6 +406,8 @@ def nan_euclidean_distances(X, Y=None, squared=False, distances -= np.dot(XX, missing_Y.T) distances -= np.dot(missing_X, YY.T) + np.clip(distances, 0, None, out=distances) + if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 21898136cb8f9..f76782c8b3795 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -871,6 +871,23 @@ def test_nan_euclidean_distances_not_trival(missing_value): assert_allclose(D6, D7) +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_one_feature_match_positive(missing_value): + # First feature is the only feature that is non-nan and in both + # samples. The result of `nan_euclidean_distances` with squared=True + # should be non-negative. The non-squared version should all be close to 0. + X = np.array([[-122.27, 648., missing_value, 37.85], + [-122.27, missing_value, 2.34701493, missing_value]]) + + dist_squared = nan_euclidean_distances(X, missing_values=missing_value, + squared=True) + assert np.all(dist_squared >= 0) + + dist = nan_euclidean_distances(X, missing_values=missing_value, + squared=False) + assert_allclose(dist, 0.0) + + def test_cosine_distances(): # Check the pairwise Cosine distances computation rng = np.random.RandomState(1337) From 1c546cd9b1c229cba6f076f7532fadc27066f63e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 20 Nov 2019 16:03:25 -0500 Subject: [PATCH 047/448] CLN Move gradient and hessian closer to for loop in hist GBDT (#15686) --- .../gradient_boosting.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 78dab9419d0f7..cb708ecc576e7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -191,13 +191,6 @@ def fit(self, X, y): ) raw_predictions += self._baseline_prediction - # initialize gradients and hessians (empty arrays). - # shape = (n_trees_per_iteration, n_samples). 
- gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ - ) - # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] @@ -278,17 +271,18 @@ def fit(self, X, y): X_binned_small_train, y_small_train = self._get_small_trainset( X_binned_train, y_train, self._random_seed) - # Initialize the gradients and hessians - gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ - ) - # Get the predictors from the previous fit predictors = self._predictors begin_at_stage = self.n_iter_ + # initialize gradients and hessians (empty arrays). + # shape = (n_trees_per_iteration, n_samples). + gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + prediction_dim=self.n_trees_per_iteration_ + ) + for iteration in range(begin_at_stage, self.max_iter): if self.verbose: From 6b8e5452934f7f027aff6e97cb3fe2dd0e8ab24d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 21 Nov 2019 11:51:07 +0100 Subject: [PATCH 048/448] MNT bump the version of numpydoc and sphinx gallery (#15681) --- build_tools/circle/build_doc.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 59c0fe659a2ad..abc823facee15 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -172,8 +172,8 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \ joblib memory_profiler packaging source activate testenv -pip install sphinx-gallery==0.3.1 -pip install numpydoc==0.9 +pip install sphinx-gallery +pip install numpydoc # Build and install scikit-learn in dev mode python setup.py build_ext --inplace -j 3 From 2151b79a916e37a7f416cce6ba512ad464259bb9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 21 Nov 2019 05:54:50 -0500 Subject: [PATCH 049/448] FIX Releases memory in liblinear (#15687) --- sklearn/svm/src/liblinear/linear.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index 15202dabce7be..d9bdfb69c413d 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -515,6 +515,7 @@ Solver_MCSVM_CS::~Solver_MCSVM_CS() { delete[] B; delete[] G; + delete[] C; } int compare_double(const void *a, const void *b) From e06cc9e3fee1b8ed24f9c0f80d32025e5e80dde2 Mon Sep 17 00:00:00 2001 From: Chris Gregory Date: Thu, 21 Nov 2019 06:27:25 -0800 Subject: [PATCH 050/448] MNT Better error message for MinMaxScaler and sparse data (#15695) --- sklearn/preprocessing/_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 9514719de5a92..ef8b9c6db9e3b 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -365,8 +365,8 @@ def partial_fit(self, X, y=None): " than maximum. Got %s." % str(feature_range)) if sparse.issparse(X): - raise TypeError("MinMaxScaler does no support sparse input. " - "You may consider to use MaxAbsScaler instead.") + raise TypeError("MinMaxScaler does not support sparse input. 
" + "Consider using MaxAbsScaler instead.") X = check_array(X, estimator=self, dtype=FLOAT_DTYPES, From 0c2da0cb488b7cdc0c5b6bcf4aacef374bae309b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 00:34:16 +0100 Subject: [PATCH 051/448] DOC Fix various sphinx warnings. (#15692) --- doc/faq.rst | 6 +++--- examples/gaussian_process/plot_gpr_on_structured_data.py | 4 ++-- sklearn/ensemble/_forest.py | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 490ee2d16dfca..6972d79fd5513 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -392,11 +392,11 @@ Currently transform only works for features X in a pipeline. There's a long-standing discussion about not being able to transform y in a pipeline. Follow on github issue -`#4143`_. +`#4143 `_. Meanwhile check out :class:`sklearn.compose.TransformedTargetRegressor`, -`pipegraph`_, -`imbalanced-learn`_. +`pipegraph `_, +`imbalanced-learn `_. Note that Scikit-learn solved for the case where y has an invertible transformation applied before training and inverted after prediction. Scikit-learn intends to solve for diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py index 5b22c788ab3bf..64a84ab38647a 100644 --- a/examples/gaussian_process/plot_gpr_on_structured_data.py +++ b/examples/gaussian_process/plot_gpr_on_structured_data.py @@ -33,8 +33,8 @@ four correct classifications and fails on one. .. [1] Haussler, D. (1999). Convolution kernels on discrete structures -(Vol. 646). Technical report, Department of Computer Science, University of -California at Santa Cruz. + (Vol. 646). Technical report, Department of Computer Science, University + of California at Santa Cruz. """ print(__doc__) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e7a0b8e56dde8..9c7d157314870 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1563,6 +1563,7 @@ class ExtraTreesClassifier(ForestClassifier): - the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``) - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) @@ -1871,6 +1872,7 @@ class ExtraTreesRegressor(ForestRegressor): - the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``) - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) From db59dd74df576345cc026cc0d6c99392d3649d3b Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Thu, 21 Nov 2019 22:55:57 -0500 Subject: [PATCH 052/448] [MRG] Fast, low memory, single linkage implementation (#11514) * First cut at basic single linkage internals * Refer to correct dist_metrics package * Add csgraph sparse implementation for single linkage * Add fast labelling/conversion from MST to single linkage tree; remove uneeded single_linkage.pyx file. * Ensure existing tests cover single linkage * Name cingle linkage labelling correctly. * Iterating toward correct solution. Still have to get n_clusters, compute_full_tree=False working * Get n_components correct. * Update docstrings. * Fix the parents array when we don't get the "full tree" * Add single linkage to agglomerative clustering example. * Add single linkage to digits agglomerative clustering example. * Update documentation to reflect the addition of single linkage. 
* Update documentation to reflect the addition of single linkage. * Pep8 fix for class declaration in cython * Fix heading in clustering docs * Update the digits clustering text to reflect the new reality. * Provide a more complete comparison of the different linkage methods, highlighting the relative strengths and weaknesses. * We don't need connectivity here, and we can ignore issues with warnings for spectral clustering. * Add an explicit test that single linkage successfully works on examples it should perform well on. * Update docs with a more complete comparison on linkage methods (scale to be determined?) * List formatting in example linkage comparison. * Flake8 fixes. * Flake8 fixes. * More Flake8 fixes. * Fix agglomerative plot example with correct subplot spec * Explicitly test linkages (including single) produce results identical to scipy.cluster.hierarchical * Fix comment on why we sort (consistency) * Make dense single linkage faster * Add docstring to new mst-linkage-core computations. * Add a test that new single linkage code matches scipy * Ensure we only attemtp this for metrics Jake implemented. * Per amueller; it's a long paper, ref the figure. * Clean up a few things. * Too many blank lines for flake8 * Bad scipy slink input * Flake8 fixes * Clean up cython a little; fix typo/carryover * Convert memoryview to numpy array on return * Just convert to the correct dtype * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug * Fixes as per @NicolasHug suggestions. * Update renaming of params in test_hierarchical * Relative import? * Ah, it got renamed in master... * A bad merge on my part. * In principle this is in sklearn.neighbors now... * No; not that way... * Declare dim before use. * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug * Remaining fixes per Nicolas Hug. * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug * Fix flake8 issues. * Switch from stable to mergesort per jnotham * Update sklearn/cluster/_hierarchical.py Co-Authored-By: Nicolas Hug * Skip checks that are already validated. 
* Update docstring per Gael's suggestion * Add a benchmark script for agglomerative clustering * Fix some flake8 issues * No flake8 on the one line * Update parameters and output for benchmark hierarchical * Switch to 2D plotting for hierarchical benchmark * Wrong colormap name * Formatting fpr bench hierarchical * Add an item to WhatsNew --- benchmarks/bench_plot_hierarchical.py | 85 +++++++++++++++++++++ doc/modules/clustering.rst | 1 + doc/whats_new/v0.22.rst | 4 + sklearn/cluster/_hierarchical.py | 24 +++++- sklearn/cluster/_hierarchical_fast.pyx | 89 ++++++++++++++++++++++ sklearn/cluster/tests/test_hierarchical.py | 29 ++++++- 6 files changed, 229 insertions(+), 3 deletions(-) create mode 100644 benchmarks/bench_plot_hierarchical.py diff --git a/benchmarks/bench_plot_hierarchical.py b/benchmarks/bench_plot_hierarchical.py new file mode 100644 index 0000000000000..3c8cd4464a771 --- /dev/null +++ b/benchmarks/bench_plot_hierarchical.py @@ -0,0 +1,85 @@ +from collections import defaultdict +from time import time + +import numpy as np +from numpy import random as nr + +from sklearn.cluster import AgglomerativeClustering + + +def compute_bench(samples_range, features_range): + + it = 0 + results = defaultdict(lambda: []) + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('==============================') + print('Iteration %03d of %03d' % (it, max_it)) + print('n_samples %05d; n_features %02d' % (n_samples, n_features)) + print('==============================') + print() + data = nr.randint(-50, 51, (n_samples, n_features)) + + for linkage in ("single", "average", "complete", "ward"): + print(linkage.capitalize()) + tstart = time() + AgglomerativeClustering( + linkage=linkage, + n_clusters=10 + ).fit(data) + + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print() + + results[linkage].append(delta) + + return results + + +if __name__ == '__main__': + import matplotlib.pyplot as plt + + samples_range = np.linspace(1000, 15000, 8).astype(np.int) + features_range = np.array([2, 10, 20, 50]) + + results = compute_bench(samples_range, features_range) + + max_time = max([max(i) for i in [t for (label, t) in results.items()]]) + + colors = plt.get_cmap('tab10')(np.linspace(0, 1, 10))[:4] + lines = {linkage: None for linkage in results.keys()} + fig, axs = plt.subplots(2, 2, sharex=True, sharey=True) + fig.suptitle( + 'Scikit-learn agglomerative clustering benchmark results', + fontsize=16 + ) + for c, (label, timings) in zip(colors, + sorted(results.items())): + timing_by_samples = np.asarray(timings).reshape( + samples_range.shape[0], + features_range.shape[0] + ) + + for n in range(timing_by_samples.shape[1]): + ax = axs.flatten()[n] + lines[label], = ax.plot( + samples_range, + timing_by_samples[:, n], + color=c, + label=label + ) + ax.set_title('n_features = %d' % features_range[n]) + if n >= 2: + ax.set_xlabel('n_samples') + if n % 2 == 0: + ax.set_ylabel('time (s)') + + fig.subplots_adjust(right=0.8) + fig.legend([lines[link] for link in sorted(results.keys())], + sorted(results.keys()), loc="center right", fontsize=8) + + plt.show() diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 93f87989ab233..ed79304fcbdee 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1687,6 +1687,7 @@ Drawbacks Calinski-Harabasz Index ----------------------- + If the ground truth labels are not known, the Calinski-Harabasz index 
(:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance Ratio Criterion - can be used to evaluate the model, where a higher diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 3349714e015ad..e3b876b0a663b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -168,6 +168,10 @@ Changelog `affinity='cosine'` and `X` has samples that are all-zeros. :pr:`7943` by :user:`mthorrell`. +- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more + more memory efficient implementation of single linkage clustering. + :pr:`11514` by :user:`Leland McInnes `. + :mod:`sklearn.compose` ...................... diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index eb3b989c7c815..f553a9e505eb5 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -18,6 +18,8 @@ from ..metrics.pairwise import paired_distances, pairwise_distances from ..utils import check_array from ..utils.validation import check_memory +from ..neighbors import DistanceMetric +from ..neighbors._dist_metrics import METRIC_MAPPING from . import _hierarchical_fast as _hierarchical from ._feature_agglomeration import AgglomerationTransform @@ -107,7 +109,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, mst_array = np.vstack([mst.row, mst.col, mst.data]).T # Sort edges of the min_spanning_tree by weight - mst_array = mst_array[np.argsort(mst_array.T[2]), :] + mst_array = mst_array[np.argsort(mst_array.T[2], kind='mergesort'), :] # Convert edge list into standard hierarchical clustering format single_linkage_tree = _hierarchical._single_linkage_label(mst_array) @@ -464,7 +466,25 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', X = affinity(X) i, j = np.triu_indices(X.shape[0], k=1) X = X[i, j] - out = hierarchy.linkage(X, method=linkage, metric=affinity) + if (linkage == 'single' + and affinity != 'precomputed' + and not callable(affinity) + and affinity in METRIC_MAPPING): + + # We need the fast cythonized metric from neighbors + dist_metric = DistanceMetric.get_metric(affinity) + + # The Cython routines used require contiguous arrays + X = np.ascontiguousarray(X, dtype=np.double) + + mst = _hierarchical.mst_linkage_core(X, dist_metric) + # Sort edges of the min_spanning_tree by weight + mst = mst[np.argsort(mst.T[2], kind='mergesort'), :] + + # Convert edge list into standard hierarchical clustering format + out = _hierarchical.single_linkage_label(mst) + else: + out = hierarchy.linkage(X, method=linkage, metric=affinity) children_ = out[:, :2].astype(np.int, copy=False) if return_distance: diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 3dd02d5aaa5ae..ec8c96410c25c 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -13,6 +13,7 @@ ctypedef np.int8_t INT8 np.import_array() +from ..neighbors._dist_metrics cimport DistanceMetric from ..utils._fast_dict cimport IntFloatDict # C++ @@ -26,6 +27,8 @@ ctypedef np.float64_t DTYPE_t ITYPE = np.intp ctypedef np.intp_t ITYPE_t +from numpy.math cimport INFINITY + ############################################################################### # Utilities for computing the ward momentum @@ -446,3 +449,89 @@ def single_linkage_label(L): raise ValueError("Input MST array must be sorted by weight") return _single_linkage_label(L) + + +# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378 
+@cython.boundscheck(False) +@cython.nonecheck(False) +def mst_linkage_core( + DTYPE_t [:, ::1] raw_data, + DistanceMetric dist_metric): + """ + Compute the necessary elements of a minimum spanning + tree for computation of single linkage clustering. This + represents the MST-LINKAGE-CORE algorithm (Figure 6) from + *Modern hierarchical, agglomerative clustering algorithms* + by Daniel Mullner (https://arxiv.org/abs/1109.2378). + + In contrast to the scipy implementation is never computes + a full distance matrix, generating distances only as they + are needed and releasing them when no longer needed. + + Parameters + ---------- + raw_data: array of shape (n_samples, n_features) + The array of feature data to be clustered. Must be C-aligned + + dist_metric: DistanceMetric + A DistanceMetric object conforming to the API from + ``sklearn.neighbors._dist_metrics.pxd`` that will be + used to compute distances. + + Returns + ------- + mst_core_data: array of shape (n_samples, 3) + An array providing information from which one + can either compute an MST, or the linkage hierarchy + very efficiently. See https://arxiv.org/abs/1109.2378 + algorithm MST-LINKAGE-CORE for more details. + """ + cdef: + ITYPE_t n_samples = raw_data.shape[0] + np.int8_t[:] in_tree = np.zeros(n_samples, dtype=np.int8) + DTYPE_t[:, ::1] result = np.zeros((n_samples - 1, 3)) + + np.ndarray label_filter + + ITYPE_t current_node = 0 + ITYPE_t new_node + ITYPE_t i + ITYPE_t j + ITYPE_t num_features = raw_data.shape[1] + + DTYPE_t right_value + DTYPE_t left_value + DTYPE_t new_distance + + DTYPE_t[:] current_distances = np.full(n_samples, INFINITY) + + for i in range(n_samples - 1): + + in_tree[current_node] = 1 + + new_distance = INFINITY + new_node = 0 + + for j in range(n_samples): + if in_tree[j]: + continue + + right_value = current_distances[j] + left_value = dist_metric.dist(&raw_data[current_node, 0], + &raw_data[j, 0], + num_features) + + if left_value < right_value: + current_distances[j] = left_value + + if current_distances[j] < new_distance: + new_distance = current_distances[j] + new_node = j + + result[i, 0] = current_node + result[i, 1] = new_node + result[i, 2] = new_distance + current_node = new_node + + return np.array(result) + diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index bb93cb20395fd..06e2561df5de7 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -280,7 +280,7 @@ def assess_same_labelling(cut1, cut2): assert (co_clust[0] == co_clust[1]).all() -def test_scikit_vs_scipy(): +def test_sparse_scikit_vs_scipy(): # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy n, p, k = 10, 5, 3 rng = np.random.RandomState(0) @@ -314,6 +314,33 @@ def test_scikit_vs_scipy(): _hc_cut(n_leaves + 1, children, n_leaves) +# Make sure our custom mst_linkage_core gives +# the same results as scipy's builtin +@pytest.mark.parametrize('seed', range(5)) +def test_vector_scikit_single_vs_scipy_single(seed): + n_samples, n_features, n_clusters = 10, 5, 3 + rng = np.random.RandomState(seed) + X = .1 * rng.normal(size=(n_samples, n_features)) + X -= 4. 
* np.arange(n_samples)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out = hierarchy.linkage(X, method='single') + children_scipy = out[:, :2].astype(np.int) + + children, _, n_leaves, _ = _TREE_BUILDERS['single'](X) + + # Sort the order of child nodes per row for consistency + children.sort(axis=1) + assert_array_equal(children, children_scipy, + 'linkage tree differs' + ' from scipy impl for' + ' single linkage.') + + cut = _hc_cut(n_clusters, children, n_leaves) + cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves) + assess_same_labelling(cut, cut_scipy) + + def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix From 308a54e3aed87ec54348a976f8b07f346200b033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 22 Nov 2019 22:27:40 +0100 Subject: [PATCH 053/448] CI Use new conda syntax to select blas (#15705) --- azure-pipelines.yml | 8 ++++---- build_tools/azure/install.sh | 9 ++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2ee751bb6473f..5f84a1ae94857 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,7 +26,7 @@ jobs: pylatest_conda_mkl: DISTRIB: 'conda' PYTHON_VERSION: '*' - INSTALL_MKL: 'true' + BLAS: 'mkl' NUMPY_VERSION: '*' SCIPY_VERSION: '*' CYTHON_VERSION: '*' @@ -52,7 +52,7 @@ jobs: py35_conda_openblas: DISTRIB: 'conda' PYTHON_VERSION: '3.5' - INSTALL_MKL: 'false' + BLAS: 'openblas' NUMPY_VERSION: '1.11.0' SCIPY_VERSION: '0.17.0' PANDAS_VERSION: '*' @@ -96,7 +96,7 @@ jobs: pylatest_conda_mkl: DISTRIB: 'conda' PYTHON_VERSION: '*' - INSTALL_MKL: 'true' + BLAS: 'mkl' NUMPY_VERSION: '*' SCIPY_VERSION: '*' CYTHON_VERSION: '*' @@ -107,7 +107,7 @@ jobs: pylatest_conda_mkl_no_openmp: DISTRIB: 'conda' PYTHON_VERSION: '*' - INSTALL_MKL: 'true' + BLAS: 'mkl' NUMPY_VERSION: '*' SCIPY_VERSION: '*' CYTHON_VERSION: '*' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 61ee6bac7116f..084c497eb65aa 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -22,13 +22,8 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="python=$PYTHON_VERSION pip \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ - cython=$CYTHON_VERSION joblib=$JOBLIB_VERSION" - - if [[ "$INSTALL_MKL" == "true" ]]; then - TO_INSTALL="$TO_INSTALL mkl" - else - TO_INSTALL="$TO_INSTALL nomkl" - fi + cython=$CYTHON_VERSION joblib=$JOBLIB_VERSION\ + blas[build=$BLAS]" if [[ -n "$PANDAS_VERSION" ]]; then TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION" From c1ba7bfe43c2157a3fbb0acc1a49c9f14363f49c Mon Sep 17 00:00:00 2001 From: J-A16 Date: Sun, 24 Nov 2019 12:45:49 -0500 Subject: [PATCH 054/448] DOC versionadded labels for NearestNeighbors, KNeighborsRegressor and RadiusNeighborsRegressor (#15688) --- sklearn/neighbors/_regression.py | 4 ++++ sklearn/neighbors/_unsupervised.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index caaf3da7d74fe..e8eafacf9effe 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -30,6 +30,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Read more in the :ref:`User Guide `. + .. 
versionadded:: 0.9 + Parameters ---------- n_neighbors : int, optional (default = 5) @@ -203,6 +205,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Read more in the :ref:`User Guide `. + .. versionadded:: 0.9 + Parameters ---------- radius : float, optional (default = 1.0) diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index d760840c00ea4..79599791a96a1 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -11,6 +11,8 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Read more in the :ref:`User Guide `. + .. versionadded:: 0.9 + Parameters ---------- n_neighbors : int, optional (default = 5) From 9f5b97119b08789740876e384d04b4a2e314bc8b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 25 Nov 2019 02:34:54 -0500 Subject: [PATCH 055/448] CLN Removes ccp_alpha from RandomTreesEmbedding (#15708) --- doc/whats_new/v0.22.rst | 1 - sklearn/ensemble/_forest.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e3b876b0a663b..7bbd89dba76f1 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -829,7 +829,6 @@ Changelog :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`, :class:`ensemble.GradientBoostingClassifier`, and :class:`ensemble.GradientBoostingRegressor`. :pr:`12887` by `Thomas Fan`_. diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9c7d157314870..e06c69bf1dae2 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2112,14 +2112,6 @@ class RandomTreesEmbedding(BaseForest): and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. - ccp_alpha : non-negative float, optional (default=0.0) - Complexity parameter used for Minimal Cost-Complexity Pruning. The - subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. See - :ref:`minimal_cost_complexity_pruning` for details. - - .. versionadded:: 0.22 - max_samples : int or float, default=None If bootstrap is True, the number of samples to draw from X to train each base estimator. 
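For readers unfamiliar with the estimator touched by this diff, a minimal usage sketch (not part of the patch) of ``RandomTreesEmbedding``, which maps each sample to a sparse one-hot encoding of the leaves it falls into; the parameter values below are arbitrary::

    import numpy as np
    from sklearn.ensemble import RandomTreesEmbedding

    X = np.random.RandomState(0).rand(100, 2)

    # Unsupervised transformation: each tree contributes a block of one-hot
    # leaf indicators, so the output is a high-dimensional sparse matrix.
    embedder = RandomTreesEmbedding(n_estimators=10, max_depth=3,
                                    random_state=0)
    X_sparse = embedder.fit_transform(X)
    print(X_sparse.shape)  # (100, total number of leaves across the 10 trees)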
@@ -2163,7 +2155,6 @@ def __init__(self, random_state=None, verbose=0, warm_start=False, - ccp_alpha=0.0, max_samples=None): super().__init__( base_estimator=ExtraTreeRegressor(), @@ -2172,7 +2163,7 @@ def __init__(self, "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_impurity_split", - "random_state", "ccp_alpha"), + "random_state"), bootstrap=False, oob_score=False, n_jobs=n_jobs, @@ -2189,7 +2180,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output - self.ccp_alpha = ccp_alpha def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") From fc46a13d57be800da2a8a6b2f8e2621d132ac508 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 25 Nov 2019 03:38:33 -0600 Subject: [PATCH 056/448] DOC Wrong statement in release highlight (#15704) --- .../release_highlights/plot_release_highlights_0_22_0.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index 45db9166bba3a..450700d143ca2 100644 --- a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -246,11 +246,10 @@ def test_sklearn_compatible_estimator(estimator, check): # classification. Two averaging strategies are currently supported: the # one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and # the one-vs-rest algorithm computes the average of the ROC AUC scores for each -# class against all other classes. In both cases, the predicted labels are -# provided in an array with values from 0 to ``n_classes``, and the scores -# correspond to the probability estimates that a sample belongs to a particular -# class. The OvO and OvR algorithms supports weighting uniformly -# (``average='macro'``) and weighting by the prevalence +# class against all other classes. In both cases, the multiclass ROC AUC scores +# are computed from the probability estimates that a sample belongs to a +# particular class according to the model. The OvO and OvR algorithms support +# weighting uniformly (``average='macro'``) and weighting by the prevalence # (``average='weighted'``). # # Read more in the :ref:`User Guide `. From 4f97facc3a992c6e2459c3da86c9d69b0688d5ab Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 25 Nov 2019 11:40:05 +0100 Subject: [PATCH 057/448] DOC Further 0.22 whats_new cleanup (#15675) * whats_new cleanup * fix indent * reorder hgbt ones * add Guillaume to contributors and remove unnecessary entry --- doc/whats_new/_contributors.rst | 2 + doc/whats_new/v0.22.rst | 238 +++++++++++++++----------------- 2 files changed, 116 insertions(+), 124 deletions(-) diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index b148c7f1139ea..cc3957eca1592 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -175,3 +175,5 @@ .. _Thomas Fan: https://github.com/thomasjpfan .. _Nicolas Hug: https://github.com/NicolasHug + +.. 
_Guillaume Lemaitre: https://github.com/glemaitre diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7bbd89dba76f1..f62e380085c82 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -182,7 +182,7 @@ Changelog - |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to select the proper columns when using a boolean list, with NumPy older than 1.12. - :pr:`14510` by :user:`Guillaume Lemaitre `. + :pr:`14510` by `Guillaume Lemaitre`_. - |Fix| Fixed a bug in :class:`compose.TransformedTargetRegressor` which did not pass `**fit_params` to the underlying regressor. @@ -197,6 +197,11 @@ Changelog :mod:`sklearn.cross_decomposition` .................................. +- |Feature| :class:`cross_decomposition.PLSCanonical` and + :class:`cross_decomposition.PLSRegression` have a new function + ``inverse_transform`` to transform data to the original space. + :pr:`15304` by :user:`Jaime Ferrando Huertas `. + - |Enhancement| :class:`decomposition.KernelPCA` now properly checks the eigenvalues found by the solver for numerical or conditioning issues. This ensures consistency of results across solvers (different choices for @@ -209,12 +214,7 @@ Changelog with a target matrix `Y` in which the first column was constant. :issue:`13609` by :user:`Camila Williamson `. -- |Feature| :class:`cross_decomposition.PLSCanonical` and - :class:`cross_decomposition.PLSRegression` have a new function - ``inverse_transform`` to transform data to the original space`. - :pr:`15304` by :user:`Jaime Ferrando Huertas `. - -- |Fix| :class:`cross_decomposition.CCA` now produces the same results with +- |Fix| :class:`cross_decomposition.CCA` now produces the same results with scipy 1.3 and previous scipy versions. :pr:`15661` by `Thomas Fan`_. :mod:`sklearn.datasets` @@ -234,17 +234,21 @@ Changelog `weights` parameter, i.e. list or numpy.array, instead of list only. :pr:`14764` by :user:`Cat Chenal `. +- |Enhancement| The parameter `normalize` was added to + :func:`datasets.fetch_20newsgroups_vectorized`. + :pr:`14740` by :user:`Stéphan Tulkens ` + - |Fix| Fixed a bug in :func:`datasets.fetch_openml`, which failed to load an OpenML dataset that contains an ignored feature. :pr:`14623` by :user:`Sarra Habchi `. - - |Enhancement| The parameter `normalize` was added to - :func:`datasets.fetch_20newsgroups_vectorized`. - :pr:`14740` by :user:`Stéphan Tulkens ` - :mod:`sklearn.decomposition` ............................ +- |Efficiency| :class:`decomposition.NMF(solver='mu')` fitted on sparse input + matrices now uses batching to avoid briefly allocating an array with size + (#non-zero elements, n_components). :pr:`15257` by `Mart Willocx `_. + - |Enhancement| :func:`decomposition.dict_learning()` and :func:`decomposition.dict_learning_online()` now accept `method_max_iter` and pass it to :meth:`decomposition.sparse_encode`. @@ -266,22 +270,18 @@ Changelog underlying :class:`linear_model.LassoLars` when `algorithm='lasso_lars'`. :issue:`12650` by `Adrin Jalali`_. -- |Efficiency| :class:`decomposition.NMF(solver='mu')` fitted on sparse input - matrices now uses batching to avoid briefly allocating an array with size - (#non-zero elements, n_components). :pr:`15257` by `Mart Willocx `_. - :mod:`sklearn.dummy` .................... +- |Fix| :class:`dummy.DummyClassifier` now handles checking the existence + of the provided constant in multiouput cases. + :pr:`14908` by :user:`Martina G. Vilas `. 
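A minimal sketch of the multioutput ``constant`` strategy that the fix above exercises, assuming scikit-learn 0.22 (illustrative only, not part of this patch)::

    import numpy as np
    from sklearn.dummy import DummyClassifier

    X = np.zeros((4, 2))                       # features are ignored by the dummy model
    y = np.array([["yes", "a"], ["no", "b"],
                  ["yes", "a"], ["no", "b"]])  # two output columns
    clf = DummyClassifier(strategy="constant", constant=["yes", "a"]).fit(X, y)
    print(clf.predict(X[:2]))                  # always predicts ["yes", "a"] for both outputs
    # A constant that never appears in a column, e.g. ["maybe", "a"], should now
    # raise a ValueError at fit time instead of failing later.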
+ - |API| The default value of the `strategy` parameter in :class:`dummy.DummyClassifier` will change from `'stratified'` in version 0.22 to `'prior'` in 0.24. A FutureWarning is raised when the default value is used. :pr:`15382` by `Thomas Fan`_. -- |Fix| :class:`dummy.DummyClassifier` now handles checking the existence - of the provided constant in multiouput cases. - :pr:`14908` by :user:`Martina G. Vilas `. - - |API| The ``outputs_2d_`` attribute is deprecated in :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`. It is equivalent to ``n_outputs > 1``. :pr:`14933` by `Nicolas Hug`_ @@ -305,25 +305,37 @@ Changelog and `Olivier Grisel`_. - |Feature| Estimators now have an additional `warm_start` parameter that enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. + - |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for both estimators. :pr:`13769` by `Nicolas Hug`_. - |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the training loss or score is now monitored on a class-wise stratified subsample to preserve the class balance of the original training set. :pr:`14194` by :user:`Johann Faouzi `. - - |Feature| :func:`inspection.partial_dependence` and - :func:`inspection.plot_partial_dependence` now support the fast 'recursion' - method for both estimators. :pr:`13769` by `Nicolas Hug`_. - |Enhancement| :class:`ensemble.HistGradientBoostingRegressor` now supports the 'least_absolute_deviation' loss. :pr:`13896` by `Nicolas Hug`_. - |Fix| Estimators now bin the training and validation data separately to avoid any data leak. :pr:`13933` by `Nicolas Hug`_. - |Fix| Fixed a bug where early stopping would break with string targets. - :pr:`14710` by :user:`Guillaume Lemaitre `. + :pr:`14710` by `Guillaume Lemaitre`_. - |Fix| :class:`ensemble.HistGradientBoostingClassifier` now raises an error if ``categorical_crossentropy`` loss is given for a binary classification problem. :pr:`14869` by `Adrin Jalali`_. Note that pickles from 0.21 will not work in 0.22. +- |Enhancement| Addition of ``max_samples`` argument allows limiting + size of bootstrap samples to be less than size of dataset. Added to + :class:`ensemble.ForestClassifier`, + :class:`ensemble.ForestRegressor`, + :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, + :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.ExtraTreesRegressor`, + :class:`ensemble.RandomTreesEmbedding`. :pr:`14682` by + :user:`Matt Hancock ` and + :pr:`5963` by :user:`Pablo Duboue `. + - |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be present when `voting='hard'`. :pr:`14287` by `Thomas Fan`_. @@ -339,42 +351,30 @@ Changelog failing when the underlying estimators were not outputting consistent array dimensions. Note that it should be replaced by refactoring the common tests in the future. - :pr:`14305` by :user:`Guillaume Lemaitre `. + :pr:`14305` by `Guillaume Lemaitre`_. - |Fix| :class:`ensemble.AdaBoostClassifier` computes probabilities based on the decision function as in the literature. Thus, `predict` and `predict_proba` give consistent results. - :pr:`14114` by :user:`Guillaume Lemaitre `. - -- |API| ``presort`` is now deprecated in - :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor`, and the parameter has no effect. 
- Users are recommended to use :class:`ensemble.HistGradientBoostingClassifier` - and :class:`ensemble.HistGradientBoostingRegressor` instead. - :pr:`14907` by `Adrin Jalali`_. - -- |Enhancement| Addition of ``max_samples`` argument allows limiting - size of bootstrap samples to be less than size of dataset. Added to - :class:`ensemble.ForestClassifier`, - :class:`ensemble.ForestRegressor`, - :class:`ensemble.RandomForestClassifier`, - :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, - :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`. :pr:`14682` by - :user:`Matt Hancock ` and - :pr:`5963` by :user:`Pablo Duboue `. + :pr:`14114` by `Guillaume Lemaitre`_. - |Fix| Stacking and Voting estimators now ensure that their underlying estimators are either all classifiers or all regressors. :class:`ensemble.StackingClassifier`, :class:`ensemble.StackingRegressor`, and :class:`ensemble.VotingClassifier` and :class:`VotingRegressor` now raise consistent error messages. - :pr:`15084` by :user:`Guillaume Lemaitre `. + :pr:`15084` by `Guillaume Lemaitre`_. - |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized by the max of the samples with non-null weights only. - :pr:`14294` by :user:`Guillaume Lemaitre `. + :pr:`14294` by `Guillaume Lemaitre`_. + +- |API| ``presort`` is now deprecated in + :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor`, and the parameter has no effect. + Users are recommended to use :class:`ensemble.HistGradientBoostingClassifier` + and :class:`ensemble.HistGradientBoostingRegressor` instead. + :pr:`14907` by `Adrin Jalali`_. :mod:`sklearn.feature_extraction` ................................. @@ -390,11 +390,6 @@ Changelog :class:`feature_extraction.text.VectorizerMixin` can now be pickled. :pr:`14430` by :user:`Dillon Niederhut `. -- |API| Deprecated unused `copy` param for - :meth:`feature_extraction.text.TfidfVectorizer.transform` it will be - removed in v0.24. :pr:`14520` by - :user:`Guillem G. Subies `. - - |Fix| :func:`feature_extraction.text.strip_accents_unicode` now correctly removes accents from strings that are in NFKD normalized form. :pr:`15100` by :user:`Daniel Grady `. @@ -403,6 +398,11 @@ Changelog an `OverflowError` during the `transform` operation when producing a `scipy.sparse` matrix on large input data. :pr:`15463` by :user:`Norvan Sahiner `. +- |API| Deprecated unused `copy` param for + :meth:`feature_extraction.text.TfidfVectorizer.transform` it will be + removed in v0.24. :pr:`14520` by + :user:`Guillem G. Subies `. + :mod:`sklearn.feature_selection` ................................ @@ -429,11 +429,6 @@ Changelog the kernel attribute is modified, but may result in a performance improvement. :pr:`14378` by :user:`Masashi Shibata `. -- |API| From version 0.24 :meth:`gaussian_process.kernels.Kernel.get_params` will raise an - ``AttributeError`` rather than return ``None`` for parameters that are in the - estimator's constructor but not stored as attributes on the instance. - :pr:`14464` by `Joel Nothman`_. - - |Feature| Gaussian process models on structured data: :class:`gaussian_process.GaussianProcessRegressor` and :class:`gaussian_process.GaussianProcessClassifier` can now accept a list of generic objects (e.g. strings, trees, graphs, etc.) as the ``X`` argument @@ -443,28 +438,29 @@ Changelog to notify the GPR/GPC model that it handles non-vectorial samples. :pr:`15557` by :user:`Yu-Hang Tang `. 
+- |API| From version 0.24 :meth:`gaussian_process.kernels.Kernel.get_params` will raise an + ``AttributeError`` rather than return ``None`` for parameters that are in the + estimator's constructor but not stored as attributes on the instance. + :pr:`14464` by `Joel Nothman`_. + :mod:`sklearn.impute` ..................... - |MajorFeature| Added :class:`impute.KNNImputer`, to impute missing values using k-Nearest Neighbors. :issue:`12852` by :user:`Ashim Bhattarai ` and - `Thomas Fan`_. - -- |Enhancement| Adds parameter `add_indicator` to :class:`impute.KNNImputer` - to get indicator of missing data. - :pr:`15010` by :user:`Guillaume Lemaitre `. + `Thomas Fan`_ and :pr:`15010` by `Guillaume Lemaitre`_. - |Feature| :class:`impute.IterativeImputer` has new `skip_compute` flag that is False by default, which, when True, will skip computation on features that have no missing values during the fit phase. :issue:`13773` by :user:`Sergey Feldman `. -- |Fix| :class:`impute.IterativeImputer` now works when there is only one feature. - By :user:`Sergey Feldman `. - - |Efficiency| :meth:`impute.MissingIndicator.fit_transform` avoid repeated computation of the masked matrix. :pr:`14356` by :user:`Harsh Soni `. +- |Fix| :class:`impute.IterativeImputer` now works when there is only one feature. + By :user:`Sergey Feldman `. + - |Fix| Fixed a bug in :class:`impute.IterativeImputer` where features where imputed in the reverse desired order with ``imputation_order`` either ``"ascending"`` or ``"descending"``. :pr:`15393` by @@ -491,7 +487,7 @@ Changelog and :class:`pipeline.Pipeline` containing :class:`compose.ColumnTransformer`. In addition :func:`inspection.plot_partial_dependence` will use the column names by default when a dataframe is passed. - :pr:`14028` and :pr:`15429` by :user:`Guillaume Lemaitre `. + :pr:`14028` and :pr:`15429` by `Guillaume Lemaitre`_. :mod:`sklearn.kernel_approximation` ................................... @@ -503,15 +499,15 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Efficiency| The 'liblinear' logistic regression solver is now faster and + requires less memory. + :pr:`14108`, :pr:`14170`, :pr:`14296` by :user:`Alex Henrie `. + - |Enhancement| :class:`linear_model.BayesianRidge` now accepts hyperparameters ``alpha_init`` and ``lambda_init`` which can be used to set the initial value of the maximization procedure in :term:`fit`. :pr:`13618` by :user:`Yoshihiro Uchida `. -- |Efficiency| The 'liblinear' logistic regression solver is now faster and - requires less memory. - :pr:`14108`, :pr:`14170`, :pr:`14296` by :user:`Alex Henrie `. - - |Fix| :class:`linear_model.Ridge` now correctly fits an intercept when `X` is sparse, `solver="auto"` and `fit_intercept=True`, because the default solver in this configuration has changed to `sparse_cg`, which can fit an intercept @@ -519,7 +515,7 @@ Changelog - |Fix| :class:`linear_model.Ridge` with `solver='sag'` now accepts F-ordered and non-contiguous arrays and makes a conversion instead of failing. - :pr:`14458` by :user:`Guillaume Lemaitre `. + :pr:`14458` by `Guillaume Lemaitre`_. - |Fix| :class:`linear_model.LassoCV` no longer forces ``precompute=False`` when fitting the final model. :pr:`14591` by `Andreas Müller`_. @@ -538,7 +534,7 @@ Changelog :pr:`15086` by :user:`Alex Gramfort `. - |Fix| The liblinear solver now supports ``sample_weight``. - :pr:`15038` by :user:`Guillaume Lemaitre `. + :pr:`15038` by `Guillaume Lemaitre`_. :mod:`sklearn.manifold` ....................... 
@@ -557,9 +553,6 @@ Changelog ``method="barnes-hut"`` by computing the gradient in parallel. :pr:`13213` by :user:`Thomas Moreau ` -- |API| Deprecate ``training_data_`` unused attribute in - :class:`manifold.Isomap`. :issue:`10482` by `Tom Dupre la Tour`_. - - |Fix| Fixed a bug where :func:`manifold.spectral_embedding` (and therefore :class:`manifold.SpectralEmbedding` and :class:`cluster.SpectralClustering`) computed wrong eigenvalues with ``eigen_solver='amg'`` when @@ -571,9 +564,16 @@ Changelog :issue:`13393` by :user:`Andrew Knyazev ` :pr:`13707` by :user:`Scott White ` +- |API| Deprecate ``training_data_`` unused attribute in + :class:`manifold.Isomap`. :issue:`10482` by `Tom Dupre la Tour`_. + :mod:`sklearn.metrics` ...................... +- |MajorFeature| :func:`metrics.plot_roc_curve` has been added to plot roc + curves. This function introduces the visualization API described in + the :ref:`User Guide `. :pr:`14357` by `Thomas Fan`_. + - |Feature| Added a new parameter ``zero_division`` to multiple classification metrics: :func:`precision_score`, :func:`recall_score`, :func:`f1_score`, :func:`fbeta_score`, :func:`precision_recall_fscore_support`, @@ -581,10 +581,6 @@ Changelog ill-defined metrics. :pr:`14900` by :user:`Marc Torrellas Socastro `. -- |MajorFeature| :func:`metrics.plot_roc_curve` has been added to plot roc - curves. This function introduces the visualization API described in - the :ref:`User Guide `. :pr:`14357` by `Thomas Fan`_. - - |Feature| Added the :func:`metrics.pairwise.nan_euclidean_distances` metric, which calculates euclidean distances in the presence of missing values. :issue:`12852` by :user:`Ashim Bhattarai ` and `Thomas Fan`_. @@ -615,6 +611,10 @@ Changelog :pr:`13938` by :user:`Christian Lorentzen ` and `Roman Yurchak`_. +- |Efficiency| Improved performance of + :func:`metrics.pairwise.manhattan_distances` in the case of sparse matrices. + :pr:`15049` by `Paolo Toccaceli `. + - |Enhancement| The parameter ``beta`` in :func:`metrics.fbeta_score` is updated to accept the zero and `float('+inf')` value. :pr:`13231` by :user:`Dong-hee Na `. @@ -626,23 +626,11 @@ Changelog - |Enhancement| Allow computing averaged metrics in the case of no true positives. :pr:`14595` by `Andreas Müller`_. -- |Fix| Raise a ValueError in :func:`metrics.silhouette_score` when a - precomputed distance matrix contains non-zero diagonal entries. - :pr:`12258` by :user:`Stephen Tierney `. - - |Enhancement| Multilabel metrics now supports list of lists as input. :pr:`14865` :user:`Srivatsan Ramesh `, :user:`Herilalaina Rakotoarison `, :user:`Léonard Binet `. -- |API| ``scoring="neg_brier_score"`` should be used instead of - ``scoring="brier_score_loss"`` which is now deprecated. - :pr:`14898` by :user:`Stefan Matcovici `. - -- |Efficiency| Improved performance of - :func:`metrics.pairwise.manhattan_distances` in the case of sparse matrices. - :pr:`15049` by `Paolo Toccaceli `. - - |Enhancement| :func:`metrics.median_absolute_error` now supports ``multioutput`` parameter. :pr:`14732` by :user:`Agamemnon Krasoulis `. @@ -656,6 +644,14 @@ Changelog overall. :pr:`15625` by `Guillaume Lemaitre `. +- |Fix| Raise a ValueError in :func:`metrics.silhouette_score` when a + precomputed distance matrix contains non-zero diagonal entries. + :pr:`12258` by :user:`Stephen Tierney `. + +- |API| ``scoring="neg_brier_score"`` should be used instead of + ``scoring="brier_score_loss"`` which is now deprecated. + :pr:`14898` by :user:`Stefan Matcovici `. 
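A small usage sketch for the ``neg_brier_score`` scorer mentioned above, assuming scikit-learn 0.22 (illustrative only, not part of this patch)::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(random_state=0)
    # Like other error metrics, the Brier score is negated so that greater is better.
    scores = cross_val_score(LogisticRegression(), X, y, cv=5,
                             scoring="neg_brier_score")
    print(scores.mean())   # values are <= 0; closer to 0 is better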
+ :mod:`sklearn.model_selection` .............................. @@ -676,16 +672,16 @@ Changelog where one test set could be `n_classes` larger than another. Test sets should now be near-equally sized. :pr:`14704` by `Joel Nothman`_. +- |Fix| The `cv_results_` attribute of :class:`model_selection.GridSearchCV` + and :class:`model_selection.RandomizedSearchCV` now only contains unfitted + estimators. This potentially saves a lot of memory since the state of the + estimators isn't stored. :pr:`#15096` by `Andreas Müller`_. + - |API| :class:`model_selection.KFold` and :class:`model_selection.StratifiedKFold` now raise a warning if `random_state` is set but `shuffle` is False. This will raise an error in 0.24. -- |Fix| The `cv_results_` attribute of :class:`model_selection.GridSearchCV` - and :class:`model_selection.RandomizedSearchCV` now only contains unfitted - estimators. This potentially saves a lot of memory since the state of the - estimators isn't stored. :pr:`#15096` by :user:`Andreas Müller `. - :mod:`sklearn.multioutput` .......................... @@ -752,13 +748,13 @@ Changelog the final estimator does. :pr:`13806` by :user:`Anaël Beaugnon `. +- |Fix| The `fit` in :class:`~pipeline.FeatureUnion` now accepts `fit_params` + to pass to the underlying transformers. :pr:`15119` by `Adrin Jalali`_. + - |API| `None` as a transformer is now deprecated in :class:`pipeline.FeatureUnion`. Please use `'drop'` instead. :pr:`15053` by `Thomas Fan`_. -- |Fix| The `fit` in :class:`~pipeline.FeatureUnion` now accepts `fit_params` - to pass to the underlying transformers. :pr:`15119` by `Adrin Jalali`_. - :mod:`sklearn.preprocessing` ............................ @@ -787,7 +783,6 @@ Changelog :pr:`13925` by :user:`Isaac S. Robson ` and :pr:`15524` by :user:`Xun Tang `. - :mod:`sklearn.svm` .................. @@ -816,7 +811,7 @@ Changelog ZeroDivisionError. :pr:`14894` by :user:`Danna Naser `. - |Fix| The liblinear solver now supports ``sample_weight``. - :pr:`15038` by :user:`Guillaume Lemaitre `. + :pr:`15038` by `Guillaume Lemaitre`_. :mod:`sklearn.tree` @@ -856,22 +851,7 @@ Changelog :func:`~utils.estimator_checks.parametrize_with_checks`, to parametrize estimator checks for a list of estimators. :pr:`14381` by `Thomas Fan`_. -- |API| The following utils have been deprecated and are now private: - - - ``utils.choose_check_classifiers_labels`` - - ``utils.enforce_estimator_tags_y`` - - ``utils.optimize.newton_cg`` - - ``utils.random.random_choice_csc`` - - ``utils.safe_indexing`` - - ``utils.mocking`` - - ``utils.fast_dict`` - - ``utils.seq_dataset`` - - ``utils.weight_vector`` - - ``utils.fixes.parallel_helper`` (removed) - - All of ``utils.testing`` except for ``all_estimators`` which is now in - ``utils``. - -- A new random variable, :class:`utils.fixes.loguniform` implements a +- |Feature| A new random variable, :class:`utils.fixes.loguniform` implements a log-uniform random variable (e.g., for use in RandomizedSearchCV). For example, the outcomes ``1``, ``10`` and ``100`` are all equally likely for ``loguniform(1, 100)``. See :issue:`11232` by @@ -882,7 +862,7 @@ Changelog ``axis`` parameter to index array-like across rows and columns. The column indexing can be done on NumPy array, SciPy sparse matrix, and Pandas DataFrame. An additional refactoring was done. :pr:`14035` and :pr:`14475` - by :user:`Guillaume Lemaitre `. + by `Guillaume Lemaitre`_. - |Enhancement| :func:`utils.extmath.safe_sparse_dot` works between 3D+ ndarray and sparse matrix. 
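The ``loguniform`` entry above can be exercised as follows; the import path assumes scikit-learn 0.22, where it is exposed as ``sklearn.utils.fixes.loguniform`` (illustrative only, not part of this patch)::

    from sklearn.datasets import make_classification
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC
    from sklearn.utils.fixes import loguniform

    X, y = make_classification(random_state=0)
    # C is drawn log-uniformly: 0.01, 1 and 100 are all equally likely.
    search = RandomizedSearchCV(SVC(), {"C": loguniform(1e-2, 1e2)},
                                n_iter=10, cv=3, random_state=0)
    search.fit(X, y)
    print(search.best_params_)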
@@ -904,6 +884,18 @@ Changelog - ``mocking.CheckingClassifier`` - ``optimize.newton_cg`` - ``random.random_choice_csc`` + - ``utils.choose_check_classifiers_labels`` + - ``utils.enforce_estimator_tags_y`` + - ``utils.optimize.newton_cg`` + - ``utils.random.random_choice_csc`` + - ``utils.safe_indexing`` + - ``utils.mocking`` + - ``utils.fast_dict`` + - ``utils.seq_dataset`` + - ``utils.weight_vector`` + - ``utils.fixes.parallel_helper`` (removed) + - All of ``utils.testing`` except for ``all_estimators`` which is now in + ``utils``. :mod:`sklearn.isotonic` .................................. @@ -912,10 +904,13 @@ Changelog when `X.dtype == 'float32'` and `X.dtype != y.dtype`. :pr:`14902` by :user:`Lucas `. - Miscellaneous ............. +- |Fix| Port `lobpcg` from SciPy which implement some bug fixes but only + available in 1.3+. + :pr:`13609` and :pr:`14971` by `Guillaume Lemaitre`_. + - |API| Scikit-learn now converts any input data structure implementing a duck array to a numpy array (using ``__array__``) to ensure consistent behavior instead of relying on ``__array_function__`` (see `NEP 18 @@ -926,10 +921,6 @@ Miscellaneous using a non-fitted estimators are now more uniform. :pr:`13013` by :user:`Agamemnon Krasoulis `. -- |Fix| Port `lobpcg` from SciPy which implement some bug fixes but only - available in 1.3+. - :pr:`13609` and :pr:`14971` by :user:`Guillaume Lemaitre `. - Changes to estimator checks --------------------------- @@ -966,5 +957,4 @@ These changes mostly affect library developers. - |Fix| Added ``check_transformer_data_not_an_array`` to checks where missing - |Fix| The estimators tags resolution now follows the regular MRO. They used - to be overridable only once. :pr:`14884` by :user:`Andreas Müller - `. + to be overridable only once. :pr:`14884` by `Andreas Müller`_. From c4733f4895c1becdf587b38970f6f7066656e3f9 Mon Sep 17 00:00:00 2001 From: Brian Wignall Date: Tue, 26 Nov 2019 07:31:40 -0500 Subject: [PATCH 058/448] MNT Fix some easy-to-make typos (#15720) --- build_tools/azure/install.sh | 2 +- doc/developers/advanced_installation.rst | 2 +- doc/modules/computing.rst | 4 ++-- doc/modules/model_evaluation.rst | 2 +- doc/modules/neighbors.rst | 2 +- doc/whats_new/v0.20.rst | 2 +- doc/whats_new/v0.21.rst | 2 +- doc/whats_new/v0.22.rst | 2 +- examples/inspection/plot_partial_dependence.py | 2 +- sklearn/decomposition/_dict_learning.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/binning.py | 6 +++--- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- sklearn/ensemble/tests/test_gradient_boosting.py | 2 +- sklearn/externals/_arff.py | 2 +- sklearn/metrics/_regression.py | 2 +- sklearn/metrics/tests/test_common.py | 2 +- sklearn/metrics/tests/test_score_objects.py | 6 +++--- sklearn/model_selection/_search.py | 4 ++-- 18 files changed, 25 insertions(+), 25 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 084c497eb65aa..f4f60df8a9626 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -11,7 +11,7 @@ make_conda() { } version_ge() { - # The two version numbers are seperated with a new line is piped to sort + # The two version numbers are separated with a new line is piped to sort # -rV. The -V activates for version number sorting and -r sorts in # decending order. If the first argument is the top element of the sort, it # is greater than or equal to the second argument. 
diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index bcb305116676d..c58eb14e828d2 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -374,7 +374,7 @@ Finally, build the package using the standard command:: pip install --verbose --editable . -For the upcomming FreeBSD 12.1 and 11.3 versions, OpenMP will be included in +For the upcoming FreeBSD 12.1 and 11.3 versions, OpenMP will be included in the base system and these steps will not be necessary. .. _OpenMP: https://en.wikipedia.org/wiki/OpenMP diff --git a/doc/modules/computing.rst b/doc/modules/computing.rst index 176b8e22fca1c..246085d436cde 100644 --- a/doc/modules/computing.rst +++ b/doc/modules/computing.rst @@ -529,7 +529,7 @@ Joblib-based parallelism ........................ When the underlying implementation uses joblib, the number of workers -(threads or processes) that are spawned in parallel can be controled via the +(threads or processes) that are spawned in parallel can be controlled via the ``n_jobs`` parameter. .. note:: @@ -666,7 +666,7 @@ Python runtime :working_memory: - the optimal size of temporary arrays used by some algoritms. + the optimal size of temporary arrays used by some algorithms. .. _environment_variable: diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 1410887c4c51f..7af1e46578de6 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1720,7 +1720,7 @@ relevant), NDCG can be used. For one sample, given the vector of continuous ground-truth values for each target :math:`y \in \mathbb{R}^{M}`, where :math:`M` is the number of outputs, and -the prediction :math:`\hat{y}`, which induces the ranking funtion :math:`f`, the +the prediction :math:`\hat{y}`, which induces the ranking function :math:`f`, the DCG score is .. math:: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 7f72aa68c38db..9aa27a53501b8 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -581,7 +581,7 @@ implementation with special data types. The precomputed neighbors training point as its own neighbor in the count of `n_neighbors`. However, for compatibility reasons with other estimators which use the other definition, one extra neighbor will be computed when `mode == 'distance'`. - To maximise compatiblity with all estimators, a safe choice is to always + To maximise compatibility with all estimators, a safe choice is to always include one extra neighbor in a custom nearest neighbors estimator, since unnecessary neighbors will be filtered by following estimators. diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 4e3a4891b70e2..2eaf3199fbc3c 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -709,7 +709,7 @@ Support for Python 3.3 has been officially dropped. - |Feature| |Fix| :class:`decomposition.SparsePCA` now exposes ``normalize_components``. When set to True, the train and test data are - centered with the train mean repsectively during the fit phase and the + centered with the train mean respectively during the fit phase and the transform phase. This fixes the behavior of SparsePCA. When set to False, which is the default, the previous abnormal behaviour still holds. The False value is for backward compatibility and should not be used. 
:issue:`11585` diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 59e3774e76c69..94099723dd0ec 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -295,7 +295,7 @@ Support for Python 3.4 and below has been officially dropped. ...................... - |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an - algoritm related to :class:`cluster.DBSCAN`, that has hyperparameters easier + algorithm related to :class:`cluster.DBSCAN`, that has hyperparameters easier to set and that scales better, by :user:`Shane `, `Adrin Jalali`_, :user:`Erich Schubert `, `Hanmin Qin`_, and :user:`Assia Benbihi `. diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f62e380085c82..3a7e644d79ee7 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -799,7 +799,7 @@ Changelog - |Fix| :class:`svm.SVC`, :class:`svm.SVR`, :class:`svm.NuSVR` and :class:`svm.OneClassSVM` when received values negative or zero for parameter ``sample_weight`` in method fit(), generated an - invalid model. This behavior occured only in some border scenarios. + invalid model. This behavior occurred only in some border scenarios. Now in these cases, fit() will fail with an Exception. :pr:`14286` by :user:`Alex Shacked `. diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 526ace208e30f..d74c6363dec06 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -14,7 +14,7 @@ :class:`~sklearn.ensemble.HistGradientBoostingRegressor` trained on the California housing dataset. The example is taken from [1]_. -The plots show four 1-way and two 1-way partial dependence plots (ommitted for +The plots show four 1-way and two 1-way partial dependence plots (omitted for :class:`~sklearn.neural_network.MLPRegressor` due to computation time). The target variables for the one-way PDP are: median income (`MedInc`), average occupants per household (`AvgOccup`), median house age (`HouseAge`), and diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 29839157ca33f..a2f3f601f4127 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -704,7 +704,7 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, inner_stats : tuple of (A, B) ndarrays Inner sufficient statistics that are kept by the algorithm. Passing them at initialization is useful in online settings, to - avoid loosing the history of the evolution. + avoid losing the history of the evolution. A (n_components, n_components) is the dictionary covariance matrix. B (n_features, n_components) is the data approximation matrix @@ -1351,7 +1351,7 @@ class MiniBatchDictionaryLearning(SparseCodingMixin, BaseEstimator): inner_stats_ : tuple of (A, B) ndarrays Internal sufficient statistics that are kept by the algorithm. - Keeping them is useful in online settings, to avoid loosing the + Keeping them is useful in online settings, to avoid losing the history of the evolution, but they shouldn't have any use for the end user. A (n_components, n_components) is the dictionary covariance matrix. 
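The ``inner_stats`` docstrings touched above describe the running ``(A, B)`` statistics used for online updates; a minimal sketch of the online usage they support, assuming scikit-learn 0.22 (illustrative only, not part of this patch)::

    import numpy as np
    from sklearn.decomposition import MiniBatchDictionaryLearning

    rng = np.random.RandomState(0)
    dico = MiniBatchDictionaryLearning(n_components=5, random_state=0)
    for _ in range(3):                      # stream three mini-batches
        dico.partial_fit(rng.randn(20, 8))
    A, B = dico.inner_stats_                # statistics carried between calls
    print(A.shape, B.shape)                 # (5, 5) and (8, 5)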
diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 18cddca2d867f..a4dec15763940 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -32,7 +32,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): instead of the quantiles. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be - randomly choosen to compute the quantiles. If ``None``, the whole data + randomly chosen to compute the quantiles. If ``None``, the whole data is used. random_state: int or numpy.random.RandomState or None Pseudo-random number generator to control the random sub-sampling. @@ -107,7 +107,7 @@ class _BinMapper(TransformerMixin, BaseEstimator): instead of the quantiles. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be - randomly choosen to compute the quantiles. If ``None``, the whole data + randomly chosen to compute the quantiles. If ``None``, the whole data is used. random_state: int or numpy.random.RandomState or None, \ optional (default=None) @@ -126,7 +126,7 @@ class _BinMapper(TransformerMixin, BaseEstimator): equal to ``n_bins - 1``. missing_values_bin_idx_ : uint8 The index of the bin where missing values are mapped. This is a - constant accross all features. This corresponds to the last bin, and + constant across all features. This corresponds to the last bin, and it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_`` is less than ``n_bins - 1`` for a given feature, then there are empty (and unused) bins. diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 117539a424119..87950eab38a97 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -413,7 +413,7 @@ def test_infinite_values_missing_values(): # High level test making sure that inf and nan values are properly handled # when both are present. This is similar to # test_split_on_nan_with_infinite_values() in test_grower.py, though we - # cannot check the predicitons for binned values here. + # cannot check the predictions for binned values here. X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) y_isnan = np.isnan(X.ravel()) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index d0100a1724a52..5fe9dee573d1d 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1311,7 +1311,7 @@ def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator): # Check that GradientBoostingRegressor works when init is a sklearn # estimator. # Check that an error is raised if trying to fit with sample weight but - # inital estimator does not support sample weight + # initial estimator does not support sample weight X, y = dataset_maker() sample_weight = np.random.RandomState(42).rand(100) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 4db55eb6d6c02..bf3cbfc9a9b98 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -98,7 +98,7 @@ The above keys must follow the case which were described, i.e., the keys are case sensitive. 
The attribute type ``attribute_type`` must be one of these strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or -``STRING``. For nominal attributes, the ``atribute_type`` must be a list of +``STRING``. For nominal attributes, the ``attribute_type`` must be a list of strings. In this format, the XOR dataset presented above can be represented as a python diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index d0226e62bb7ec..6c3c83a0c0c7c 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -717,7 +717,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): message = ("Mean Tweedie deviance error with power={} can only be used on " .format(power)) if power < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 + # 'Extreme stable', y_true any real number, y_pred > 0 if (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_pred.") dev = 2 * (np.power(np.maximum(y_true, 0), 2 - power) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 5f93810f0b407..991af61537012 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -115,7 +115,7 @@ "unnormalized_accuracy_score": partial(accuracy_score, normalize=False), # `confusion_matrix` returns absolute values and hence behaves unnormalized - # . Naming it with an unnormalized_ prefix is neccessary for this module to + # . Naming it with an unnormalized_ prefix is necessary for this module to # skip sample_weight scaling checks which will fail for unnormalized # metrics. "unnormalized_confusion_matrix": confusion_matrix, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 00ff5a3a0563e..64e88f37ed2bc 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -649,7 +649,7 @@ def predict(self, X): def test_multimetric_scorer_sanity_check(): - # scoring dictionary returned is the same as calling each scorer seperately + # scoring dictionary returned is the same as calling each scorer separately scorers = {'a1': 'accuracy', 'a2': 'accuracy', 'll1': 'neg_log_loss', 'll2': 'neg_log_loss', 'ra1': 'roc_auc', 'ra2': 'roc_auc'} @@ -664,13 +664,13 @@ def test_multimetric_scorer_sanity_check(): result = multi_scorer(clf, X, y) - seperate_scores = { + separate_scores = { name: get_scorer(name)(clf, X, y) for name in ['accuracy', 'neg_log_loss', 'roc_auc']} for key, value in result.items(): score_name = scorers[key] - assert_allclose(value, seperate_scores[score_name]) + assert_allclose(value, separate_scores[score_name]) @pytest.mark.parametrize('scorer_name, metric', [ diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 4c9b082d355fd..e6a8493ef6250 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -948,7 +948,7 @@ class GridSearchCV(BaseSearchCV): returns the selected ``best_index_`` given ``cv_results_``. In that case, the ``best_estimator_`` and ``best_parameters_`` will be set according to the returned ``best_index_`` while the ``best_score_`` - attribute will not be availble. + attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this @@ -1278,7 +1278,7 @@ class RandomizedSearchCV(BaseSearchCV): returns the selected ``best_index_`` given the ``cv_results``. 
In that case, the ``best_estimator_`` and ``best_parameters_`` will be set according to the returned ``best_index_`` while the ``best_score_`` - attribute will not be availble. + attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this From 63cd6004bc4f31222ac0b17b47691718123a4e22 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 26 Nov 2019 06:33:01 -0600 Subject: [PATCH 059/448] DOC Typo in the error message of _binary_clf_curve (#15703) --- sklearn/metrics/_ranking.py | 2 +- sklearn/metrics/tests/test_ranking.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index c88fe685e97c9..71731025e5649 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -539,7 +539,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): classes_repr = ", ".join(repr(c) for c in classes) raise ValueError("y_true takes value in {{{classes_repr}}} and " "pos_label is not specified: either make y_true " - "take integer value in {{0, 1}} or {{-1, 1}} or " + "take value in {{0, 1}} or {{-1, 1}} or " "pass pos_label explicitly.".format( classes_repr=classes_repr)) elif pos_label is None: diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index ae0296718f43a..7ce7cf3e3814e 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -683,7 +683,7 @@ def test_binary_clf_curve_implicit_pos_label(curve_func): # Check that using string class labels raises an informative # error for any supported string dtype: msg = ("y_true takes value in {'a', 'b'} and pos_label is " - "not specified: either make y_true take integer " + "not specified: either make y_true take " "value in {0, 1} or {-1, 1} or pass pos_label " "explicitly.") with pytest.raises(ValueError, match=msg): @@ -695,7 +695,7 @@ def test_binary_clf_curve_implicit_pos_label(curve_func): # The error message is slightly different for bytes-encoded # class labels, but otherwise the behavior is the same: msg = ("y_true takes value in {b'a', b'b'} and pos_label is " - "not specified: either make y_true take integer " + "not specified: either make y_true take " "value in {0, 1} or {-1, 1} or pass pos_label " "explicitly.") with pytest.raises(ValueError, match=msg): From 6419f65ec0183c01316f20c797ffcdc868ab141e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 26 Nov 2019 14:32:52 +0100 Subject: [PATCH 060/448] FIX remove max_samples in RandomTreesEmbedding (#15693) --- doc/whats_new/v0.22.rst | 5 +---- sklearn/ensemble/_forest.py | 16 ++-------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 3a7e644d79ee7..e14601c1b52a7 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -326,13 +326,10 @@ Changelog - |Enhancement| Addition of ``max_samples`` argument allows limiting size of bootstrap samples to be less than size of dataset. Added to - :class:`ensemble.ForestClassifier`, - :class:`ensemble.ForestRegressor`, :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`, - :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`. :pr:`14682` by + :class:`ensemble.ExtraTreesRegressor`. :pr:`14682` by :user:`Matt Hancock ` and :pr:`5963` by :user:`Pablo Duboue `. 
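A short sketch of the ``max_samples`` behaviour referenced above, which remains available on the random forests and extra-trees even though it is removed from ``RandomTreesEmbedding``; assumes scikit-learn 0.22 (illustrative only, not part of this patch)::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    # Each bootstrap sample draws only half of the training set.
    clf = RandomForestClassifier(n_estimators=50, bootstrap=True,
                                 max_samples=0.5, random_state=0)
    clf.fit(X, y)
    print(clf.score(X, y))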
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e06c69bf1dae2..eba59c232531b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2112,17 +2112,6 @@ class RandomTreesEmbedding(BaseForest): and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. - max_samples : int or float, default=None - If bootstrap is True, the number of samples to draw from X - to train each base estimator. - - - If None (default), then draw `X.shape[0]` samples. - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. - - .. versionadded:: 0.22 - Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -2154,8 +2143,7 @@ def __init__(self, n_jobs=None, random_state=None, verbose=0, - warm_start=False, - max_samples=None): + warm_start=False): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, @@ -2170,7 +2158,7 @@ def __init__(self, random_state=random_state, verbose=verbose, warm_start=warm_start, - max_samples=max_samples) + max_samples=None) self.max_depth = max_depth self.min_samples_split = min_samples_split From 9e5819aa413ce907134ee5704abba43ad8a61827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 26 Nov 2019 17:37:26 +0100 Subject: [PATCH 061/448] MNT Include all pxd files in the package (#15626) * include all pxd files in package * _libsvm _liblinear to pxi + add a test * simplify stuff + use pathlib * cln * more cln --- maint_tools/check_pxd_in_installation.py | 59 +++++++++++++++++++ setup.py | 1 + .../svm/{_liblinear.pxd => _liblinear.pxi} | 5 +- sklearn/svm/_liblinear.pyx | 2 + sklearn/svm/{_libsvm.pxd => _libsvm.pxi} | 2 - sklearn/svm/_libsvm.pyx | 2 + sklearn/tree/setup.py | 4 -- sklearn/utils/_weight_vector.pxd | 7 --- 8 files changed, 66 insertions(+), 16 deletions(-) create mode 100644 maint_tools/check_pxd_in_installation.py rename sklearn/svm/{_liblinear.pxd => _liblinear.pxi} (98%) rename sklearn/svm/{_libsvm.pxd => _libsvm.pxi} (99%) diff --git a/maint_tools/check_pxd_in_installation.py b/maint_tools/check_pxd_in_installation.py new file mode 100644 index 0000000000000..83c4b706294ad --- /dev/null +++ b/maint_tools/check_pxd_in_installation.py @@ -0,0 +1,59 @@ +"""Utility for testing presence and usability of .pxd files in the installation + +Usage: +------ +python check_pxd_in_installation.py path/to/install_dir/of/scikit-learn +""" + +import os +import sys +import pathlib +import tempfile +import textwrap +import subprocess + + +sklearn_dir = pathlib.Path(sys.argv[1]) +pxd_files = list(sklearn_dir.glob("**/*.pxd")) + +print("> Found pxd files:") +for pxd_file in pxd_files: + print(' -', pxd_file) + +print("\n> Trying to compile a cython extension cimporting all corresponding " + "modules\n") +with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = pathlib.Path(tmpdir) + # A cython test file which cimports all modules corresponding to found + # pxd files. + # e.g. sklearn/tree/_utils.pxd becomes `cimport sklearn.tree._utils` + with open(tmpdir / 'tst.pyx', 'w') as f: + for pxd_file in pxd_files: + to_import = str(pxd_file.relative_to(sklearn_dir)) + to_import = to_import.replace(os.path.sep, '.') + to_import = to_import.replace('.pxd', '') + f.write('cimport sklearn.' + to_import + '\n') + + # A basic setup file to build the test file. 
+ # We set the language to c++ and we use numpy.get_include() because + # some modules require it. + with open(tmpdir / 'setup_tst.py', 'w') as f: + f.write(textwrap.dedent( + """ + from distutils.core import setup + from distutils.extension import Extension + from Cython.Build import cythonize + import numpy + + extensions = [Extension("tst", + sources=["tst.pyx"], + language="c++", + include_dirs=[numpy.get_include()])] + + setup(ext_modules=cythonize(extensions)) + """)) + + subprocess.run(["python", "setup_tst.py", "build_ext", "-i"], + check=True, cwd=tmpdir) + + print("\n> Compilation succeeded !") diff --git a/setup.py b/setup.py index 3ed5c786a17c3..80b1b8ecc9391 100755 --- a/setup.py +++ b/setup.py @@ -259,6 +259,7 @@ def setup_package(): 'scipy>={}'.format(SCIPY_MIN_VERSION), 'joblib>={}'.format(JOBLIB_MIN_VERSION) ], + package_data={'': ['*.pxd']}, **extra_setuptools_args) if len(sys.argv) == 1 or ( diff --git a/sklearn/svm/_liblinear.pxd b/sklearn/svm/_liblinear.pxi similarity index 98% rename from sklearn/svm/_liblinear.pxd rename to sklearn/svm/_liblinear.pxi index 0f10e54a532fe..148bf694dab4f 100644 --- a/sklearn/svm/_liblinear.pxd +++ b/sklearn/svm/_liblinear.pxi @@ -1,6 +1,3 @@ -cimport numpy as np - - cdef extern from "_cython_blas_helpers.h": ctypedef double (*dot_func)(int, double*, int, double*, int) ctypedef void (*axpy_func)(int, double, double*, int, double*, int) @@ -12,6 +9,7 @@ cdef extern from "_cython_blas_helpers.h": scal_func scal nrm2_func nrm2 + cdef extern from "linear.h": cdef struct feature_node cdef struct problem @@ -28,6 +26,7 @@ cdef extern from "linear.h": void free_and_destroy_model (model **) void destroy_param (parameter *) + cdef extern from "liblinear_helper.c": void copy_w(void *, model *, int) parameter *set_parameter(int, double, double, int, char *, char *, int, int, double) diff --git a/sklearn/svm/_liblinear.pyx b/sklearn/svm/_liblinear.pyx index 2f042748d94a0..9dd15e0716c7f 100644 --- a/sklearn/svm/_liblinear.pyx +++ b/sklearn/svm/_liblinear.pyx @@ -9,6 +9,8 @@ cimport numpy as np from ..utils._cython_blas cimport _dot, _axpy, _scal, _nrm2 +include "_liblinear.pxi" + np.import_array() diff --git a/sklearn/svm/_libsvm.pxd b/sklearn/svm/_libsvm.pxi similarity index 99% rename from sklearn/svm/_libsvm.pxd rename to sklearn/svm/_libsvm.pxi index 2664a335a372f..a3c8f1c33dd1e 100644 --- a/sklearn/svm/_libsvm.pxd +++ b/sklearn/svm/_libsvm.pxi @@ -1,5 +1,3 @@ -cimport numpy as np - ################################################################################ # Includes diff --git a/sklearn/svm/_libsvm.pyx b/sklearn/svm/_libsvm.pyx index 8f8e9f7465823..079a791fef3b6 100644 --- a/sklearn/svm/_libsvm.pyx +++ b/sklearn/svm/_libsvm.pyx @@ -35,6 +35,8 @@ import numpy as np cimport numpy as np from libc.stdlib cimport free +include "_libsvm.pxi" + cdef extern from *: ctypedef struct svm_parameter: pass diff --git a/sklearn/tree/setup.py b/sklearn/tree/setup.py index 2b9819795b74b..079ae9d869075 100644 --- a/sklearn/tree/setup.py +++ b/sklearn/tree/setup.py @@ -31,10 +31,6 @@ def configuration(parent_package="", top_path=None): extra_compile_args=["-O3"]) config.add_subpackage("tests") - config.add_data_files("_criterion.pxd") - config.add_data_files("_splitter.pxd") - config.add_data_files("_tree.pxd") - config.add_data_files("_utils.pxd") return config diff --git a/sklearn/utils/_weight_vector.pxd b/sklearn/utils/_weight_vector.pxd index 1f38bb7e0981f..fc1b47a50ef1f 100644 --- a/sklearn/utils/_weight_vector.pxd +++ 
b/sklearn/utils/_weight_vector.pxd @@ -1,12 +1,5 @@ """Efficient (dense) parameter vector implementation for linear models. """ -cimport numpy as np - - -cdef extern from "math.h": - cdef extern double sqrt(double x) - - cdef class WeightVector(object): cdef double *w_data_ptr cdef double *aw_data_ptr From f4da713c44d9529da2d82d76e0c0c6ef7e0e4795 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Nov 2019 04:41:31 +0100 Subject: [PATCH 062/448] FIX add pos_label when computing AP in plot_precision_recall_curve (#15739) --- .../metrics/_plot/precision_recall_curve.py | 1 + .../_plot/tests/test_plot_precision_recall.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index d2b84059c3c0e..d515b9aa86b1d 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -161,6 +161,7 @@ def plot_precision_recall_curve(estimator, X, y, pos_label=pos_label, sample_weight=sample_weight) average_precision = average_precision_score(y, y_pred, + pos_label=pos_label, sample_weight=sample_weight) viz = PrecisionRecallDisplay(precision, recall, average_precision, estimator.__class__.__name__) diff --git a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py index 1012e13027f5a..60e06ed34ad01 100644 --- a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py +++ b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py @@ -7,6 +7,7 @@ from sklearn.metrics import average_precision_score from sklearn.metrics import precision_recall_curve from sklearn.datasets import make_classification +from sklearn.datasets import load_breast_cancer from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.linear_model import LogisticRegression from sklearn.exceptions import NotFittedError @@ -132,3 +133,23 @@ def test_precision_recall_curve_pipeline(pyplot, clf): clf.fit(X, y) disp = plot_precision_recall_curve(clf, X, y) assert disp.estimator_name == clf.__class__.__name__ + + +def test_precision_recall_curve_string_labels(pyplot): + # regression test #15738 + cancer = load_breast_cancer() + X = cancer.data + y = cancer.target_names[cancer.target] + + lr = make_pipeline(StandardScaler(), LogisticRegression()) + lr.fit(X, y) + for klass in cancer.target_names: + assert klass in lr.classes_ + disp = plot_precision_recall_curve(lr, X, y) + + y_pred = lr.predict_proba(X)[:, 1] + avg_prec = average_precision_score(y, y_pred, + pos_label=lr.classes_[1]) + + assert disp.average_precision == pytest.approx(avg_prec) + assert disp.estimator_name == lr.__class__.__name__ From a117db5755db3feb64c0508829b45de55e4a77e5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 1 Dec 2019 14:08:30 -0500 Subject: [PATCH 063/448] MNT Activates github actions (#15746) --- .github/workflows/activate.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/activate.yml diff --git a/.github/workflows/activate.yml b/.github/workflows/activate.yml new file mode 100644 index 0000000000000..f6aede6289ebf --- /dev/null +++ b/.github/workflows/activate.yml @@ -0,0 +1,21 @@ +# Simple first task to activate GitHub actions. +# This won't run until is merged, but future actions will +# run on PRs, so we can see we don't break things in more +# complex actions added later, like real builds. 
+# +# TODO: Remove this once another action exists +name: Activate + +on: + push: + branches: master + pull_request: + branches: master + +jobs: + activate: + name: Activate actions + runs-on: ubuntu-latest + steps: + - name: Activate + run: echo "GitHub actions ok" From a91304ecd76ef37aab19c9108864cdcfa8b41ec5 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 2 Dec 2019 12:03:25 +0100 Subject: [PATCH 064/448] DOC Pre 0.22 release (#15735) * add python 3.8 to setup.py * add authors to whats_new * update release date * move whats new entry to 0.23 * add news section to index.html * pep8 Co-Authored-By: Thomas J Fan --- doc/templates/index.html | 2 ++ doc/whats_new/v0.22.rst | 68 +++++++++++++++++++++++++++++++++++++--- doc/whats_new/v0.23.rst | 9 +++--- setup.py | 1 + 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/doc/templates/index.html b/doc/templates/index.html index b867d8c517cf5..aa7139bd9b505 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -156,6 +156,8 @@

    News
    • On-going development: What's new (Changelog)
+   • December 2019. scikit-learn 0.22 is available for download (Changelog).
    • Scikit-learn from 0.21 requires Python 3.5 or greater.
  • July 2019. scikit-learn 0.21.3 (Changelog) and 0.20.4 (Changelog) are available for download. diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e14601c1b52a7..f3f69a8299b0a 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -7,7 +7,7 @@ Version 0.22.0 ============== -**November 29 2019** +**December 3 2019** For a short description of the main highlights of the release, please refer to @@ -168,10 +168,6 @@ Changelog `affinity='cosine'` and `X` has samples that are all-zeros. :pr:`7943` by :user:`mthorrell`. -- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more - more memory efficient implementation of single linkage clustering. - :pr:`11514` by :user:`Leland McInnes `. - :mod:`sklearn.compose` ...................... @@ -955,3 +951,65 @@ These changes mostly affect library developers. - |Fix| The estimators tags resolution now follows the regular MRO. They used to be overridable only once. :pr:`14884` by `Andreas Müller`_. + + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: + +Aaron Alphonsus, Abbie Popa, Abdur-Rahmaan Janhangeer, abenbihi, Abhinav Sagar, +Abhishek Jana, Abraham K. Lagat, Adam J. Stewart, Aditya Vyas, Adrin Jalali, +Agamemnon Krasoulis, Alec Peters, Alessandro Surace, Alexandre de Siqueira, +Alexandre Gramfort, alexgoryainov, Alex Henrie, Alex Itkes, alexshacked, Allen +Akinkunle, Anaël Beaugnon, Anders Kaseorg, Andrea Maldonado, Andrea Navarrete, +Andreas Mueller, Andreas Schuderer, Andrew Nystrom, Angela Ambroz, Anisha +Keshavan, Ankit Jha, Antonio Gutierrez, Anuja Kelkar, Archana Alva, +arnaudstiegler, arpanchowdhry, ashimb9, Ayomide Bamidele, Baran Buluttekin, +barrycg, Bharat Raghunathan, Bill Mill, Biswadip Mandal, blackd0t, Brian G. +Barkley, Brian Wignall, Bryan Yang, c56pony, camilaagw, cartman_nabana, +catajara, Cat Chenal, Cathy, cgsavard, Charles Vesteghem, Chiara Marmo, Chris +Gregory, Christian Lorentzen, Christos Aridas, Dakota Grusak, Daniel Grady, +Daniel Perry, Danna Naser, DatenBergwerk, David Dormagen, deeplook, Dillon +Niederhut, Dong-hee Na, Dougal J. Sutherland, DrGFreeman, Dylan Cashman, +edvardlindelof, Eric Larson, Eric Ndirangu, Eunseop Jeong, Fanny, +federicopisanu, Felix Divo, flaviomorelli, FranciDona, Franco M. 
Luque, Frank +Hoang, Frederic Haase, g0g0gadget, Gabriel Altay, Gabriel do Vale Rios, Gael +Varoquaux, ganevgv, gdex1, getgaurav2, Gideon Sonoiya, Gordon Chen, gpapadok, +Greg Mogavero, Grzegorz Szpak, Guillaume Lemaitre, Guillem García Subies, +H4dr1en, hadshirt, Hailey Nguyen, Hanmin Qin, Hannah Bruce Macdonald, Harsh +Mahajan, Harsh Soni, Honglu Zhang, Hossein Pourbozorg, Ian Sanders, Ingrid +Spielman, J-A16, jaehong park, Jaime Ferrando Huertas, James Hill, James Myatt, +Jay, jeremiedbb, Jérémie du Boisberranger, jeromedockes, Jesper Dramsch, Joan +Massich, Joanna Zhang, Joel Nothman, Johann Faouzi, Jonathan Rahn, Jon Cusick, +Jose Ortiz, Kanika Sabharwal, Katarina Slama, kellycarmody, Kennedy Kang'ethe, +Kensuke Arai, Kesshi Jordan, Kevad, Kevin Loftis, Kevin Winata, Kevin Yu-Sheng +Li, Kirill Dolmatov, Kirthi Shankar Sivamani, krishna katyal, Lakshmi Krishnan, +Lakshya KD, LalliAcqua, lbfin, Leland McInnes, Léonard Binet, Loic Esteve, +loopyme, lostcoaster, Louis Huynh, lrjball, Luca Ionescu, Lutz Roeder, +MaggieChege, Maithreyi Venkatesh, Maltimore, Maocx, Marc Torrellas, Marie +Douriez, Markus, Markus Frey, Martina G. Vilas, Martin Oywa, Martin Thoma, +Masashi SHIBATA, Maxwell Aladago, mbillingr, m-clare, Meghann Agarwal, m.fab, +Micah Smith, miguelbarao, Miguel Cabrera, Mina Naghshhnejad, Ming Li, motmoti, +mschaffenroth, mthorrell, Natasha Borders, nezar-a, Nicolas Hug, Nidhin +Pattaniyil, Nikita Titov, Nishan Singh Mann, Nitya Mandyam, norvan, +notmatthancock, novaya, nxorable, Oleg Stikhin, Oleksandr Pavlyk, Olivier +Grisel, Omar Saleem, Owen Flanagan, panpiort8, Paolo, Paolo Toccaceli, Paresh +Mathur, Paula, Peng Yu, Peter Marko, pierretallotte, poorna-kumar, pspachtholz, +qdeffense, Rajat Garg, Raphaël Bournhonesque, Ray, Ray Bell, Rebekah Kim, Reza +Gharibi, Richard Payne, Richard W, rlms, Robert Juergens, Rok Mihevc, Roman +Feldbauer, Roman Yurchak, R Sanjabi, RuchitaGarde, Ruth Waithera, Sackey, Sam +Dixon, Samesh Lakhotia, Samuel Taylor, Sarra Habchi, Scott Gigante, Scott +Sievert, Scott White, Sebastian Pölsterl, Sergey Feldman, SeWook Oh, she-dares, +Shreya V, Shubham Mehta, Shuzhe Xiao, SimonCW, smarie, smujjiga, Sönke +Behrends, Soumirai, Sourav Singh, stefan-matcovici, steinfurt, Stéphane +Couvreur, Stephan Tulkens, Stephen Cowley, Stephen Tierney, SylvainLan, +th0rwas, theoptips, theotheo, Thierno Ibrahima DIOP, Thomas Edwards, Thomas J +Fan, Thomas Moreau, Thomas Schmitt, Tilen Kusterle, Tim Bicker, Timsaur, Tim +Staley, Tirth Patel, Tola A, Tom Augspurger, Tom Dupré la Tour, topisan, Trevor +Stephens, ttang131, Urvang Patel, Vathsala Achar, veerlosar, Venkatachalam N, +Victor Luzgin, Vincent Jeanselme, Vincent Lostanlen, Vladimir Korolev, +vnherdeiro, Wenbo Zhao, Wendy Hu, willdarnell, William de Vazelhes, +wolframalpha, xavier dupré, xcjason, x-martian, xsat, xun-tang, Yinglr, +yokasre, Yu-Hang "Maxin" Tang, Yulia Zamriy, Zhao Feng diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 5cec206a0fe95..c57855dd774b2 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -44,8 +44,9 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. -:mod:`sklearn.module` -..................... +:mod:`sklearn.cluster` +...................... -- |Fix| example fix in model XXX. :pr:`xxxx` or :issue:`xxxx` by - :user:`name ` +- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more + more memory efficient implementation of single linkage clustering. 
+ :pr:`11514` by :user:`Leland McInnes `. diff --git a/setup.py b/setup.py index 80b1b8ecc9391..a8d0a81ea02c8 100755 --- a/setup.py +++ b/setup.py @@ -247,6 +247,7 @@ def setup_package(): 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ('Programming Language :: Python :: ' 'Implementation :: CPython'), ('Programming Language :: Python :: ' From e28e0dbea96ffaa8455436e0ea9e16df04f0f2f8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 2 Dec 2019 16:41:04 +0100 Subject: [PATCH 065/448] DOC Mention intel conda channel in installation doc (#14247) --- doc/install.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/install.rst b/doc/install.rst index d45e9f3367ff1..1e6ed734e1085 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -230,6 +230,26 @@ library for Windows, Mac OSX and Linux. Anaconda offers scikit-learn as part of its free distribution. +Intel conda channel +------------------- + +Intel maintains a dedicated conda channel that ships scikit-learn:: + + $ conda install -c intel scikit-learn + +This version of scikit-learn comes with alternative solvers for some common +estimators. Those solvers come from the DAAL C++ library and are optimized for +multi-core Intel CPUs. + +Note that those solvers are not enabled by default, please refer to the +`daal4py `_ documentation +for more details. + +Compatibility with the standard scikit-learn solvers is checked by running the +full scikit-learn test suite via automated continuous integration as reported +on https://github.com/IntelPython/daal4py. + + WinPython for Windows ----------------------- From d06ef03bd3a730761235b33e18ed376a9b05930d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 2 Dec 2019 19:16:55 +0100 Subject: [PATCH 066/448] Configure twitter-action for @sklearn_commits (#15758) --- .github/workflows/twitter.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/twitter.yml diff --git a/.github/workflows/twitter.yml b/.github/workflows/twitter.yml new file mode 100644 index 0000000000000..d0b41e1c684a0 --- /dev/null +++ b/.github/workflows/twitter.yml @@ -0,0 +1,25 @@ +# Tweet the URL of a commit on @sklearn_commits whenever a push event +# happens on the master branch +name: Twitter Push Notification + + +on: + push: + branches: + - master + + +jobs: + tweet: + name: Twitter Notification + runs-on: ubuntu-latest + steps: + - name: Tweet URL of last commit as @sklearn_commits + uses: xorilog/twitter-action@0.1 + with: + args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" + env: + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_SECRET: ${{ secrets.TWITTER_ACCESS_SECRET }} From cc88ec97b49387c9653934188667b45fc2d27a7d Mon Sep 17 00:00:00 2001 From: cgsavard Date: Mon, 2 Dec 2019 12:54:37 -0700 Subject: [PATCH 067/448] DOC fixed default values in dbscan (#15753) --- sklearn/cluster/_dbscan.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 3b3ccb1fbe6dc..a464e3951673a 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -156,18 +156,18 @@ class DBSCAN(ClusterMixin, BaseEstimator): Parameters ---------- - eps : 
float, optional + eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples : int, optional + min_samples : int, default=5 The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. - metric : string, or callable + metric : string, or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by :func:`sklearn.metrics.pairwise_distances` for @@ -179,27 +179,27 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. - metric_params : dict, optional + metric_params : dict, default=None Additional keyword arguments for the metric function. .. versionadded:: 0.19 - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : float, optional + p : float, default=None The power of the Minkowski metric to be used to calculate distance between points. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` From 479548ac9b675dba10110e0ae7df2a9e1f73ade6 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 3 Dec 2019 18:32:57 +0100 Subject: [PATCH 068/448] MAINT remove placeholder github actions workflow (#15767) --- .github/workflows/activate.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/activate.yml diff --git a/.github/workflows/activate.yml b/.github/workflows/activate.yml deleted file mode 100644 index f6aede6289ebf..0000000000000 --- a/.github/workflows/activate.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Simple first task to activate GitHub actions. -# This won't run until is merged, but future actions will -# run on PRs, so we can see we don't break things in more -# complex actions added later, like real builds. 
-# -# TODO: Remove this once another action exists -name: Activate - -on: - push: - branches: master - pull_request: - branches: master - -jobs: - activate: - name: Activate actions - runs-on: ubuntu-latest - steps: - - name: Activate - run: echo "GitHub actions ok" From c6f09ee5e37d417845c82d6e0d489fc76c92d125 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Wed, 4 Dec 2019 00:08:36 -0700 Subject: [PATCH 069/448] DOC fix incorrect branch reference in contributing doc (#15779) --- doc/developers/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 863ecfb7741b3..8d12187ade00b 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -251,7 +251,7 @@ modifying code and submitting a PR: to record your changes in Git, then push the changes to your GitHub account with:: - $ git push -u origin my-feature + $ git push -u origin my_feature 10. Follow `these `_ From f344d3f249cf0f2846a6b9a2bc4af58bcebd8169 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 4 Dec 2019 18:09:57 +1100 Subject: [PATCH 070/448] DOC relabel Feature -> Efficiency in change log (#15770) --- doc/whats_new/v0.22.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f3f69a8299b0a..7b0c031f9196b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -416,12 +416,6 @@ Changelog :mod:`sklearn.gaussian_process` ............................... -- |Feature| :func:`gaussian_process.GaussianProcessClassifier.log_marginal_likelihood` - and :func:`gaussian_process.GaussianProcessRegressor.log_marginal_likelihood` now - accept a ``clone_kernel=True`` keyword argument. When set to ``False``, - the kernel attribute is modified, but may result in a performance improvement. - :pr:`14378` by :user:`Masashi Shibata `. - - |Feature| Gaussian process models on structured data: :class:`gaussian_process.GaussianProcessRegressor` and :class:`gaussian_process.GaussianProcessClassifier` can now accept a list of generic objects (e.g. strings, trees, graphs, etc.) as the ``X`` argument @@ -431,6 +425,12 @@ Changelog to notify the GPR/GPC model that it handles non-vectorial samples. :pr:`15557` by :user:`Yu-Hang Tang `. +- |Efficiency| :func:`gaussian_process.GaussianProcessClassifier.log_marginal_likelihood` + and :func:`gaussian_process.GaussianProcessRegressor.log_marginal_likelihood` now + accept a ``clone_kernel=True`` keyword argument. When set to ``False``, + the kernel attribute is modified, but may result in a performance improvement. + :pr:`14378` by :user:`Masashi Shibata `. + - |API| From version 0.24 :meth:`gaussian_process.kernels.Kernel.get_params` will raise an ``AttributeError`` rather than return ``None`` for parameters that are in the estimator's constructor but not stored as attributes on the instance. 
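For context on the ``clone_kernel`` changelog entry just above (not part of any patch in this series): a minimal sketch of the keyword, using an illustrative RBF kernel and toy data::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 5, (30, 1))
    y = np.sin(X).ravel()

    gpr = GaussianProcessRegressor(kernel=RBF(length_scale=1.0),
                                   random_state=0).fit(X, y)

    # clone_kernel=True (the default) leaves the fitted kernel untouched;
    # clone_kernel=False skips the clone, which can be faster but may modify
    # gpr.kernel_ in place, as the changelog entry describes.
    lml = gpr.log_marginal_likelihood(gpr.kernel_.theta, clone_kernel=False)
    print(lml)

The returned value is the same either way; the flag only controls whether ``gpr.kernel_`` may be modified as a side effect.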
From 0bbe923fe637944bd15f831403b18884d6655d54 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Wed, 4 Dec 2019 03:24:00 -0700 Subject: [PATCH 071/448] DOC fixed Birch default value (#15780) --- sklearn/cluster/_birch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 349ec19c6ff9c..0a16586caae9a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -334,20 +334,20 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): Parameters ---------- - threshold : float, default 0.5 + threshold : float, default=0.5 The radius of the subcluster obtained by merging a new sample and the closest subcluster should be lesser than the threshold. Otherwise a new subcluster is started. Setting this value to be very low promotes splitting and vice-versa. - branching_factor : int, default 50 + branching_factor : int, default=50 Maximum number of CF subclusters in each node. If a new samples enters such that the number of subclusters exceed the branching_factor then that node is split into two nodes with the subclusters redistributed in each. The parent subcluster of that node is removed and two new subclusters are added as parents of the 2 split nodes. - n_clusters : int, instance of sklearn.cluster model, default 3 + n_clusters : int, instance of sklearn.cluster model, default=3 Number of clusters after the final clustering step, which treats the subclusters from the leaves as new samples. @@ -361,10 +361,10 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): - `int` : the model fit is :class:`AgglomerativeClustering` with `n_clusters` set to be equal to the int. - compute_labels : bool, default True + compute_labels : bool, default=True Whether or not to compute labels for each fit. - copy : bool, default True + copy : bool, default=True Whether or not to make a copy of the given data. If set to False, the initial data will be overwritten. From 69884d5fd207e0229318334a52f88dc0470af069 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 09:53:15 -0500 Subject: [PATCH 072/448] ENH: Improves speed of one hot encoding (#15762) --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/preprocessing/_encoders.py | 12 +++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index c57855dd774b2..a1cf4b4dd7d00 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -50,3 +50,9 @@ Changelog - |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more more memory efficient implementation of single linkage clustering. :pr:`11514` by :user:`Leland McInnes `. + +:mod:`sklearn.preprocessing` +............................ + +- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at + transforming. :pr:`15762` by `Thomas Fan`_. 
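Before the ``_encoders.py`` hunk that follows, a small usage sketch (not part of the patch, toy data only) of the transform path being optimized; it returns a sparse CSR matrix assembled from ``(data, indices, indptr)`` arrays::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([["a", "x"], ["b", "y"], ["a", "y"]], dtype=object)
    enc = OneHotEncoder().fit(X)

    # transform() is the method the diff below speeds up; with the default
    # sparse=True it returns a scipy.sparse CSR matrix.
    print(enc.transform(X).toarray())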
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b9c6d3adf8393..36512e359c7ed 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -405,12 +405,14 @@ def transform(self, X): n_values = [len(cats) for cats in self.categories_] mask = X_mask.ravel() - n_values = np.array([0] + n_values) - feature_indices = np.cumsum(n_values) + feature_indices = np.cumsum([0] + n_values) indices = (X_int + feature_indices[:-1]).ravel()[mask] - indptr = X_mask.sum(axis=1).cumsum() - indptr = np.insert(indptr, 0, 0) - data = np.ones(n_samples * n_features)[mask] + + indptr = np.empty(n_samples + 1, dtype=np.int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:]) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) out = sparse.csr_matrix((data, indices, indptr), shape=(n_samples, feature_indices[-1]), From 3a3178ec31e84a97a60a9bf109ed39527e962392 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 4 Dec 2019 12:21:56 -0500 Subject: [PATCH 073/448] MNT remove deprecated loss functions in gb.py (#15790) --- sklearn/ensemble/_gb.py | 1027 --------------------------------------- 1 file changed, 1027 deletions(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 9db0a0322045d..c3971e019a088 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -61,1033 +61,6 @@ from ..exceptions import NotFittedError -# FIXME: 0.23 -# All the losses and corresponding init estimators have been moved to the -# _losses module in 0.21. We deprecate them and keep them here for now in case -# someone has imported them. None of these losses can be used as a parameter -# to a GBDT estimator anyway (loss param only accepts strings). - -@deprecated("QuantileEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class QuantileEstimator: - """An estimator predicting the alpha-quantile of the training targets. - - Parameters - ---------- - alpha : float - The quantile - """ - def __init__(self, alpha=0.9): - if not 0 < alpha < 1.0: - raise ValueError("`alpha` must be in (0, 1.0) but was %r" % alpha) - self.alpha = alpha - - def fit(self, X, y, sample_weight=None): - """Fit the estimator. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data - - y : array, shape (n_samples, n_targets) - Target values. Will be cast to X's dtype if necessary - - sample_weight : numpy array of shape (n_samples,) - Individual weights for each sample - """ - if sample_weight is None: - self.quantile = np.percentile(y, self.alpha * 100.0) - else: - self.quantile = _weighted_percentile(y, sample_weight, - self.alpha * 100.0) - - def predict(self, X): - """Predict labels - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Samples. - - Returns - ------- - y : array, shape (n_samples,) - Returns predicted values. - """ - check_is_fitted(self) - - y = np.empty((X.shape[0], 1), dtype=np.float64) - y.fill(self.quantile) - return y - - -@deprecated("MeanEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class MeanEstimator: - """An estimator predicting the mean of the training targets.""" - def fit(self, X, y, sample_weight=None): - """Fit the estimator. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data - - y : array, shape (n_samples, n_targets) - Target values. 
Will be cast to X's dtype if necessary - - sample_weight : numpy array of shape (n_samples,) - Individual weights for each sample - """ - if sample_weight is None: - self.mean = np.mean(y) - else: - self.mean = np.average(y, weights=sample_weight) - - def predict(self, X): - """Predict labels - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Samples. - - Returns - ------- - y : array, shape (n_samples,) - Returns predicted values. - """ - check_is_fitted(self) - - y = np.empty((X.shape[0], 1), dtype=np.float64) - y.fill(self.mean) - return y - - -@deprecated("LogOddsEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class LogOddsEstimator: - """An estimator predicting the log odds ratio.""" - scale = 1.0 - - def fit(self, X, y, sample_weight=None): - """Fit the estimator. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data - - y : array, shape (n_samples, n_targets) - Target values. Will be cast to X's dtype if necessary - - sample_weight : numpy array of shape (n_samples,) - Individual weights for each sample - """ - # pre-cond: pos, neg are encoded as 1, 0 - if sample_weight is None: - pos = np.sum(y) - neg = y.shape[0] - pos - else: - pos = np.sum(sample_weight * y) - neg = np.sum(sample_weight * (1 - y)) - - if neg == 0 or pos == 0: - raise ValueError('y contains non binary labels.') - self.prior = self.scale * np.log(pos / neg) - - def predict(self, X): - """Predict labels - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Samples. - - Returns - ------- - y : array, shape (n_samples,) - Returns predicted values. - """ - check_is_fitted(self) - - y = np.empty((X.shape[0], 1), dtype=np.float64) - y.fill(self.prior) - return y - - -@deprecated("ScaledLogOddsEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class ScaledLogOddsEstimator(LogOddsEstimator): - """Log odds ratio scaled by 0.5 -- for exponential loss. """ - scale = 0.5 - - -@deprecated("PriorProbablityEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class PriorProbabilityEstimator: - """An estimator predicting the probability of each - class in the training data. - """ - def fit(self, X, y, sample_weight=None): - """Fit the estimator. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data - - y : array, shape (n_samples, n_targets) - Target values. Will be cast to X's dtype if necessary - - sample_weight : array, shape (n_samples,) - Individual weights for each sample - """ - if sample_weight is None: - sample_weight = np.ones_like(y, dtype=np.float64) - class_counts = np.bincount(y, weights=sample_weight) - self.priors = class_counts / class_counts.sum() - - def predict(self, X): - """Predict labels - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Samples. - - Returns - ------- - y : array, shape (n_samples,) - Returns predicted values. - """ - check_is_fitted(self) - - y = np.empty((X.shape[0], self.priors.shape[0]), dtype=np.float64) - y[:] = self.priors - return y - - -@deprecated("Using ZeroEstimator is deprecated in version " - "0.21 and will be removed in version 0.23.") -class ZeroEstimator: - """An estimator that simply predicts zero. - - .. deprecated:: 0.21 - Using ``ZeroEstimator`` or ``init='zero'`` is deprecated in version - 0.21 and will be removed in version 0.23. 
- - """ - - def fit(self, X, y, sample_weight=None): - """Fit the estimator. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data - - y : numpy, shape (n_samples, n_targets) - Target values. Will be cast to X's dtype if necessary - - sample_weight : array, shape (n_samples,) - Individual weights for each sample - """ - if np.issubdtype(y.dtype, np.signedinteger): - # classification - self.n_classes = np.unique(y).shape[0] - if self.n_classes == 2: - self.n_classes = 1 - else: - # regression - self.n_classes = 1 - - def predict(self, X): - """Predict labels - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Samples. - - Returns - ------- - y : array, shape (n_samples,) - Returns predicted values. - """ - check_is_fitted(self) - - y = np.empty((X.shape[0], self.n_classes), dtype=np.float64) - y.fill(0.0) - return y - - def predict_proba(self, X): - return self.predict(X) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class LossFunction(metaclass=ABCMeta): - """Abstract base class for various loss functions. - - Parameters - ---------- - n_classes : int - Number of classes - - Attributes - ---------- - K : int - The number of regression trees to be induced; - 1 for regression and binary classification; - ``n_classes`` for multi-class classification. - """ - - is_multi_class = False - - def __init__(self, n_classes): - self.K = n_classes - - def init_estimator(self): - """Default ``init`` estimator for loss function. """ - raise NotImplementedError() - - @abstractmethod - def __call__(self, y, pred, sample_weight=None): - """Compute the loss. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - - @abstractmethod - def negative_gradient(self, y, y_pred, **kargs): - """Compute the negative gradient. - - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - y_pred : array, shape (n_samples,) - The predictions. - """ - - def update_terminal_regions(self, tree, X, y, residual, y_pred, - sample_weight, sample_mask, - learning_rate=0.1, k=0): - """Update the terminal regions (=leaves) of the given tree and - updates the current predictions of the model. Traverses tree - and invokes template method `_update_terminal_region`. - - Parameters - ---------- - tree : tree.Tree - The tree object. - X : array, shape (n, m) - The data array. - y : array, shape (n,) - The target labels. - residual : array, shape (n,) - The residuals (usually the negative gradient). - y_pred : array, shape (n,) - The predictions. - sample_weight : array, shape (n,) - The weight of each sample. - sample_mask : array, shape (n,) - The sample mask to be used. - learning_rate : float, default=0.1 - learning rate shrinks the contribution of each tree by - ``learning_rate``. - k : int, default 0 - The index of the estimator being updated. - - """ - # compute leaf for each sample in ``X``. - terminal_regions = tree.apply(X) - - # mask all which are not in sample mask. 
- masked_terminal_regions = terminal_regions.copy() - masked_terminal_regions[~sample_mask] = -1 - - # update each leaf (= perform line search) - for leaf in np.where(tree.children_left == TREE_LEAF)[0]: - self._update_terminal_region(tree, masked_terminal_regions, - leaf, X, y, residual, - y_pred[:, k], sample_weight) - - # update predictions (both in-bag and out-of-bag) - y_pred[:, k] += (learning_rate - * tree.value[:, 0, 0].take(terminal_regions, axis=0)) - - @abstractmethod - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - """Template method for updating terminal regions (=leaves). """ - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class RegressionLossFunction(LossFunction, metaclass=ABCMeta): - """Base class for regression loss functions. - - Parameters - ---------- - n_classes : int - Number of classes - """ - def __init__(self, n_classes): - if n_classes != 1: - raise ValueError("``n_classes`` must be 1 for regression but " - "was %r" % n_classes) - super().__init__(n_classes) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class LeastSquaresError(RegressionLossFunction): - """Loss function for least squares (LS) estimation. - Terminal regions need not to be updated for least squares. - - Parameters - ---------- - n_classes : int - Number of classes - """ - - def init_estimator(self): - return MeanEstimator() - - def __call__(self, y, pred, sample_weight=None): - """Compute the least squares loss. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - if sample_weight is None: - return np.mean((y - pred.ravel()) ** 2.0) - else: - return (1.0 / sample_weight.sum() * - np.sum(sample_weight * ((y - pred.ravel()) ** 2.0))) - - def negative_gradient(self, y, pred, **kargs): - """Compute the negative gradient. - - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - pred : array, shape (n_samples,) - The predictions. - """ - return y - pred.ravel() - - def update_terminal_regions(self, tree, X, y, residual, y_pred, - sample_weight, sample_mask, - learning_rate=0.1, k=0): - """Least squares does not need to update terminal regions. - - But it has to update the predictions. - - Parameters - ---------- - tree : tree.Tree - The tree object. - X : array, shape (n, m) - The data array. - y : array, shape (n,) - The target labels. - residual : array, shape (n,) - The residuals (usually the negative gradient). - y_pred : array, shape (n,) - The predictions. - sample_weight : array, shape (n,) - The weight of each sample. - sample_mask : array, shape (n,) - The sample mask to be used. - learning_rate : float, default=0.1 - learning rate shrinks the contribution of each tree by - ``learning_rate``. - k : int, default 0 - The index of the estimator being updated. 
- """ - # update predictions - y_pred[:, k] += learning_rate * tree.predict(X).ravel() - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - pass - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class LeastAbsoluteError(RegressionLossFunction): - """Loss function for least absolute deviation (LAD) regression. - - Parameters - ---------- - n_classes : int - Number of classes - """ - def init_estimator(self): - return QuantileEstimator(alpha=0.5) - - def __call__(self, y, pred, sample_weight=None): - """Compute the least absolute error. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - if sample_weight is None: - return np.abs(y - pred.ravel()).mean() - else: - return (1.0 / sample_weight.sum() * - np.sum(sample_weight * np.abs(y - pred.ravel()))) - - def negative_gradient(self, y, pred, **kargs): - """Compute the negative gradient. - - 1.0 if y - pred > 0.0 else -1.0 - - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - pred : array, shape (n_samples,) - The predictions. - """ - pred = pred.ravel() - return 2.0 * (y - pred > 0.0) - 1.0 - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - """LAD updates terminal regions to median estimates. """ - terminal_region = np.where(terminal_regions == leaf)[0] - sample_weight = sample_weight.take(terminal_region, axis=0) - diff = y.take(terminal_region, axis=0) - pred.take(terminal_region, axis=0) - tree.value[leaf, 0, 0] = _weighted_percentile(diff, sample_weight, percentile=50) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class HuberLossFunction(RegressionLossFunction): - """Huber loss function for robust regression. - - M-Regression proposed in Friedman 2001. - - References - ---------- - J. Friedman, Greedy Function Approximation: A Gradient Boosting - Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. - - Parameters - ---------- - n_classes : int - Number of classes - - alpha : float - Percentile at which to extract score - """ - - def __init__(self, n_classes, alpha=0.9): - super().__init__(n_classes) - self.alpha = alpha - self.gamma = None - - def init_estimator(self): - return QuantileEstimator(alpha=0.5) - - def __call__(self, y, pred, sample_weight=None): - """Compute the Huber loss. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. 
- """ - pred = pred.ravel() - diff = y - pred - gamma = self.gamma - if gamma is None: - if sample_weight is None: - gamma = np.percentile(np.abs(diff), self.alpha * 100) - else: - gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100) - - gamma_mask = np.abs(diff) <= gamma - if sample_weight is None: - sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0) - lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0)) - loss = (sq_loss + lin_loss) / y.shape[0] - else: - sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0) - lin_loss = np.sum(gamma * sample_weight[~gamma_mask] * - (np.abs(diff[~gamma_mask]) - gamma / 2.0)) - loss = (sq_loss + lin_loss) / sample_weight.sum() - return loss - - def negative_gradient(self, y, pred, sample_weight=None, **kargs): - """Compute the negative gradient. - - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - pred : array, shape (n_samples,) - The predictions. - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - pred = pred.ravel() - diff = y - pred - if sample_weight is None: - gamma = np.percentile(np.abs(diff), self.alpha * 100) - else: - gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100) - gamma_mask = np.abs(diff) <= gamma - residual = np.zeros((y.shape[0],), dtype=np.float64) - residual[gamma_mask] = diff[gamma_mask] - residual[~gamma_mask] = gamma * np.sign(diff[~gamma_mask]) - self.gamma = gamma - return residual - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - terminal_region = np.where(terminal_regions == leaf)[0] - sample_weight = sample_weight.take(terminal_region, axis=0) - gamma = self.gamma - diff = (y.take(terminal_region, axis=0) - - pred.take(terminal_region, axis=0)) - median = _weighted_percentile(diff, sample_weight, percentile=50) - diff_minus_median = diff - median - tree.value[leaf, 0] = median + np.mean( - np.sign(diff_minus_median) * - np.minimum(np.abs(diff_minus_median), gamma)) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class QuantileLossFunction(RegressionLossFunction): - """Loss function for quantile regression. - - Quantile regression allows to estimate the percentiles - of the conditional distribution of the target. - - Parameters - ---------- - n_classes : int - Number of classes. - - alpha : float, optional (default = 0.9) - The percentile - """ - def __init__(self, n_classes, alpha=0.9): - super().__init__(n_classes) - self.alpha = alpha - self.percentile = alpha * 100.0 - - def init_estimator(self): - return QuantileEstimator(self.alpha) - - def __call__(self, y, pred, sample_weight=None): - """Compute the Quantile loss. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - pred = pred.ravel() - diff = y - pred - alpha = self.alpha - - mask = y > pred - if sample_weight is None: - loss = (alpha * diff[mask].sum() - - (1.0 - alpha) * diff[~mask].sum()) / y.shape[0] - else: - loss = ((alpha * np.sum(sample_weight[mask] * diff[mask]) - - (1.0 - alpha) * np.sum(sample_weight[~mask] * diff[~mask])) / - sample_weight.sum()) - return loss - - def negative_gradient(self, y, pred, **kargs): - """Compute the negative gradient. 
- - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - pred : array, shape (n_samples,) - The predictions. - """ - alpha = self.alpha - pred = pred.ravel() - mask = y > pred - return (alpha * mask) - ((1.0 - alpha) * ~mask) - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - terminal_region = np.where(terminal_regions == leaf)[0] - diff = (y.take(terminal_region, axis=0) - - pred.take(terminal_region, axis=0)) - sample_weight = sample_weight.take(terminal_region, axis=0) - - val = _weighted_percentile(diff, sample_weight, self.percentile) - tree.value[leaf, 0] = val - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class ClassificationLossFunction(LossFunction, metaclass=ABCMeta): - """Base class for classification loss functions. """ - - def _score_to_proba(self, score): - """Template method to convert scores to probabilities. - - the does not support probabilities raises AttributeError. - """ - raise TypeError('%s does not support predict_proba' % type(self).__name__) - - @abstractmethod - def _score_to_decision(self, score): - """Template method to convert scores to decisions. - - Returns int arrays. - """ - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class BinomialDeviance(ClassificationLossFunction): - """Binomial deviance loss function for binary classification. - - Binary classification is a special case; here, we only need to - fit one tree instead of ``n_classes`` trees. - - Parameters - ---------- - n_classes : int - Number of classes. - """ - def __init__(self, n_classes): - if n_classes != 2: - raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)" - .format(self.__class__.__name__, n_classes)) - # we only need to fit one tree for binary clf. - super().__init__(1) - - def init_estimator(self): - return LogOddsEstimator() - - def __call__(self, y, pred, sample_weight=None): - """Compute the deviance (= 2 * negative log-likelihood). - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - # logaddexp(0, v) == log(1.0 + exp(v)) - pred = pred.ravel() - if sample_weight is None: - return -2.0 * np.mean((y * pred) - np.logaddexp(0.0, pred)) - else: - return (-2.0 / sample_weight.sum() * - np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred)))) - - def negative_gradient(self, y, pred, **kargs): - """Compute the residual (= negative gradient). - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - """ - return y - expit(pred.ravel()) - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - """Make a single Newton-Raphson step. 
- - our node estimate is given by: - - sum(w * (y - prob)) / sum(w * prob * (1 - prob)) - - we take advantage that: y - prob = residual - """ - terminal_region = np.where(terminal_regions == leaf)[0] - residual = residual.take(terminal_region, axis=0) - y = y.take(terminal_region, axis=0) - sample_weight = sample_weight.take(terminal_region, axis=0) - - numerator = np.sum(sample_weight * residual) - denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual)) - - # prevents overflow and division by zero - if abs(denominator) < 1e-150: - tree.value[leaf, 0, 0] = 0.0 - else: - tree.value[leaf, 0, 0] = numerator / denominator - - def _score_to_proba(self, score): - proba = np.ones((score.shape[0], 2), dtype=np.float64) - proba[:, 1] = expit(score.ravel()) - proba[:, 0] -= proba[:, 1] - return proba - - def _score_to_decision(self, score): - proba = self._score_to_proba(score) - return np.argmax(proba, axis=1) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class MultinomialDeviance(ClassificationLossFunction): - """Multinomial deviance loss function for multi-class classification. - - For multi-class classification we need to fit ``n_classes`` trees at - each stage. - - Parameters - ---------- - n_classes : int - Number of classes - """ - - is_multi_class = True - - def __init__(self, n_classes): - if n_classes < 3: - raise ValueError("{0:s} requires more than 2 classes.".format( - self.__class__.__name__)) - super().__init__(n_classes) - - def init_estimator(self): - return PriorProbabilityEstimator() - - def __call__(self, y, pred, sample_weight=None): - """Compute the Multinomial deviance. - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - # create one-hot label encoding - Y = np.zeros((y.shape[0], self.K), dtype=np.float64) - for k in range(self.K): - Y[:, k] = y == k - - if sample_weight is None: - return np.sum(-1 * (Y * pred).sum(axis=1) + - logsumexp(pred, axis=1)) - else: - return np.sum(-1 * sample_weight * (Y * pred).sum(axis=1) + - logsumexp(pred, axis=1)) - - def negative_gradient(self, y, pred, k=0, **kwargs): - """Compute negative gradient for the ``k``-th class. - - Parameters - ---------- - y : array, shape (n_samples,) - The target labels. - - pred : array, shape (n_samples,) - The predictions. - - k : int, optional (default=0) - The index of the class - """ - return y - np.nan_to_num(np.exp(pred[:, k] - - logsumexp(pred, axis=1))) - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - """Make a single Newton-Raphson step. 
""" - terminal_region = np.where(terminal_regions == leaf)[0] - residual = residual.take(terminal_region, axis=0) - y = y.take(terminal_region, axis=0) - sample_weight = sample_weight.take(terminal_region, axis=0) - - numerator = np.sum(sample_weight * residual) - numerator *= (self.K - 1) / self.K - - denominator = np.sum(sample_weight * (y - residual) * - (1.0 - y + residual)) - - # prevents overflow and division by zero - if abs(denominator) < 1e-150: - tree.value[leaf, 0, 0] = 0.0 - else: - tree.value[leaf, 0, 0] = numerator / denominator - - def _score_to_proba(self, score): - return np.nan_to_num( - np.exp(score - (logsumexp(score, axis=1)[:, np.newaxis]))) - - def _score_to_decision(self, score): - proba = self._score_to_proba(score) - return np.argmax(proba, axis=1) - - -@deprecated("All Losses in sklearn.ensemble.gradient_boosting are " - "deprecated in version " - "0.21 and will be removed in version 0.23.") -class ExponentialLoss(ClassificationLossFunction): - """Exponential loss function for binary classification. - - Same loss as AdaBoost. - - References - ---------- - Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007 - - Parameters - ---------- - n_classes : int - Number of classes. - """ - def __init__(self, n_classes): - if n_classes != 2: - raise ValueError("{0:s} requires 2 classes; got {1:d} class(es)" - .format(self.__class__.__name__, n_classes)) - # we only need to fit one tree for binary clf. - super().__init__(1) - - def init_estimator(self): - return ScaledLogOddsEstimator() - - def __call__(self, y, pred, sample_weight=None): - """Compute the exponential loss - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - - sample_weight : array-like, shape (n_samples,), optional - Sample weights. - """ - pred = pred.ravel() - if sample_weight is None: - return np.mean(np.exp(-(2. * y - 1.) * pred)) - else: - return (1.0 / sample_weight.sum() * - np.sum(sample_weight * np.exp(-(2 * y - 1) * pred))) - - def negative_gradient(self, y, pred, **kargs): - """Compute the residual (= negative gradient). - - Parameters - ---------- - y : array, shape (n_samples,) - True labels - - pred : array, shape (n_samples,) - Predicted labels - """ - y_ = -(2. * y - 1.) - return y_ * np.exp(y_ * pred.ravel()) - - def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, - residual, pred, sample_weight): - terminal_region = np.where(terminal_regions == leaf)[0] - pred = pred.take(terminal_region, axis=0) - y = y.take(terminal_region, axis=0) - sample_weight = sample_weight.take(terminal_region, axis=0) - - y_ = 2. * y - 1. - - numerator = np.sum(y_ * sample_weight * np.exp(-y_ * pred)) - denominator = np.sum(sample_weight * np.exp(-y_ * pred)) - - # prevents overflow and division by zero - if abs(denominator) < 1e-150: - tree.value[leaf, 0, 0] = 0.0 - else: - tree.value[leaf, 0, 0] = numerator / denominator - - def _score_to_proba(self, score): - proba = np.ones((score.shape[0], 2), dtype=np.float64) - proba[:, 1] = expit(2.0 * score.ravel()) - proba[:, 0] -= proba[:, 1] - return proba - - def _score_to_decision(self, score): - return (score.ravel() >= 0.0).astype(np.int) - - class VerboseReporter: """Reports verbose output to stdout. 
From 4cdbde308552b8700da30ed1c1c4f5085a24abde Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 4 Dec 2019 12:22:56 -0500 Subject: [PATCH 074/448] MNT remove deprecated ensemble.partial_dependence (#15789) --- doc/modules/classes.rst | 7 - sklearn/ensemble/__init__.py | 3 +- sklearn/ensemble/partial_dependence.py | 441 ------------------ .../ensemble/tests/test_partial_dependence.py | 283 ----------- 4 files changed, 1 insertion(+), 733 deletions(-) delete mode 100644 sklearn/ensemble/partial_dependence.py delete mode 100644 sklearn/ensemble/tests/test_partial_dependence.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f7d3fe6791407..fc4a962b214a5 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1625,10 +1625,3 @@ To be removed in 0.23 metrics.jaccard_similarity_score linear_model.logistic_regression_path utils.safe_indexing - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - ensemble.partial_dependence.partial_dependence - ensemble.partial_dependence.plot_partial_dependence diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index ae3f98db5cbf1..27acb2fbcf00a 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -21,7 +21,6 @@ from ._stacking import StackingClassifier from ._stacking import StackingRegressor -from . import partial_dependence __all__ = ["BaseEnsemble", "RandomForestClassifier", "RandomForestRegressor", @@ -31,4 +30,4 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "VotingRegressor", "StackingClassifier", "StackingRegressor", - "partial_dependence"] + ] diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py deleted file mode 100644 index 07f3eba29a136..0000000000000 --- a/sklearn/ensemble/partial_dependence.py +++ /dev/null @@ -1,441 +0,0 @@ -"""Partial dependence plots for tree ensembles. """ - -# Authors: Peter Prettenhofer -# License: BSD 3 clause - -# Note: function here are deprecated. We don't call the new versions because -# the API slightly changes (namely partial_dependence does not have the grid -# parameter anymore.) - -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles -from joblib import Parallel, delayed - -from ..utils.extmath import cartesian -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE -from ..utils import deprecated - -from ._gb import BaseGradientBoosting - - -__all__ = [ - 'partial_dependence', - 'plot_partial_dependence', -] - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the ``percentiles of ``X``. - - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. - - Parameters - ---------- - X : ndarray - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. - grid_resolution : int - The number of equally spaced points that are placed - on the grid. - - Returns - ------- - grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. - """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') - if not all(0. <= x <= 1. 
for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') - - axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], - num=grid_resolution, endpoint=True) - axes.append(axis) - - return cartesian(axes), axes - - -@deprecated("The function ensemble.partial_dependence has been deprecated " - "in favour of inspection.partial_dependence in 0.21 " - "and will be removed in 0.23.") -def partial_dependence(gbrt, target_variables, grid=None, X=None, - percentiles=(0.05, 0.95), grid_resolution=100): - """Partial dependence of ``target_variables``. - - Partial dependence plots show the dependence between the joint values - of the ``target_variables`` and the function represented - by the ``gbrt``. - - Read more in the :ref:`User Guide `. - - .. deprecated:: 0.21 - This function was deprecated in version 0.21 in favor of - :func:`sklearn.inspection.partial_dependence` and will be - removed in 0.23. - - Parameters - ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. - - target_variables : array-like, dtype=int - The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). - - grid : array-like of shape (n_points, n_target_variables) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). - - X : array-like of shape (n_samples, n_features) - The data on which ``gbrt`` was trained. It is used to generate - a ``grid`` for the ``target_variables``. The ``grid`` comprises - ``grid_resolution`` equally spaced points between the two - ``percentiles``. - - percentiles : (low, high), default=(0.05, 0.95) - The lower and upper percentile used create the extreme values - for the ``grid``. Only if ``X`` is not None. - - grid_resolution : int, default=100 - The number of equally spaced points on the ``grid``. - - Returns - ------- - pdp : array, shape=(n_classes, n_points) - The partial dependence function evaluated on the ``grid``. - For regression and binary classification ``n_classes==1``. - - axes : seq of ndarray or None - The axes with which the grid has been created or None if - the grid has been given. 
- - Examples - -------- - >>> samples = [[0, 0, 2], [1, 0, 0]] - >>> labels = [0, 1] - >>> from sklearn.ensemble import GradientBoostingClassifier - >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels) - >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2) - >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP - (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) - """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt) - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') - - target_variables = np.asarray(target_variables, dtype=np.int32, - order='C').ravel() - - if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): - raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features_ - 1)) - - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - grid, axes = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - assert grid is not None - # dont return axes if grid is given - axes = None - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - assert grid.shape[1] == target_variables.shape[0] - - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - tree.compute_partial_dependence(grid, target_variables, pdp[k]) - pdp *= gbrt.learning_rate - - return pdp, axes - - -@deprecated("The function ensemble.plot_partial_dependence has been " - "deprecated in favour of " - "sklearn.inspection.plot_partial_dependence in " - " 0.21 and will be removed in 0.23.") -def plot_partial_dependence(gbrt, X, features, feature_names=None, - label=None, n_cols=3, grid_resolution=100, - percentiles=(0.05, 0.95), n_jobs=None, - verbose=0, ax=None, line_kw=None, - contour_kw=None, **fig_kw): - """Partial dependence plots for ``features``. - - The ``len(features)`` plots are arranged in a grid with ``n_cols`` - columns. Two-way partial dependence plots are plotted as contour - plots. - - Read more in the :ref:`User Guide `. - - .. deprecated:: 0.21 - This function was deprecated in version 0.21 in favor of - :func:`sklearn.inspection.plot_partial_dependence` and will be - removed in 0.23. - - Parameters - ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. - - X : array-like of shape (n_samples, n_features) - The data on which ``gbrt`` was trained. - - features : seq of ints, strings, or tuples of ints or strings - If seq[i] is an int or a tuple with one int value, a one-way - PDP is created; if seq[i] is a tuple of two ints, a two-way - PDP is created. - If feature_names is specified and seq[i] is an int, seq[i] - must be < len(feature_names). - If seq[i] is a string, feature_names must be specified, and - seq[i] must be in feature_names. - - feature_names : seq of str - Name of each feature; feature_names[i] holds - the name of the feature with index i. - - label : object - The class label for which the PDPs should be computed. - Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``. 
- - n_cols : int - The number of columns in the grid plot (default: 3). - - grid_resolution : int, default=100 - The number of equally spaced points on the axes. - - percentiles : (low, high), default=(0.05, 0.95) - The lower and upper percentile used to create the extreme values - for the PDP axes. - - n_jobs : int or None, optional (default=None) - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - verbose : int - Verbose output during PD computations. Defaults to 0. - - ax : Matplotlib axis object, default None - An axis object onto which the plots will be drawn. - - line_kw : dict - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For one-way partial dependence plots. - - contour_kw : dict - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For two-way partial dependence plots. - - ``**fig_kw`` : dict - Dict with keywords passed to the figure() call. - Note that all keywords not recognized above will be automatically - included here. - - Returns - ------- - fig : figure - The Matplotlib Figure object. - - axs : seq of Axis objects - A seq of Axis objects, one for each subplot. - - Examples - -------- - >>> from sklearn.datasets import make_friedman1 - >>> from sklearn.ensemble import GradientBoostingRegressor - >>> X, y = make_friedman1() - >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) - >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP - ... - """ - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt) - - # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: - if label is None: - raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) - else: - # regression and binary classification - label_idx = 0 - - X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - # convert feature_names to list - if feature_names is None: - # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - - def convert_feature(fx): - if isinstance(fx, str): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return fx - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral, str)): - fxs = (fxs,) - try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) - except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - l = [] - # explicit loop so "i" is bound for 
exception below - for i in fxs: - l.append(feature_names[i]) - names.append(l) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' - .format(len(feature_names), i)) - - # compute PD functions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() - n_fx = len(axes) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if ax is None: - fig = plt.figure(**fig_kw) - else: - fig = ax.get_figure() - fig.clear() - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, - pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(axes) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(axes) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py deleted file mode 100644 index 84ff0c004b68b..0000000000000 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ /dev/null @@ -1,283 +0,0 @@ -""" -Testing for the partial dependence module. 
-""" -import pytest - -import numpy as np -from numpy.testing import assert_array_equal, assert_allclose - -from sklearn.utils._testing import assert_raises -from sklearn.ensemble.partial_dependence import partial_dependence -from sklearn.ensemble.partial_dependence import plot_partial_dependence -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GradientBoostingRegressor -from sklearn import datasets -from sklearn.utils._testing import ignore_warnings - - -# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved -pytestmark = pytest.mark.filterwarnings( - "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" - "matplotlib.*") - - -# toy sample -X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] -y = [-1, -1, -1, 1, 1, 1] -sample_weight = [1, 1, 1, 2, 2, 2] - -# also load the boston dataset -boston = datasets.load_boston() - -# also load the iris dataset -iris = datasets.load_iris() - - -@ignore_warnings(category=FutureWarning) -def test_partial_dependence_classifier(): - # Test partial dependence for classifier - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(X, y) - - pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5) - - # only 4 grid points instead of 5 because only 4 unique X[:,0] vals - assert pdp.shape == (1, 4) - assert axes[0].shape[0] == 4 - - # now with our own grid - X_ = np.asarray(X) - grid = np.unique(X_[:, 0]) - pdp_2, axes = partial_dependence(clf, [0], grid=grid) - - assert axes is None - assert_array_equal(pdp, pdp_2) - - # with trivial (no-op) sample weights - clf.fit(X, y, sample_weight=np.ones(len(y))) - - pdp_w, axes_w = partial_dependence(clf, [0], X=X, grid_resolution=5) - - assert pdp_w.shape == (1, 4) - assert axes_w[0].shape[0] == 4 - assert_allclose(pdp_w, pdp) - - # with non-trivial sample weights - clf.fit(X, y, sample_weight=sample_weight) - - pdp_w2, axes_w2 = partial_dependence(clf, [0], X=X, grid_resolution=5) - - assert pdp_w2.shape == (1, 4) - assert axes_w2[0].shape[0] == 4 - assert np.all(np.abs(pdp_w2 - pdp_w) / np.abs(pdp_w) > 0.1) - - -@ignore_warnings(category=FutureWarning) -def test_partial_dependence_multiclass(): - # Test partial dependence for multi-class classifier - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, iris.target) - - grid_resolution = 25 - n_classes = clf.n_classes_ - pdp, axes = partial_dependence( - clf, [0], X=iris.data, grid_resolution=grid_resolution) - - assert pdp.shape == (n_classes, grid_resolution) - assert len(axes) == 1 - assert axes[0].shape[0] == grid_resolution - - -@ignore_warnings(category=FutureWarning) -def test_partial_dependence_regressor(): - # Test partial dependence for regressor - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - - grid_resolution = 25 - pdp, axes = partial_dependence( - clf, [0], X=boston.data, grid_resolution=grid_resolution) - - assert pdp.shape == (1, grid_resolution) - assert axes[0].shape[0] == grid_resolution - - -@ignore_warnings(category=FutureWarning) -def test_partial_dependence_sample_weight(): - # Test near perfect correlation between partial dependence and diagonal - # when sample weights emphasize y = x predictions - N = 1000 - rng = np.random.RandomState(123456) - mask = rng.randint(2, size=N, dtype=bool) - - x = rng.rand(N) - # set y = x on mask and y = -x outside - y = x.copy() - y[~mask] = -y[~mask] - X = np.c_[mask, x] - # sample weights to emphasize 
data points where y = x - sample_weight = np.ones(N) - sample_weight[mask] = 1000. - - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(X, y, sample_weight=sample_weight) - - grid = np.arange(0, 1, 0.01) - pdp = partial_dependence(clf, [1], grid=grid) - - assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99 - - -@ignore_warnings(category=FutureWarning) -def test_partial_dependecy_input(): - # Test input validation of partial dependence. - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(X, y) - - assert_raises(ValueError, partial_dependence, - clf, [0], grid=None, X=None) - - assert_raises(ValueError, partial_dependence, - clf, [0], grid=[0, 1], X=X) - - # first argument must be an instance of BaseGradientBoosting - assert_raises(ValueError, partial_dependence, - {}, [0], X=X) - - # Gradient boosting estimator must be fit - assert_raises(ValueError, partial_dependence, - GradientBoostingClassifier(), [0], X=X) - - assert_raises(ValueError, partial_dependence, clf, [-1], X=X) - - assert_raises(ValueError, partial_dependence, clf, [100], X=X) - - # wrong ndim for grid - grid = np.random.rand(10, 2, 1) - assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) - - -@ignore_warnings(category=FutureWarning) -@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') -# matplotlib Python3.7 warning -def test_plot_partial_dependence(pyplot): - # Test partial dependence plot function. - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with str features and array feature names - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with list feature_names - feature_names = boston.feature_names.tolist() - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - -@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') -# matplotlib Python3.7 warning -@ignore_warnings(category=FutureWarning) -def test_plot_partial_dependence_input(pyplot): - # Test partial dependence plot function input checks. 
- clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - - # not fitted yet - assert_raises(ValueError, plot_partial_dependence, - clf, X, [0]) - - clf.fit(X, y) - - assert_raises(ValueError, plot_partial_dependence, - clf, np.array(X)[:, :0], [0]) - - # first argument must be an instance of BaseGradientBoosting - assert_raises(ValueError, plot_partial_dependence, - {}, X, [0]) - - # must be larger than -1 - assert_raises(ValueError, plot_partial_dependence, - clf, X, [-1]) - - # too large feature value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [100]) - - # str feature but no feature_names - assert_raises(ValueError, plot_partial_dependence, - clf, X, ['foobar']) - - # not valid features value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [{'foo': 'bar'}]) - - -@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') -# matplotlib Python3.7 warning -@ignore_warnings(category=FutureWarning) -def test_plot_partial_dependence_multiclass(pyplot): - # Test partial dependence plot function on multi-class input. - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, iris.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - label=0, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - # now with symbol labels - target = iris.target_names[iris.target] - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - label='setosa', - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - # label not in gbrt.classes_ - assert_raises(ValueError, plot_partial_dependence, - clf, iris.data, [0, 1], label='foobar', - grid_resolution=grid_resolution) - - # label not provided - assert_raises(ValueError, plot_partial_dependence, - clf, iris.data, [0, 1], - grid_resolution=grid_resolution) - - -@pytest.mark.parametrize( - "func, params", - [(partial_dependence, {'target_variables': [0], 'X': boston.data}), - (plot_partial_dependence, {'X': boston.data, 'features': [0, 1, (0, 1)]})] -) -def test_raise_deprecation_warning(pyplot, func, params): - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - grid_resolution = 25 - - warn_msg = "The function ensemble.{} has been deprecated".format( - func.__name__ - ) - with pytest.warns(FutureWarning, match=warn_msg): - func(clf, **params, grid_resolution=grid_resolution) From 5573abbb998b03105a541be1ac2095f2c2abc026 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 4 Dec 2019 12:24:10 -0500 Subject: [PATCH 075/448] MNT Removed deprecated logistic_regression_path (#15791) --- doc/modules/classes.rst | 1 - sklearn/linear_model/__init__.py | 4 +- sklearn/linear_model/_logistic.py | 172 -------------------- sklearn/linear_model/tests/test_logistic.py | 8 - 4 files changed, 1 insertion(+), 184 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index fc4a962b214a5..14ddf328b4749 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1623,5 +1623,4 @@ To be removed in 0.23 utils.delayed metrics.calinski_harabaz_score metrics.jaccard_similarity_score - linear_model.logistic_regression_path utils.safe_indexing diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 01c686a69e970..59d0600d508d0 100644 
--- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -20,8 +20,7 @@ from ._stochastic_gradient import SGDClassifier, SGDRegressor from ._ridge import (Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, ridge_regression) -from ._logistic import (LogisticRegression, LogisticRegressionCV, - logistic_regression_path) +from ._logistic import LogisticRegression, LogisticRegressionCV from ._omp import (orthogonal_mp, orthogonal_mp_gram, OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV) from ._passive_aggressive import PassiveAggressiveClassifier @@ -71,7 +70,6 @@ 'lars_path', 'lars_path_gram', 'lasso_path', - 'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index de7ac323833e8..fd2eba7c7df82 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -476,178 +476,6 @@ def _check_multi_class(multi_class, solver, n_classes): return multi_class -@deprecated('logistic_regression_path was deprecated in version 0.21 and ' - 'will be removed in version 0.23.0') -def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, - max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', coef=None, - class_weight=None, dual=False, penalty='l2', - intercept_scaling=1., multi_class='auto', - random_state=None, check_input=True, - max_squared_sum=None, sample_weight=None, - l1_ratio=None): - """Compute a Logistic Regression model for a list of regularization - parameters. - - This is an implementation that uses the result of the previous model - to speed up computations along the set of solutions, making it faster - than sequentially calling LogisticRegression for the different parameters. - Note that there will be no speedup with liblinear solver, since it does - not handle warm-starting. - - .. deprecated:: 0.21 - ``logistic_regression_path`` was deprecated in version 0.21 and will - be removed in 0.23. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) - Input data. - - y : array-like, shape (n_samples,) or (n_samples, n_targets) - Input data, target values. - - pos_class : int, None - The class with respect to which we perform a one-vs-all fit. - If None, then it is assumed that the given problem is binary. - - Cs : int | array-like, shape (n_cs,) - List of values for the regularization parameter or integer specifying - the number of regularization parameters that should be used. In this - case, the parameters will be chosen in a logarithmic scale between - 1e-4 and 1e4. - - fit_intercept : bool - Whether to fit an intercept for the model. In this case the shape of - the returned array is (n_cs, n_features + 1). - - max_iter : int - Maximum number of iterations for the solver. - - tol : float - Stopping criterion. For the newton-cg and lbfgs solvers, the iteration - will stop when ``max{|g_i | i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient. - - verbose : int - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. - - solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'} - Numerical solver to use. - - coef : array-like, shape (n_features,), default None - Initialization value for coefficients of logistic regression. - Useless for liblinear solver. 
- - class_weight : dict or 'balanced', optional - Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. - - The "balanced" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies in the input data - as ``n_samples / (n_classes * np.bincount(y))``. - - Note that these weights will be multiplied with sample_weight (passed - through the fit method) if sample_weight is specified. - - dual : bool - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when - n_samples > n_features. - - penalty : str, 'l1', 'l2', or 'elasticnet' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is - only supported by the 'saga' solver. - - intercept_scaling : float, default 1. - Useful only when the solver 'liblinear' is used - and self.fit_intercept is set to True. In this case, x becomes - [x, self.intercept_scaling], - i.e. a "synthetic" feature with constant value equal to - intercept_scaling is appended to the instance vector. - The intercept becomes ``intercept_scaling * synthetic_feature_weight``. - - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. - - multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' - If the option chosen is 'ovr', then a binary problem is fit for each - label. For 'multinomial' the loss minimised is the multinomial loss fit - across the entire probability distribution, *even when the data is - binary*. 'multinomial' is unavailable when solver='liblinear'. - 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', - and otherwise selects 'multinomial'. - - .. versionadded:: 0.18 - Stochastic Average Gradient descent solver for 'multinomial' case. - .. versionchanged:: 0.22 - Default changed from 'ovr' to 'auto' in 0.22. - - random_state : int, RandomState instance or None, optional, default None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag' or - 'liblinear'. - - check_input : bool, default True - If False, the input arrays X and y will not be checked. - - max_squared_sum : float, default None - Maximum squared sum of X over samples. Used only in SAG solver. - If None, it will be computed, going through all the samples. - The value should be precomputed to speed up cross validation. - - sample_weight : array-like, shape(n_samples,) optional - Array of weights that are assigned to individual samples. - If not provided, then each sample is given unit weight. - - l1_ratio : float or None, optional (default=None) - The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only - used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent - to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent - to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a - combination of L1 and L2. 
- - Returns - ------- - coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) - List of coefficients for the Logistic Regression model. If - fit_intercept is set to True then the second dimension will be - n_features + 1, where the last item represents the intercept. For - ``multiclass='multinomial'``, the shape is (n_classes, n_cs, - n_features) or (n_classes, n_cs, n_features + 1). - - Cs : ndarray - Grid of Cs used for cross-validation. - - n_iter : array, shape (n_cs,) - Actual number of iteration for each Cs. - - Notes - ----- - You might get slightly different results with the solver liblinear than - with the others since this uses LIBLINEAR which penalizes the intercept. - - .. versionchanged:: 0.19 - The "copy" parameter was removed. - """ - - return _logistic_regression_path( - X, y, pos_class=None, Cs=10, fit_intercept=True, max_iter=100, - tol=1e-4, verbose=0, solver='lbfgs', coef=None, class_weight=None, - dual=False, penalty='l2', intercept_scaling=1., multi_class='auto', - random_state=None, check_input=True, max_squared_sum=None, - sample_weight=None, l1_ratio=None) - - def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, max_iter=100, tol=1e-4, verbose=0, solver='lbfgs', coef=None, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 4886870806531..3c4ddda1d7b0e 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -33,7 +33,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model._logistic import ( LogisticRegression, - logistic_regression_path, _logistic_regression_path, LogisticRegressionCV, _logistic_loss_and_grad, _logistic_grad_hess, _multinomial_grad_hess, _logistic_loss, @@ -1738,13 +1737,6 @@ def fit(X, y, **kw): solver=solver).coef_) -def test_logistic_regression_path_deprecation(): - - assert_warns_message(FutureWarning, - "logistic_regression_path was deprecated", - logistic_regression_path, X, Y1) - - @pytest.mark.parametrize('solver', ('lbfgs', 'newton-cg', 'sag', 'saga')) def test_penalty_none(solver): # - Make sure warning is raised if penalty='none' and C is set to a From ff6f880755d12a380dbdac99f6b9d169aee8b588 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 12:25:56 -0500 Subject: [PATCH 076/448] STY Minior change on code padding in website theme (#15768) --- doc/themes/scikit-learn-modern/static/css/theme.css | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 9fb35c73c27bd..590d2679223b7 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -37,6 +37,7 @@ code { background-color: #ecf0f3; border-radius: 0.2rem; white-space: nowrap; + padding: 0.15rem; } nav { From 70ae89ecb2bccb8b7d6426770fcfe4b9bb40376f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Dec 2019 10:49:00 -0500 Subject: [PATCH 077/448] MNT Removed deprecated metrics (#15794) --- doc/modules/classes.rst | 2 - sklearn/metrics/__init__.py | 4 - sklearn/metrics/_classification.py | 74 ------------------- sklearn/metrics/cluster/__init__.py | 6 +- sklearn/metrics/cluster/_unsupervised.py | 7 -- .../cluster/tests/test_unsupervised.py | 10 --- sklearn/metrics/tests/test_classification.py | 20 ----- 7 files changed, 2 insertions(+), 121 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 
14ddf328b4749..531c84303741d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1621,6 +1621,4 @@ To be removed in 0.23 utils.cpu_count utils.delayed - metrics.calinski_harabaz_score - metrics.jaccard_similarity_score utils.safe_indexing diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 69f34c492e3a8..8bcb047ec8161 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -24,7 +24,6 @@ from ._classification import fbeta_score from ._classification import hamming_loss from ._classification import hinge_loss -from ._classification import jaccard_similarity_score from ._classification import jaccard_score from ._classification import log_loss from ._classification import matthews_corrcoef @@ -48,7 +47,6 @@ from .cluster import silhouette_samples from .cluster import silhouette_score from .cluster import calinski_harabasz_score -from .cluster import calinski_harabaz_score from .cluster import v_measure_score from .cluster import davies_bouldin_score @@ -93,7 +91,6 @@ 'auc', 'average_precision_score', 'balanced_accuracy_score', - 'calinski_harabaz_score', 'calinski_harabasz_score', 'check_scoring', 'classification_report', @@ -117,7 +114,6 @@ 'homogeneity_completeness_v_measure', 'homogeneity_score', 'jaccard_score', - 'jaccard_similarity_score', 'label_ranking_average_precision_score', 'label_ranking_loss', 'log_loss', diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 8a975a6f59802..666f110aee6fc 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -601,80 +601,6 @@ class labels [2]_. return 1 - k -def jaccard_similarity_score(y_true, y_pred, normalize=True, - sample_weight=None): - """Jaccard similarity coefficient score - - .. deprecated:: 0.21 - This is deprecated to be removed in 0.23, since its handling of - binary and multiclass inputs was broken. `jaccard_score` has an API - that is consistent with precision_score, f_score, etc. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : 1d array-like, or label indicator array / sparse matrix - Ground truth (correct) labels. - - y_pred : 1d array-like, or label indicator array / sparse matrix - Predicted labels, as returned by a classifier. - - normalize : bool, optional (default=True) - If ``False``, return the sum of the Jaccard similarity coefficient - over the sample set. Otherwise, return the average of Jaccard - similarity coefficient. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. - - Returns - ------- - score : float - If ``normalize == True``, return the average Jaccard similarity - coefficient, else it returns the sum of the Jaccard similarity - coefficient over the sample set. - - The best performance is 1 with ``normalize == True`` and the number - of samples with ``normalize == False``. - - See also - -------- - accuracy_score, hamming_loss, zero_one_loss - - Notes - ----- - In binary and multiclass classification, this function is equivalent - to the ``accuracy_score``. It differs in the multilabel classification - problem. - - References - ---------- - .. [1] `Wikipedia entry for the Jaccard index - `_ - """ - warnings.warn('jaccard_similarity_score has been deprecated and replaced ' - 'with jaccard_score. It will be removed in version 0.23. 
' - 'This implementation has surprising behavior for binary ' - 'and multiclass classification tasks.', - FutureWarning) - - # Compute accuracy for each possible representation - y_type, y_true, y_pred = _check_targets(y_true, y_pred) - check_consistent_length(y_true, y_pred, sample_weight) - if y_type.startswith('multilabel'): - with np.errstate(divide='ignore', invalid='ignore'): - # oddly, we may get an "invalid" rather than a "divide" error here - pred_or_true = count_nonzero(y_true + y_pred, axis=1) - pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] = 1.0 - else: - score = y_true == y_pred - - return _weighted_sum(score, sample_weight, normalize) - - def jaccard_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None): """Jaccard similarity coefficient score diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 9d4b2ac87a974..b45c1a8f21774 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -20,7 +20,6 @@ from ._unsupervised import silhouette_samples from ._unsupervised import silhouette_score from ._unsupervised import calinski_harabasz_score -from ._unsupervised import calinski_harabaz_score from ._unsupervised import davies_bouldin_score from ._bicluster import consensus_score @@ -29,6 +28,5 @@ "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", "fowlkes_mallows_score", "entropy", "silhouette_samples", - "silhouette_score", "calinski_harabaz_score", - "calinski_harabasz_score", "davies_bouldin_score", - "consensus_score"] + "silhouette_score", "calinski_harabasz_score", + "davies_bouldin_score", "consensus_score"] diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index f341f3e80b5c8..d6fc6fbc82ab0 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -299,13 +299,6 @@ def calinski_harabasz_score(X, labels): (intra_disp * (n_labels - 1.))) -@deprecated("Function 'calinski_harabaz_score' has been renamed to " - "'calinski_harabasz_score' " - "and will be removed in version 0.23.") -def calinski_harabaz_score(X, labels): - return calinski_harabasz_score(X, labels) - - def davies_bouldin_score(X, labels): """Computes the Davies-Bouldin score. 
diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 6a00f771273cb..f169a9242daf0 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -10,7 +10,6 @@ from sklearn.metrics.cluster import silhouette_samples from sklearn.metrics import pairwise_distances from sklearn.metrics.cluster import calinski_harabasz_score -from sklearn.metrics.cluster import calinski_harabaz_score from sklearn.metrics.cluster import davies_bouldin_score @@ -221,15 +220,6 @@ def test_calinski_harabasz_score(): 45 * (40 - 4) / (5 * (4 - 1))) -def test_deprecated_calinski_harabaz_score(): - depr_message = ("Function 'calinski_harabaz_score' has been renamed " - "to 'calinski_harabasz_score' " - "and will be removed in version 0.23.") - assert_warns_message(FutureWarning, depr_message, - calinski_harabaz_score, - np.ones((10, 2)), [0] * 5 + [1] * 5) - - def test_davies_bouldin_score(): assert_raises_on_only_one_label(davies_bouldin_score) assert_raises_on_all_points_same_cluster(davies_bouldin_score) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 05aab00a1ce4f..66ea486f955b7 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -38,7 +38,6 @@ from sklearn.metrics import hamming_loss from sklearn.metrics import hinge_loss from sklearn.metrics import jaccard_score -from sklearn.metrics import jaccard_similarity_score from sklearn.metrics import log_loss from sklearn.metrics import matthews_corrcoef from sklearn.metrics import precision_recall_fscore_support @@ -2238,22 +2237,3 @@ def test_balanced_accuracy_score(y_true, y_pred): adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True) chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0])) assert adjusted == (balanced - chance) / (1 - chance) - - -def test_multilabel_jaccard_similarity_score_deprecation(): - # Dense label indicator matrix format - y1 = np.array([[0, 1, 1], [1, 0, 1]]) - y2 = np.array([[0, 0, 1], [1, 0, 1]]) - - # size(y1 \inter y2) = [1, 2] - # size(y1 \union y2) = [2, 2] - - jss = partial(assert_warns, FutureWarning, - jaccard_similarity_score) - assert jss(y1, y2) == 0.75 - assert jss(y1, y1) == 1 - assert jss(y2, y2) == 1 - assert jss(y2, np.logical_not(y2)) == 0 - assert jss(y1, np.logical_not(y1)) == 0 - assert jss(y1, np.zeros(y1.shape)) == 0 - assert jss(y2, np.zeros(y1.shape)) == 0 From a83b8e0d486acb6d0958602dc3b8b48de151e44a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 5 Dec 2019 23:39:18 +0100 Subject: [PATCH 078/448] DOC Fix yticklabels order in permutation importances example (#15799) * Fix yticklabels order in permutation importances example * Trigger ci --- .../inspection/plot_permutation_importance_multicollinear.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index eb8b591bb4f2b..5f832ffbd4228 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -60,11 +60,11 @@ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) ax1.barh(tree_indices, clf.feature_importances_[tree_importance_sorted_idx], height=0.7) -ax1.set_yticklabels(data.feature_names) 
+ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx]) ax1.set_yticks(tree_indices) ax1.set_ylim((0, len(clf.feature_importances_))) ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False, - labels=data.feature_names) + labels=data.feature_names[perm_sorted_idx]) fig.tight_layout() plt.show() From 5e66d8ee2e59fb8caaac296666bf9f39b0883def Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 5 Dec 2019 18:02:53 -0500 Subject: [PATCH 079/448] STY Update wrapper width (#15793) --- doc/themes/scikit-learn-modern/static/css/theme.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 590d2679223b7..782800eb31915 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -513,7 +513,7 @@ div.sk-sidebar-toc-logo { div.sk-sidebar-toc-wrapper { font-size: 0.9rem; - width: 120%; + width: 252px; overflow-x: hidden; overflow-y: scroll; height: 100vh; From 6f701e23e379c15b693ccafa6064dc093d472fe7 Mon Sep 17 00:00:00 2001 From: Matt Hall Date: Fri, 6 Dec 2019 08:15:37 +0000 Subject: [PATCH 080/448] DOC Long sentence was hard to parse and ambiguous in _classification.py (#15769) --- sklearn/metrics/_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 666f110aee6fc..75ce29428cbe9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1842,10 +1842,10 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, The reported averages include macro average (averaging the unweighted mean per label), weighted average (averaging the support-weighted mean - per label), sample average (only for multilabel classification) and - micro average (averaging the total true positives, false negatives and - false positives) it is only shown for multi-label or multi-class - with a subset of classes because it is accuracy otherwise. + per label), and sample average (only for multilabel classification). + Micro average (averaging the total true positives, false negatives and + false positives) is only shown for multi-label or multi-class + with a subset of classes, because it corresponds to accuracy otherwise. See also :func:`precision_recall_fscore_support` for more details on averages. From 3e26ea3cf339ae4a5e2b6bd6af1594ad992a6717 Mon Sep 17 00:00:00 2001 From: Kathryn Poole Date: Fri, 6 Dec 2019 12:11:37 +0000 Subject: [PATCH 081/448] DOC Removed duplicate 'classes_' attribute in Naive Bayes classifiers (#15811) --- sklearn/naive_bayes.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 8ebf19125dbf4..585ba69fbb1ce 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -157,9 +157,6 @@ class labels known to the classifier epsilon_ : float absolute additive value to variances - classes_ : array-like, shape (n_classes,) - Unique class labels. - Examples -------- >>> import numpy as np @@ -718,9 +715,6 @@ class MultinomialNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. - classes_ : array-like, shape (n_classes,) - Unique class labels. - Examples -------- >>> import numpy as np @@ -828,9 +822,6 @@ class ComplementNB(_BaseDiscreteNB): Number of samples encountered for each feature during fitting. 
This value is weighted by the sample weight when provided. - classes_ : array of shape (n_classes,) - The classes labels. - Examples -------- >>> import numpy as np @@ -939,9 +930,6 @@ class BernoulliNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. - classes_ : array of shape (n_classes,) - The classes labels. - See Also ---------- MultinomialNB: The multinomial Naive Bayes classifier is \ From 1b1c869ef3d5443d2104a1ea444fa41b86d6e3a7 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 07:54:12 -0500 Subject: [PATCH 082/448] BUG Fixes pandas dataframe bug with boolean dtypes (#15797) --- doc/whats_new/v0.22.rst | 19 +++++++++++++++++++ sklearn/utils/tests/test_validation.py | 21 +++++++++++++++++++++ sklearn/utils/validation.py | 9 +++++++-- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7b0c031f9196b..af08b832e9f6f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -2,6 +2,25 @@ .. currentmodule:: sklearn +.. _changes_0_22_1: + +Version 0.22.1 +============== + +**In Development** + +This is a bug-fix release to primarily resolve some packaging issues in version +0.22.0. It also includes minor documentation improvements and some bug fixes. + +Changelog +--------- + +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 56efb98a8b2d8..bdd31f9c4859f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -826,6 +826,27 @@ def test_check_dataframe_warns_on_dtype(): assert len(record) == 0 +def test_check_dataframe_mixed_float_dtypes(): + # pandas dataframe will coerce a boolean into a object, this is a mismatch + # with np.result_type which will return a float + # check_array needs to explicitly check for bool dtype in a dataframe for + # this situation + # https://github.com/scikit-learn/scikit-learn/issues/15787 + + pd = importorskip("pandas") + df = pd.DataFrame({ + 'int': [1, 2, 3], + 'float': [0, 0.1, 2.1], + 'bool': [True, False, True]}, columns=['int', 'float', 'bool']) + + array = check_array(df, dtype=(np.float64, np.float32, np.float16)) + expected_array = np.array( + [[1.0, 0.0, 1.0], + [2.0, 0.1, 0.0], + [3.0, 2.1, 1.0]], dtype=np.float) + assert_allclose_dense_sparse(array, expected_array) + + class DummyMemory: def cache(self, func): return func diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 424cf4b5180a3..fb34f3b3cccbd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -454,9 +454,14 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. 
dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - dtypes_orig = np.array(array.dtypes) + dtypes_orig = list(array.dtypes) + # pandas boolean dtype __array__ interface coerces bools to objects + for i, dtype_iter in enumerate(dtypes_orig): + if dtype_iter.kind == 'b': + dtypes_orig[i] = np.object + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): - dtype_orig = np.result_type(*array.dtypes) + dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From ad80d3159a3621367e074f1b781c704193e5c27a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 6 Dec 2019 07:59:46 -0500 Subject: [PATCH 083/448] DEP Remove deprecated joblib tools in utils (#15792) --- doc/modules/classes.rst | 11 +---------- sklearn/utils/__init__.py | 26 +------------------------- sklearn/utils/tests/test_utils.py | 28 ---------------------------- 3 files changed, 2 insertions(+), 63 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 531c84303741d..f8e5195cc9174 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1605,20 +1605,11 @@ Utilities from joblib: Recently deprecated =================== -To be removed in 0.23 +To be removed in 0.24 --------------------- -.. autosummary:: - :toctree: generated/ - :template: deprecated_class.rst - - utils.Memory - utils.Parallel - .. autosummary:: :toctree: generated/ :template: deprecated_function.rst - utils.cpu_count - utils.delayed utils.safe_indexing diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4d4ef606341ca..2f5384e9bac5f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -39,29 +39,6 @@ parallel_backend = _joblib.parallel_backend register_parallel_backend = _joblib.register_parallel_backend -# deprecate the joblib API in sklearn in favor of using directly joblib -msg = ("deprecated in version 0.20.1 to be removed in version 0.23. " - "Please import this functionality directly from joblib, which can " - "be installed with: pip install joblib.") -deprecate = deprecated(msg) - -delayed = deprecate(_joblib.delayed) -cpu_count = deprecate(_joblib.cpu_count) -hash = deprecate(_joblib.hash) -effective_n_jobs = deprecate(_joblib.effective_n_jobs) - - -# for classes, deprecated will change the object in _joblib module so we need -# to subclass them. 
-@deprecate -class Memory(_joblib.Memory): - pass - - -@deprecate -class Parallel(_joblib.Parallel): - pass - __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", @@ -70,8 +47,7 @@ class Parallel(_joblib.Parallel): "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", "check_scalar", 'indexable', "check_symmetric", "indices_to_mask", "deprecated", - "cpu_count", "Parallel", "Memory", "delayed", "parallel_backend", - "register_parallel_backend", "hash", "effective_n_jobs", + "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", ] diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 8031245105571..2e2711f595d11 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -637,20 +637,6 @@ def dummy_func(): def test_deprecation_joblib_api(tmpdir): - def check_warning(*args, **kw): - return assert_warns_message( - FutureWarning, "deprecated in version 0.20.1", - *args, **kw) - - # Ensure that the joblib API is deprecated in sklearn.util - from sklearn.utils import Parallel, Memory, delayed - from sklearn.utils import cpu_count, hash, effective_n_jobs - check_warning(Memory, str(tmpdir)) - check_warning(hash, 1) - check_warning(Parallel) - check_warning(cpu_count) - check_warning(effective_n_jobs, 1) - check_warning(delayed, dummy_func) # Only parallel_backend and register_parallel_backend are not deprecated in # sklearn.utils @@ -658,19 +644,5 @@ def check_warning(*args, **kw): assert_no_warnings(parallel_backend, 'loky', None) assert_no_warnings(register_parallel_backend, 'failing', None) - # Ensure that the deprecation have no side effect in sklearn.utils._joblib - from sklearn.utils._joblib import Parallel, Memory, delayed - from sklearn.utils._joblib import cpu_count, hash, effective_n_jobs - from sklearn.utils._joblib import parallel_backend - from sklearn.utils._joblib import register_parallel_backend - assert_no_warnings(Memory, str(tmpdir)) - assert_no_warnings(hash, 1) - assert_no_warnings(Parallel) - assert_no_warnings(cpu_count) - assert_no_warnings(effective_n_jobs, 1) - assert_no_warnings(delayed, dummy_func) - assert_no_warnings(parallel_backend, 'loky', None) - assert_no_warnings(register_parallel_backend, 'failing', None) - from sklearn.utils._joblib import joblib del joblib.parallel.BACKENDS['failing'] From 4ce39dbc699cd9cd33ab2b67ca029698713aec3e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 08:18:56 -0500 Subject: [PATCH 084/448] BUG Returns only public estimators in all_estimators (#15380) --- sklearn/tests/test_common.py | 2 - sklearn/utils/__init__.py | 40 ++++++++++++-------- sklearn/utils/tests/test_estimator_checks.py | 9 +++++ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 221dd52834c90..9fc3075c5fe28 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -78,8 +78,6 @@ def _tested_estimators(): for name, Estimator in all_estimators(): if issubclass(Estimator, BiclusterMixin): continue - if name.startswith("_"): - continue try: estimator = _construct_instance(Estimator) except SkipTest: diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 2f5384e9bac5f..93aca01ffab8b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ import pkgutil import inspect +from importlib import import_module from operator import 
itemgetter from collections.abc import Sequence from contextlib import contextmanager @@ -12,6 +13,7 @@ import platform import struct import timeit +from pathlib import Path import warnings import numpy as np @@ -1131,7 +1133,6 @@ def all_estimators(include_meta_estimators=None, and ``class`` is the actuall type of the class. """ # lazy import to avoid circular imports from sklearn.base - import sklearn from ._testing import ignore_warnings from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin) @@ -1159,20 +1160,29 @@ def is_abstract(c): DeprecationWarning) all_classes = [] - # get parent folder - path = sklearn.__path__ - for importer, modname, ispkg in pkgutil.walk_packages( - path=path, prefix='sklearn.', onerror=lambda x: None): - if ".tests." in modname or "externals" in modname: - continue - if IS_PYPY and ('_svmlight_format' in modname or - 'feature_extraction._hashing' in modname): - continue - # Ignore deprecation warnings triggered at import time. - with ignore_warnings(category=FutureWarning): - module = __import__(modname, fromlist="dummy") - classes = inspect.getmembers(module, inspect.isclass) - all_classes.extend(classes) + modules_to_ignore = {"tests", "externals", "setup", "conftest"} + root = str(Path(__file__).parent.parent) # sklearn package + # Ignore deprecation warnings triggered at import time and from walking + # packages + with ignore_warnings(category=FutureWarning): + for importer, modname, ispkg in pkgutil.walk_packages( + path=[root], prefix='sklearn.'): + mod_parts = modname.split(".") + if (any(part in modules_to_ignore for part in mod_parts) + or '._' in modname): + continue + module = import_module(modname) + classes = inspect.getmembers(module, inspect.isclass) + classes = [(name, est_cls) for name, est_cls in classes + if not name.startswith("_")] + + # TODO: Remove when FeatureHasher is implemented in PYPY + # Skips FeatureHasher for PYPY + if IS_PYPY and 'feature_extraction' in modname: + classes = [(name, est_cls) for name, est_cls in classes + if name == "FeatureHasher"] + + all_classes.extend(classes) all_classes = set(all_classes) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index f0c014829483f..15b423d6e0ce8 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -34,6 +34,7 @@ from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils import all_estimators class CorrectNotFittedError(ValueError): @@ -572,6 +573,14 @@ def test_check_class_weight_balanced_linear_classifier(): BadBalancedWeightsClassifier) +def test_all_estimators_all_public(): + # all_estimator should not fail when pytest is not installed and return + # only public estimators + estimators = all_estimators() + for est in estimators: + assert not est.__class__.__name__.startswith("_") + + if __name__ == '__main__': # This module is run as a script to check that we have no dependency on # pytest for estimator checks. 
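[Editor's note] The patch above (#15380) makes ``all_estimators`` walk the package tree while skipping test and private modules, so only public estimator classes are returned. A minimal sketch of how that guarantee can be checked from an interactive session; this snippet is not part of the patch and the exact first entry is environment dependent:

    >>> from sklearn.utils import all_estimators
    >>> ests = all_estimators()                      # list of (name, class) pairs
    >>> ests[0]                                      # doctest: +SKIP
    ('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>)
    >>> any(name.startswith('_') for name, cls in ests)
    False
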
From e4c0adaf7bdf614f2dafb6a6803545bd9a9bf0bb Mon Sep 17 00:00:00 2001 From: lucyleeow Date: Fri, 6 Dec 2019 17:28:30 +0100 Subject: [PATCH 085/448] DOC improve doc for multiclass and types_of_target (#15333) --- doc/modules/multiclass.rst | 197 +++++++++++++++++++++++++++++-------- 1 file changed, 155 insertions(+), 42 deletions(-) diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 195ecc0adcf6f..5613fc2334e73 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -14,45 +14,138 @@ Multiclass and multilabel algorithms The :mod:`sklearn.multiclass` module implements *meta-estimators* to solve ``multiclass`` and ``multilabel`` classification problems -by decomposing such problems into binary classification problems. Multitarget +by decomposing such problems into binary classification problems. ``multioutput`` regression is also supported. -- **Multiclass classification** means a classification task with more than - two classes; e.g., classify a set of images of fruits which may be oranges, - apples, or pears. Multiclass classification makes the assumption that each - sample is assigned to one and only one label: a fruit can be either an - apple or a pear but not both at the same time. - -- **Multilabel classification** assigns to each sample a set of target - labels. This can be thought as predicting properties of a data-point - that are not mutually exclusive, such as topics that are relevant for a - document. A text might be about any of religion, politics, finance or - education at the same time or none of these. - -- **Multioutput regression** assigns each sample a set of target - values. This can be thought of as predicting several properties - for each data-point, such as wind direction and magnitude at a - certain location. - -- **Multioutput-multiclass classification** and **multi-task classification** - means that a single estimator has to handle several joint classification - tasks. This is both a generalization of the multi-label classification - task, which only considers binary classification, as well as a - generalization of the multi-class classification task. *The output format - is a 2d numpy array or sparse matrix.* - - The set of labels can be different for each output variable. - For instance, a sample could be assigned "pear" for an output variable that - takes possible values in a finite set of species such as "pear", "apple"; - and "blue" or "green" for a second output variable that takes possible values - in a finite set of colors such as "green", "red", "blue", "yellow"... - - This means that any classifiers handling multi-output - multiclass or multi-task classification tasks, - support the multi-label classification task as a special case. - Multi-task classification is similar to the multi-output - classification task with different model formulations. For - more information, see the relevant estimator documentation. +- **Multiclass classification**: classification task with more than two classes. + Each sample can only be labelled as one class. + + For example, classification using features extracted from a set of images of + fruit, where each image may either be of an orange, an apple, or a pear. + Each image is one sample and is labelled as one of the 3 possible classes. + Multiclass classification makes the assumption that each sample is assigned + to one and only one label - one sample cannot, for example, be both a pear + and an apple. 
+
+  Valid :term:`multiclass` representations for
+  :func:`~utils.multiclass.type_of_target` (`y`) are:
+
+  - 1d or column vector containing more than two discrete values. An
+    example of a vector ``y`` for 3 samples:
+
+    >>> import numpy as np
+    >>> y = np.array(['apple', 'pear', 'orange'])
+    >>> print(y)
+    ['apple' 'pear' 'orange']
+
+  - sparse :term:`binary` matrix of shape ``(n_samples, n_classes)`` with a
+    single element per row, where each column represents one class. An
+    example of a sparse :term:`binary` matrix ``y`` for 3 samples, where
+    the columns, in order, are orange, apple and pear:
+
+    >>> from scipy import sparse
+    >>> row_ind = np.array([0, 1, 2])
+    >>> col_ind = np.array([1, 2, 1])
+    >>> y_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind)))
+    >>> print(y_sparse)
+      (0, 1)	1.0
+      (1, 2)	1.0
+      (2, 1)	1.0
+
+
+- **Multilabel classification**: classification task labelling each sample with
+  ``x`` labels from ``n_classes`` possible classes, where ``x`` can be 0 to
+  ``n_classes`` inclusive. This can be thought of as predicting properties of a
+  sample that are not mutually exclusive. Formally, a binary output is assigned
+  to each class, for every sample. Positive classes are indicated with 1 and
+  negative classes with 0 or -1. It is thus comparable to running ``n_classes``
+  binary classification tasks, for example with
+  :class:`sklearn.multioutput.MultiOutputClassifier`. This approach treats
+  each label independently, whereas multilabel classifiers *may* treat the
+  multiple classes simultaneously, accounting for correlated behaviour among
+  them.
+
+  For example, prediction of the topics relevant to a text document or video.
+  The document or video may be about one of 'religion', 'politics', 'finance'
+  or 'education', several of the topic classes or all of the topic classes.
+
+  A valid representation of :term:`multilabel` `y` is either a dense or a sparse
+  :term:`binary` matrix of shape ``(n_samples, n_classes)``. Each column
+  represents a class. The ``1``'s in each row denote the positive classes a
+  sample has been labelled with. An example of a dense matrix ``y`` for 3
+  samples:
+
+  >>> y = np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])
+  >>> print(y)
+  [[1 0 0 1]
+   [0 0 1 1]
+   [0 0 0 0]]
+
+  An example of the same ``y`` in sparse matrix form:
+
+  >>> y_sparse = sparse.csr_matrix(y)
+  >>> print(y_sparse)
+    (0, 0)	1
+    (0, 3)	1
+    (1, 2)	1
+    (1, 3)	1
+
+
+- **Multioutput regression**: predicts multiple numerical properties for each
+  sample. Each property is a numerical variable and the number of properties
+  to be predicted for each sample is greater than or equal to 2. Some estimators
+  that support multioutput regression are faster than just running ``n_output``
+  estimators.
+
+  For example, prediction of both wind speed and wind direction, in degrees,
+  using data obtained at a certain location. Each sample would be data
+  obtained at one location and both wind speed and direction would be
+  output for each sample.
+
+  A valid representation of :term:`multioutput` `y` is a dense matrix of shape
+  ``(n_samples, n_outputs)`` of floats. A column-wise concatenation of
+  :term:`continuous` variables. An example of ``y`` for 3 samples:
+
+  >>> y = np.array([[31.4, 94], [40.5, 109], [25.0, 30]])
+  >>> print(y)
+  [[ 31.4  94. ]
+   [ 40.5 109. ]
+   [ 25.   30. ]]
+
+
+- **Multioutput-multiclass classification**
+  (also known as **multitask classification**):
+  classification task which labels each sample with a set of **non-binary**
+  properties. Both the number of properties and the number of
+  classes per property are greater than 2. A single estimator thus
+  handles several joint classification tasks. This is both a generalization of
+  the multi\ *label* classification task, which only considers binary
+  attributes, and a generalization of the multi\ *class* classification
+  task, where only one property is considered.
+
+  For example, classification of the properties "type of fruit" and "colour"
+  for a set of images of fruit. The property "type of fruit" has the possible
+  classes: "apple", "pear" and "orange". The property "colour" has the
+  possible classes: "green", "red", "yellow" and "orange". Each sample is an
+  image of a fruit, a label is output for both properties and each label is
+  one of the possible classes of the corresponding property.
+
+  A valid representation of multioutput-multiclass `y` is a dense matrix of
+  shape ``(n_samples, n_outputs)`` of class labels. A column-wise concatenation
+  of 1d :term:`multiclass` variables. An example of ``y`` for 3 samples:
+
+  >>> y = np.array([['apple', 'green'], ['orange', 'orange'], ['pear', 'green']])
+  >>> print(y)
+  [['apple' 'green']
+   ['orange' 'orange']
+   ['pear' 'green']]
+
+  Note that any classifiers handling multioutput-multiclass (also known as
+  multitask classification) tasks support the multilabel classification task
+  as a special case. Multitask classification is similar to the multioutput
+  classification task with different model formulations. For more information,
+  see the relevant estimator documentation.
+
 All scikit-learn classifiers are capable of multiclass classification,
 but the meta-estimators offered by :mod:`sklearn.multiclass`
 permit changing the way they handle more than two classes
 because this may have an effect on classifier performance
 (either in terms of generalization error or required computational resources).
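
[Editor's note] As a cross-check of the representations described above, and as a lead-in to the summary table that follows, :func:`~utils.multiclass.type_of_target` can be called on the example targets shown in each bullet. This snippet is illustrative and not part of the patch; note that a 1d vector with only two distinct values would be reported as ``'binary'`` rather than ``'multiclass'``, which is why three fruit names are used here:

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import type_of_target
    >>> type_of_target(np.array(['apple', 'pear', 'orange']))
    'multiclass'
    >>> type_of_target(np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]))
    'multilabel-indicator'
    >>> type_of_target(np.array([[31.4, 94], [40.5, 109], [25.0, 30]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([['apple', 'green'], ['orange', 'orange'], ['pear', 'green']]))
    'multiclass-multioutput'
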
+**Summary** + ++-----------------+-------------+-------------+------------------------------------------+ +| | Number of | Target | Valid | +| | targets | cardinality | :func:`~utils.multiclass.type_of_target` | ++=================+=============+=============+==========================================+ +| Multiclass | 1 | >2 | - 'multiclass' | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multilabel | >1 | 2 (0 or 1) | - 'multilabel-indicator' | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multioutput | >1 | Continuous | - 'continuous-multioutput' | +| regression | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multioutput- | >1 | >2 | - 'multiclass-multioutput' | +| multiclass | | | | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ + Below is a summary of the classifiers supported by scikit-learn grouped by strategy; you don't need the meta-estimators in this class if you're using one of these, unless you want custom multiclass behavior: @@ -94,7 +207,7 @@ if you're using one of these, unless you want custom multiclass behavior: - :class:`sklearn.gaussian_process.GaussianProcessClassifier` (setting multi_class = "one_vs_one") -- **Multiclass as One-Vs-All:** +- **Multiclass as One-Vs-The-Rest:** - :class:`sklearn.ensemble.GradientBoostingClassifier` - :class:`sklearn.gaussian_process.GaussianProcessClassifier` (setting multi_class = "one_vs_rest") @@ -167,7 +280,7 @@ This strategy, also known as **one-vs-all**, is implemented in per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only `n_classes` classifiers are needed), one advantage of this approach is its -interpretability. Since each class is represented by one and only one classifier, +interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice. @@ -431,7 +544,7 @@ averaged together. Regressor Chain ================ -Regressor chains (see :class:`RegressorChain`) is analogous to -ClassifierChain as a way of combining a number of regressions -into a single multi-target model that is capable of exploiting +Regressor chains (see :class:`RegressorChain`) is analogous to +ClassifierChain as a way of combining a number of regressions +into a single multi-target model that is capable of exploiting correlations among targets. 
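The summary table added above maps each task type onto the string returned by
:func:`~utils.multiclass.type_of_target`. A minimal doctest-style sketch of that
mapping, with toy arrays chosen purely for illustration (not taken from the patch)::

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import type_of_target
    >>> type_of_target(np.array(['apple', 'pear', 'orange']))    # 1 target, >2 classes
    'multiclass'
    >>> type_of_target(np.array([[1, 0, 0, 1], [0, 0, 1, 1]]))   # binary indicator matrix
    'multilabel-indicator'
    >>> type_of_target(np.array([[31.4, 94.0], [40.5, 109.0]]))  # >1 continuous target
    'continuous-multioutput'
    >>> type_of_target(np.array([[1, 3], [2, 1]]))               # >1 target, >2 classes
    'multiclass-multioutput'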
From 0fa54e5d7f5551c78cc067b91788d4d3e64244b8 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 22:54:40 -0600 Subject: [PATCH 086/448] TST Increases tol for check_pca_float_dtype_preservation assertion (#15775) --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 826a8cc082c3a..d2c5452c10461 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -532,7 +532,7 @@ def check_pca_float_dtype_preservation(svd_solver): assert pca_64.transform(X_64).dtype == np.float64 assert pca_32.transform(X_32).dtype == np.float32 - assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4) + assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4) def check_pca_int_dtype_upcast_to_double(svd_solver): From 9c62eee695cdcd75c6fd23d02334b8d0c241ddde Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Sat, 7 Dec 2019 18:36:40 +0100 Subject: [PATCH 087/448] MNT Deprecate unused 'rotate' parameter in tree.plot_tree. (#15806) * Deprecate unused 'rotate' parameter in tree.plot_tree. * Clarify warning and docstring. Add test. * Fix lint error and adress comment. * Fix python lint error. * Add what's new entry. Conform to skl convention. * Update sklearn/tree/tests/test_export.py Co-Authored-By: Olivier Grisel --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/tree/_export.py | 16 ++++++++++++++-- sklearn/tree/tests/test_export.py | 11 +++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index a1cf4b4dd7d00..07ff9826e6ad9 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -56,3 +56,11 @@ Changelog - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. + +:mod:`sklearn.tree` +................... + +- |Fix| :func:`tree.plot_tree` `rotate` parameter was unused and has been + deprecated. + :pr:`15806` by :user:`Chiara Marmo `. + diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 826f4345298d2..212ae4e309749 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -24,6 +24,7 @@ from ._reingold_tilford import buchheim, Tree from . import DecisionTreeClassifier +import warnings def _color_brew(n): """Generate n colors with equally spaced hues. @@ -78,7 +79,7 @@ def __repr__(self): def plot_tree(decision_tree, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, - proportion=False, rotate=False, rounded=False, + proportion=False, rotate='deprecated', rounded=False, precision=3, ax=None, fontsize=None): """Plot a decision tree. @@ -131,7 +132,12 @@ def plot_tree(decision_tree, max_depth=None, feature_names=None, to be proportions and percentages respectively. rotate : bool, optional (default=False) - When set to ``True``, orient tree left to right rather than top-down. + This parameter has no effect on the matplotlib tree visualisation and + it is kept here for backward compatibility. + + .. deprecated:: 0.23 + ``rotate`` is deprecated in 0.23 and will be removed in 0.25. + rounded : bool, optional (default=False) When set to ``True``, draw node boxes with rounded corners and use @@ -167,6 +173,12 @@ def plot_tree(decision_tree, max_depth=None, feature_names=None, [Text(251.5,345.217,'X[3] <= 0.8... 
""" + + if rotate != 'deprecated': + warnings.warn(("'rotate' has no effect and is deprecated in 0.23. " + "It will be removed in 0.25."), + FutureWarning) + exporter = _MPLTreeExporter( max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 8122b2096dad0..f1c080dea4d2a 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -448,3 +448,14 @@ def test_plot_tree_gini(pyplot): "samples = 6\nvalue = [3, 3]") assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]" assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]" + + +# FIXME: to be removed in 0.25 +def test_plot_tree_rotate_deprecation(pyplot): + tree = DecisionTreeClassifier() + tree.fit(X, y) + # test that a warning is raised when rotate is used. + match = ("'rotate' has no effect and is deprecated in 0.23. " + "It will be removed in 0.25.") + with pytest.warns(FutureWarning, match=match): + plot_tree(tree, rotate=True) From e94b67a4d36bfa68f5a864a6401253846bac7138 Mon Sep 17 00:00:00 2001 From: Maciej J Mikulski Date: Sun, 8 Dec 2019 11:51:31 +0100 Subject: [PATCH 088/448] ENH Allow two-element tuple as n_samples argument in make_circles and make_moons (#15707) --- doc/whats_new/v0.23.rst | 7 ++++ sklearn/datasets/_samples_generator.py | 36 ++++++++++++++----- .../datasets/tests/test_samples_generator.py | 34 ++++++++++++++++++ 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 07ff9826e6ad9..d1fe1ca0f531e 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -51,6 +51,13 @@ Changelog more memory efficient implementation of single linkage clustering. :pr:`11514` by :user:`Leland McInnes `. +:mod:`sklearn.datasets` +....................... + +- |Enhancement| Functions :func:`datasets.make_circles` and + :func:`datasets.make_moons` now accept two-element tuple. + :pr:`15707` by :user:`Maciej J Mikulski ` + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 5d18b46711489..8893aedbdfc5a 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -591,9 +591,12 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, Parameters ---------- - n_samples : int, optional (default=100) - The total number of points generated. If odd, the inner circle will - have one point more than the outer circle. + n_samples : int or two-element tuple, optional (default=100) + If int, it is the total number of points generated. + For odd numbers, the inner circle will have one point more than the + outer circle. + If two-element tuple, number of points in outer circle and inner + circle. shuffle : bool, optional (default=True) Whether to shuffle the samples. 
@@ -621,8 +624,15 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, if factor >= 1 or factor < 0: raise ValueError("'factor' has to be between 0 and 1.") - n_samples_out = n_samples // 2 - n_samples_in = n_samples - n_samples_out + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: + try: + n_samples_out, n_samples_in = n_samples + except ValueError: + raise ValueError('`n_samples` can be either an int or ' + 'a two-element tuple.') generator = check_random_state(random_state) # so as not to have the first point = last point, we set endpoint=False @@ -654,8 +664,9 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): Parameters ---------- - n_samples : int, optional (default=100) - The total number of points generated. + n_samples : int or two-element tuple, optional (default=100) + If int, the total number of points generated. + If two-element tuple, number of points in each of two moons. shuffle : bool, optional (default=True) Whether to shuffle the samples. @@ -677,8 +688,15 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): The integer labels (0 or 1) for class membership of each sample. """ - n_samples_out = n_samples // 2 - n_samples_in = n_samples - n_samples_out + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: + try: + n_samples_out, n_samples_in = n_samples + except ValueError: + raise ValueError('`n_samples` can be either an int or ' + 'a two-element tuple.') generator = check_random_state(random_state) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index e51ca3970bdae..433baca985b87 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -476,6 +476,22 @@ def test_make_moons(): err_msg="Point is not on expected unit circle") +def test_make_moons_unbalanced(): + X, y = make_moons(n_samples=(7, 5)) + assert np.sum(y == 0) == 7 and np.sum(y == 1) == 5, \ + 'Number of samples in a moon is wrong' + assert X.shape == (12, 2), "X shape mismatch" + assert y.shape == (12,), "y shape mismatch" + + with pytest.raises(ValueError, match=r'`n_samples` can be either an int ' + r'or a two-element tuple.'): + make_moons(n_samples=[1, 2, 3]) + + with pytest.raises(ValueError, match=r'`n_samples` can be either an int ' + r'or a two-element tuple.'): + make_moons(n_samples=(10,)) + + def test_make_circles(): factor = 0.3 @@ -490,6 +506,7 @@ def test_make_circles(): for x, label in zip(X, y): dist_sqr = ((x - center) ** 2).sum() dist_exp = 1.0 if label == 0 else factor**2 + dist_exp = 1.0 if label == 0 else factor ** 2 assert_almost_equal(dist_sqr, dist_exp, err_msg="Point is not on expected circle") @@ -502,3 +519,20 @@ def test_make_circles(): make_circles(factor=-0.01) with pytest.raises(ValueError): make_circles(factor=1.) 
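Beyond the unit tests, the two-element tuple form of ``n_samples`` introduced by this
patch can be exercised as in the following sketch (the noise level is arbitrary; the
per-moon counts follow from the code above)::

    >>> from sklearn.datasets import make_moons
    >>> X, y = make_moons(n_samples=(80, 20), noise=0.05, random_state=0)
    >>> X.shape
    (100, 2)
    >>> int((y == 0).sum()), int((y == 1).sum())
    (80, 20)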
+ + +def test_make_circles_unbalanced(): + X, y = make_circles(n_samples=(2, 8)) + + assert np.sum(y == 0) == 2, 'Number of samples in inner circle is wrong' + assert np.sum(y == 1) == 8, 'Number of samples in outer circle is wrong' + assert X.shape == (10, 2), "X shape mismatch" + assert y.shape == (10,), "y shape mismatch" + + with pytest.raises(ValueError, match=r'`n_samples` can be either an int ' + r'or a two-element tuple.'): + make_circles(n_samples=[1, 2, 3]) + + with pytest.raises(ValueError, match=r'`n_samples` can be either an int ' + r'or a two-element tuple.'): + make_circles(n_samples=(10,)) From 9d61fe031724634d412ac8107b0542139e901aca Mon Sep 17 00:00:00 2001 From: JJmistry Date: Sun, 8 Dec 2019 18:23:59 +0000 Subject: [PATCH 089/448] update _alpha_grid class in _coordinate_descent.py (#15835) --- sklearn/linear_model/_coordinate_descent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index efe5612845157..30ccb0c9f702f 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -43,26 +43,26 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, y : ndarray, shape (n_samples,) Target values - Xy : array-like, optional + Xy : array-like, default=None Xy = np.dot(X.T, y) that can be precomputed. - l1_ratio : float + l1_ratio : float, default=1.0 The elastic net mixing parameter, with ``0 < l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not supported) ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. - eps : float, optional + eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3`` - n_alphas : int, optional + n_alphas : int, default=100 Number of alphas along the regularization path - fit_intercept : boolean, default True + fit_intercept : boolean, default=True Whether to fit an intercept or not - normalize : boolean, optional, default False + normalize : boolean, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. @@ -70,7 +70,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True + copy_X : boolean, optional, default=True If ``True``, X will be copied; else, it may be overwritten. """ if l1_ratio == 0: From 29932e690e00fa0c610a8fba2dfb0abc1d66cbac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Mon, 9 Dec 2019 08:45:54 +0100 Subject: [PATCH 090/448] FIX Explicit conversion of ndarray to object dtype. 
(#15832) --- sklearn/neighbors/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1ff45332b1e70..3a1fdadfb94b7 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -276,8 +276,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = np.array(np.split(data, indptr[1:-1])) - neigh_ind = np.array(np.split(indices, indptr[1:-1])) + neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object) + neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object) if return_distance: return neigh_dist, neigh_ind From 1c42e79d420cc03de5e0c3b625753c6084e25a3f Mon Sep 17 00:00:00 2001 From: J-A16 Date: Mon, 9 Dec 2019 03:33:06 -0500 Subject: [PATCH 091/448] FIX pass sample weights to final estimator (#15773) --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/linear_model/_ransac.py | 10 +++++++++- sklearn/linear_model/tests/test_ransac.py | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index d1fe1ca0f531e..b1edda7900b81 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -58,6 +58,14 @@ Changelog :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski ` +:mod:`sklearn.linear_model` +........................... + +- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit + method of :class:`linear_model.RANSACRegressor`, it would not be passed to + the wrapped `base_estimator` during the fitting of the final model. + :pr:`15573` by :user:`Jeremy Alexandre `. + :mod:`sklearn.preprocessing` ............................ 
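In user-facing terms, after this fix a call such as the following sketch (made-up
data, arbitrary weights) forwards the weights not only to the per-subset trial fits
but also to the final refit on the inlier set::

    >>> import numpy as np
    >>> from sklearn.linear_model import RANSACRegressor, LinearRegression
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(200, 2)
    >>> y = X @ np.array([1.0, 2.0]) + rng.randn(200) * 0.1
    >>> sample_weight = rng.uniform(0.5, 2.0, size=200)
    >>> ransac = RANSACRegressor(base_estimator=LinearRegression(), random_state=0)
    >>> ransac = ransac.fit(X, y, sample_weight=sample_weight)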
diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 1350878b54154..40ebb3a08420f 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -328,6 +328,7 @@ def fit(self, X, y, sample_weight=None): inlier_mask_best = None X_inlier_best = None y_inlier_best = None + inlier_best_idxs_subset = None self.n_skips_no_inliers_ = 0 self.n_skips_invalid_data_ = 0 self.n_skips_invalid_model_ = 0 @@ -404,6 +405,7 @@ def fit(self, X, y, sample_weight=None): inlier_mask_best = inlier_mask_subset X_inlier_best = X_inlier_subset y_inlier_best = y_inlier_subset + inlier_best_idxs_subset = inlier_idxs_subset max_trials = min( max_trials, @@ -441,7 +443,13 @@ def fit(self, X, y, sample_weight=None): ConvergenceWarning) # estimate final model using all inliers - base_estimator.fit(X_inlier_best, y_inlier_best) + if sample_weight is None: + base_estimator.fit(X_inlier_best, y_inlier_best) + else: + base_estimator.fit( + X_inlier_best, + y_inlier_best, + sample_weight=sample_weight[inlier_best_idxs_subset]) self.estimator_ = base_estimator self.inlier_mask_ = inlier_mask_best diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 83f688c95692e..ae29fb81cdd5d 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -10,6 +10,8 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_raises +from sklearn.utils._testing import assert_allclose +from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression, RANSACRegressor, Lasso from sklearn.linear_model._ransac import _dynamic_max_trials from sklearn.exceptions import ConvergenceWarning @@ -494,3 +496,21 @@ def test_ransac_fit_sample_weight(): base_estimator = Lasso() ransac_estimator = RANSACRegressor(base_estimator) assert_raises(ValueError, ransac_estimator.fit, X, y, weights) + + +def test_ransac_final_model_fit_sample_weight(): + X, y = make_regression(n_samples=1000, random_state=10) + rng = check_random_state(42) + sample_weight = rng.randint(1, 4, size=y.shape[0]) + sample_weight = sample_weight / sample_weight.sum() + ransac = RANSACRegressor(base_estimator=LinearRegression(), random_state=0) + ransac.fit(X, y, sample_weight=sample_weight) + + final_model = LinearRegression() + mask_samples = ransac.inlier_mask_ + final_model.fit( + X[mask_samples], y[mask_samples], + sample_weight=sample_weight[mask_samples] + ) + + assert_allclose(ransac.estimator_.coef_, final_model.coef_) From 7dd12fb22dc964c49aa6e9538394cbc836410409 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Dec 2019 07:17:43 -0600 Subject: [PATCH 092/448] BLD Parallelize sphinx builds on circle ci (#15745) --- doc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/Makefile b/doc/Makefile index 11c5d58749bec..1cbce7dba9662 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. 
-SPHINXOPTS = +SPHINXOPTS = -j auto SPHINXBUILD ?= sphinx-build PAPER = BUILDDIR = _build From 4256542b10bcc6bd10ddf31dfb356ecc084f2e7d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Dec 2019 09:17:06 -0500 Subject: [PATCH 093/448] MNT Removed deprecated attributes and parameters (#15803) --- sklearn/base.py | 29 +- sklearn/cluster/_hierarchical.py | 7 - sklearn/cluster/tests/test_hierarchical.py | 14 - sklearn/cross_decomposition/tests/test_pls.py | 1 - .../decomposition/tests/test_kernel_pca.py | 6 - sklearn/externals/six.py | 583 ------------------ .../tests/test_from_model.py | 8 - sklearn/linear_model/_huber.py | 2 +- .../tests/test_coordinate_descent.py | 1 - sklearn/linear_model/tests/test_huber.py | 2 - .../tests/test_passive_aggressive.py | 24 - sklearn/linear_model/tests/test_perceptron.py | 4 - sklearn/linear_model/tests/test_ransac.py | 2 - sklearn/linear_model/tests/test_ridge.py | 1 - sklearn/linear_model/tests/test_sgd.py | 5 - sklearn/model_selection/tests/test_search.py | 1 - .../model_selection/tests/test_validation.py | 5 - sklearn/neural_network/tests/test_mlp.py | 1 - sklearn/preprocessing/_data.py | 24 +- sklearn/preprocessing/tests/test_data.py | 14 - sklearn/tests/test_base.py | 23 - sklearn/tests/test_dummy.py | 1 - sklearn/tests/test_multiclass.py | 6 - sklearn/tests/test_multioutput.py | 10 - sklearn/utils/__init__.py | 40 +- sklearn/utils/_testing.py | 39 +- sklearn/utils/tests/test_validation.py | 96 --- sklearn/utils/validation.py | 65 +- 28 files changed, 16 insertions(+), 998 deletions(-) delete mode 100644 sklearn/externals/six.py diff --git a/sklearn/base.py b/sklearn/base.py index 050bb4e2a522b..4732c7ba165a9 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -406,34 +406,17 @@ def score(self, X, y, sample_weight=None): Notes ----- - The R2 score used when calling ``score`` on a regressor will use + The R2 score used when calling ``score`` on a regressor uses ``multioutput='uniform_average'`` from version 0.23 to keep consistent - with :func:`~sklearn.metrics.r2_score`. This will influence the - ``score`` method of all the multioutput regressors (except for - :class:`~sklearn.multioutput.MultiOutputRegressor`). To specify the - default value manually and avoid the warning, please either call - :func:`~sklearn.metrics.r2_score` directly or make a custom scorer with - :func:`~sklearn.metrics.make_scorer` (the built-in scorer ``'r2'`` uses - ``multioutput='uniform_average'``). + with default value of :func:`~sklearn.metrics.r2_score`. + This influences the ``score`` method of all the multioutput + regressors (except for + :class:`~sklearn.multioutput.MultiOutputRegressor`). """ from .metrics import r2_score - from .metrics._regression import _check_reg_targets y_pred = self.predict(X) - # XXX: Remove the check in 0.23 - y_type, _, _, _ = _check_reg_targets(y, y_pred, None) - if y_type == 'continuous-multioutput': - warnings.warn("The default value of multioutput (not exposed in " - "score method) will change from 'variance_weighted' " - "to 'uniform_average' in 0.23 to keep consistent " - "with 'metrics.r2_score'. 
To specify the default " - "value manually and avoid the warning, please " - "either call 'metrics.r2_score' directly or make a " - "custom scorer with 'metrics.make_scorer' (the " - "built-in scorer 'r2' uses " - "multioutput='uniform_average').", FutureWarning) - return r2_score(y, y_pred, sample_weight=sample_weight, - multioutput='variance_weighted') + return r2_score(y, y_pred, sample_weight=sample_weight) class ClusterMixin: diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index f553a9e505eb5..9cb80747fbc20 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -787,13 +787,6 @@ def __init__(self, n_clusters=2, affinity="euclidean", self.linkage = linkage self.affinity = affinity - @deprecated("The ``n_components_`` attribute was deprecated " - "in favor of ``n_connected_components_`` in 0.21 " - "and will be removed in 0.23.") - @property - def n_components_(self): - return self.n_connected_components_ - def fit(self, X, y=None): """Fit the hierarchical clustering from features, or distance matrix. diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 06e2561df5de7..49d102a57e4f3 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -750,17 +750,3 @@ def test_dist_threshold_invalid_parameters(): AgglomerativeClustering(n_clusters=None, distance_threshold=1, compute_full_tree=False).fit(X) - - -def test_n_components_deprecation(): - # Test that a Deprecation warning is thrown when n_components_ - # attribute is accessed - - X = np.array([[1, 2], [1, 4], [1, 0], [4, 2]]) - agc = AgglomerativeClustering().fit(X) - - match = ("``n_components_`` attribute was deprecated " - "in favor of ``n_connected_components_``") - with pytest.warns(FutureWarning, match=match): - n = agc.n_components_ - assert n == agc.n_connected_components_ diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index 13c55fbd135d0..2d788a2cf6271 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -426,7 +426,6 @@ def test_pls_errors(): clf.fit, X, Y) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_pls_scaling(): # sanity check for scale=True n_samples = 1000 diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 39fc16b5ff5fb..a08ae0cb7a43a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -215,8 +215,6 @@ def test_kernel_pca_invalid_kernel(): kpca.fit(X_fit) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. @@ -231,8 +229,6 @@ def test_gridsearch_pipeline(): assert grid_search.best_score_ == 1 -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_gridsearch_pipeline_precomputed(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model using a precomputed kernel. @@ -248,8 +244,6 @@ def test_gridsearch_pipeline_precomputed(): assert grid_search.best_score_ == 1 -# 0.23. 
warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_nested_circles(): # Test the linear separability of the first 2D KPCA transform X, y = make_circles(n_samples=400, factor=.3, noise=.05, diff --git a/sklearn/externals/six.py b/sklearn/externals/six.py deleted file mode 100644 index 26d95f7df9abc..0000000000000 --- a/sklearn/externals/six.py +++ /dev/null @@ -1,583 +0,0 @@ -"""Utilities for writing code that runs on Python 2 and 3""" - -# Copyright (c) 2010-2013 Benjamin Peterson -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import operator -import sys -import types - -import warnings -warnings.warn("The module is deprecated in version 0.21 and will be removed " - "in version 0.23 since we've dropped support for Python 2.7. " - "Please rely on the official version of six " - "(https://pypi.org/project/six/).", FutureWarning) - -__author__ = "Benjamin Peterson " -__version__ = "1.4.1" - - -# Useful for very coarse version differentiation. -PY2 = sys.version_info[0] == 2 -PY3 = sys.version_info[0] == 3 - -if PY3: - string_types = str, - integer_types = int, - class_types = type, - text_type = str - binary_type = bytes - - MAXSIZE = sys.maxsize -else: - string_types = basestring, - integer_types = (int, long) - class_types = (type, types.ClassType) - text_type = unicode - binary_type = str - - if sys.platform.startswith("java"): - # Jython always uses 32 bits. - MAXSIZE = int((1 << 31) - 1) - else: - # It's possible to have sizeof(long) != sizeof(Py_ssize_t). - class X(object): - def __len__(self): - return 1 << 31 - try: - len(X()) - except OverflowError: - # 32-bit - MAXSIZE = int((1 << 31) - 1) - else: - # 64-bit - MAXSIZE = int((1 << 63) - 1) - del X - - -def _add_doc(func, doc): - """Add documentation to a function.""" - func.__doc__ = doc - - -def _import_module(name): - """Import module, returning the module after the last dot.""" - __import__(name) - return sys.modules[name] - - -class _LazyDescr(object): - - def __init__(self, name): - self.name = name - - def __get__(self, obj, tp): - result = self._resolve() - setattr(obj, self.name, result) - # This is a bit ugly, but it avoids running this again. 
- delattr(tp, self.name) - return result - - -class MovedModule(_LazyDescr): - - def __init__(self, name, old, new=None): - super(MovedModule, self).__init__(name) - if PY3: - if new is None: - new = name - self.mod = new - else: - self.mod = old - - def _resolve(self): - return _import_module(self.mod) - - -class MovedAttribute(_LazyDescr): - - def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None): - super(MovedAttribute, self).__init__(name) - if PY3: - if new_mod is None: - new_mod = name - self.mod = new_mod - if new_attr is None: - if old_attr is None: - new_attr = name - else: - new_attr = old_attr - self.attr = new_attr - else: - self.mod = old_mod - if old_attr is None: - old_attr = name - self.attr = old_attr - - def _resolve(self): - module = _import_module(self.mod) - return getattr(module, self.attr) - - - -class _MovedItems(types.ModuleType): - """Lazy loading of moved objects""" - - -_moved_attributes = [ - MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"), - MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"), - MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"), - MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"), - MovedAttribute("map", "itertools", "builtins", "imap", "map"), - MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"), - MovedAttribute("reload_module", "__builtin__", "imp", "reload"), - MovedAttribute("reduce", "__builtin__", "functools"), - MovedAttribute("StringIO", "StringIO", "io"), - MovedAttribute("UserString", "UserString", "collections"), - MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"), - MovedAttribute("zip", "itertools", "builtins", "izip", "zip"), - MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"), - - MovedModule("builtins", "__builtin__"), - MovedModule("configparser", "ConfigParser"), - MovedModule("copyreg", "copy_reg"), - MovedModule("http_cookiejar", "cookielib", "http.cookiejar"), - MovedModule("http_cookies", "Cookie", "http.cookies"), - MovedModule("html_entities", "htmlentitydefs", "html.entities"), - MovedModule("html_parser", "HTMLParser", "html.parser"), - MovedModule("http_client", "httplib", "http.client"), - MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"), - MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"), - MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), - MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"), - MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"), - MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"), - MovedModule("cPickle", "cPickle", "pickle"), - MovedModule("queue", "Queue"), - MovedModule("reprlib", "repr"), - MovedModule("socketserver", "SocketServer"), - MovedModule("tkinter", "Tkinter"), - MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"), - MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"), - MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"), - MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"), - MovedModule("tkinter_tix", "Tix", "tkinter.tix"), - MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"), - MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"), - MovedModule("tkinter_colorchooser", "tkColorChooser", - "tkinter.colorchooser"), - MovedModule("tkinter_commondialog", "tkCommonDialog", - "tkinter.commondialog"), - 
MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"), - MovedModule("tkinter_font", "tkFont", "tkinter.font"), - MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"), - MovedModule("tkinter_tksimpledialog", "tkSimpleDialog", - "tkinter.simpledialog"), - MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"), - MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"), - MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"), - MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"), - MovedModule("winreg", "_winreg"), -] -for attr in _moved_attributes: - setattr(_MovedItems, attr.name, attr) -del attr - -moves = sys.modules[__name__ + ".moves"] = _MovedItems(__name__ + ".moves") - - - -class Module_six_moves_urllib_parse(types.ModuleType): - """Lazy loading of moved objects in six.moves.urllib_parse""" - - -_urllib_parse_moved_attributes = [ - MovedAttribute("ParseResult", "urlparse", "urllib.parse"), - MovedAttribute("parse_qs", "urlparse", "urllib.parse"), - MovedAttribute("parse_qsl", "urlparse", "urllib.parse"), - MovedAttribute("urldefrag", "urlparse", "urllib.parse"), - MovedAttribute("urljoin", "urlparse", "urllib.parse"), - MovedAttribute("urlparse", "urlparse", "urllib.parse"), - MovedAttribute("urlsplit", "urlparse", "urllib.parse"), - MovedAttribute("urlunparse", "urlparse", "urllib.parse"), - MovedAttribute("urlunsplit", "urlparse", "urllib.parse"), - MovedAttribute("quote", "urllib", "urllib.parse"), - MovedAttribute("quote_plus", "urllib", "urllib.parse"), - MovedAttribute("unquote", "urllib", "urllib.parse"), - MovedAttribute("unquote_plus", "urllib", "urllib.parse"), - MovedAttribute("urlencode", "urllib", "urllib.parse"), -] -for attr in _urllib_parse_moved_attributes: - setattr(Module_six_moves_urllib_parse, attr.name, attr) -del attr - -sys.modules[__name__ + ".moves.urllib_parse"] = Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse") -sys.modules[__name__ + ".moves.urllib.parse"] = Module_six_moves_urllib_parse(__name__ + ".moves.urllib.parse") - - -class Module_six_moves_urllib_error(types.ModuleType): - """Lazy loading of moved objects in six.moves.urllib_error""" - - -_urllib_error_moved_attributes = [ - MovedAttribute("URLError", "urllib2", "urllib.error"), - MovedAttribute("HTTPError", "urllib2", "urllib.error"), - MovedAttribute("ContentTooShortError", "urllib", "urllib.error"), -] -for attr in _urllib_error_moved_attributes: - setattr(Module_six_moves_urllib_error, attr.name, attr) -del attr - -sys.modules[__name__ + ".moves.urllib_error"] = Module_six_moves_urllib_error(__name__ + ".moves.urllib_error") -sys.modules[__name__ + ".moves.urllib.error"] = Module_six_moves_urllib_error(__name__ + ".moves.urllib.error") - - -class Module_six_moves_urllib_request(types.ModuleType): - """Lazy loading of moved objects in six.moves.urllib_request""" - - -_urllib_request_moved_attributes = [ - MovedAttribute("urlopen", "urllib2", "urllib.request"), - MovedAttribute("install_opener", "urllib2", "urllib.request"), - MovedAttribute("build_opener", "urllib2", "urllib.request"), - MovedAttribute("pathname2url", "urllib", "urllib.request"), - MovedAttribute("url2pathname", "urllib", "urllib.request"), - MovedAttribute("getproxies", "urllib", "urllib.request"), - MovedAttribute("Request", "urllib2", "urllib.request"), - MovedAttribute("OpenerDirector", "urllib2", "urllib.request"), - MovedAttribute("HTTPDefaultErrorHandler", "urllib2", 
"urllib.request"), - MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"), - MovedAttribute("ProxyHandler", "urllib2", "urllib.request"), - MovedAttribute("BaseHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"), - MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"), - MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"), - MovedAttribute("FileHandler", "urllib2", "urllib.request"), - MovedAttribute("FTPHandler", "urllib2", "urllib.request"), - MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"), - MovedAttribute("UnknownHandler", "urllib2", "urllib.request"), - MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"), - MovedAttribute("urlretrieve", "urllib", "urllib.request"), - MovedAttribute("urlcleanup", "urllib", "urllib.request"), - MovedAttribute("URLopener", "urllib", "urllib.request"), - MovedAttribute("FancyURLopener", "urllib", "urllib.request"), -] -for attr in _urllib_request_moved_attributes: - setattr(Module_six_moves_urllib_request, attr.name, attr) -del attr - -sys.modules[__name__ + ".moves.urllib_request"] = Module_six_moves_urllib_request(__name__ + ".moves.urllib_request") -sys.modules[__name__ + ".moves.urllib.request"] = Module_six_moves_urllib_request(__name__ + ".moves.urllib.request") - - -class Module_six_moves_urllib_response(types.ModuleType): - """Lazy loading of moved objects in six.moves.urllib_response""" - - -_urllib_response_moved_attributes = [ - MovedAttribute("addbase", "urllib", "urllib.response"), - MovedAttribute("addclosehook", "urllib", "urllib.response"), - MovedAttribute("addinfo", "urllib", "urllib.response"), - MovedAttribute("addinfourl", "urllib", "urllib.response"), -] -for attr in _urllib_response_moved_attributes: - setattr(Module_six_moves_urllib_response, attr.name, attr) -del attr - -sys.modules[__name__ + ".moves.urllib_response"] = Module_six_moves_urllib_response(__name__ + ".moves.urllib_response") -sys.modules[__name__ + ".moves.urllib.response"] = Module_six_moves_urllib_response(__name__ + ".moves.urllib.response") - - -class Module_six_moves_urllib_robotparser(types.ModuleType): - """Lazy loading of moved objects in six.moves.urllib_robotparser""" - - -_urllib_robotparser_moved_attributes = [ - MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"), -] -for attr in _urllib_robotparser_moved_attributes: - setattr(Module_six_moves_urllib_robotparser, attr.name, attr) -del attr - -sys.modules[__name__ + ".moves.urllib_robotparser"] = Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib_robotparser") -sys.modules[__name__ + ".moves.urllib.robotparser"] = Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser") - - -class Module_six_moves_urllib(types.ModuleType): - """Create a six.moves.urllib namespace that resembles the Python 3 namespace""" - parse = sys.modules[__name__ + ".moves.urllib_parse"] - error = 
sys.modules[__name__ + ".moves.urllib_error"] - request = sys.modules[__name__ + ".moves.urllib_request"] - response = sys.modules[__name__ + ".moves.urllib_response"] - robotparser = sys.modules[__name__ + ".moves.urllib_robotparser"] - - -sys.modules[__name__ + ".moves.urllib"] = Module_six_moves_urllib(__name__ + ".moves.urllib") - - -def add_move(move): - """Add an item to six.moves.""" - setattr(_MovedItems, move.name, move) - - -def remove_move(name): - """Remove item from six.moves.""" - try: - delattr(_MovedItems, name) - except AttributeError: - try: - del moves.__dict__[name] - except KeyError: - raise AttributeError("no such move, %r" % (name,)) - - -if PY3: - _meth_func = "__func__" - _meth_self = "__self__" - - _func_closure = "__closure__" - _func_code = "__code__" - _func_defaults = "__defaults__" - _func_globals = "__globals__" - - _iterkeys = "keys" - _itervalues = "values" - _iteritems = "items" - _iterlists = "lists" -else: - _meth_func = "im_func" - _meth_self = "im_self" - - _func_closure = "func_closure" - _func_code = "func_code" - _func_defaults = "func_defaults" - _func_globals = "func_globals" - - _iterkeys = "iterkeys" - _itervalues = "itervalues" - _iteritems = "iteritems" - _iterlists = "iterlists" - - -try: - advance_iterator = next -except NameError: - def advance_iterator(it): - return it.next() -next = advance_iterator - - -try: - callable = callable -except NameError: - def callable(obj): - return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) - - -if PY3: - def get_unbound_function(unbound): - return unbound - - create_bound_method = types.MethodType - - Iterator = object -else: - def get_unbound_function(unbound): - return unbound.im_func - - def create_bound_method(func, obj): - return types.MethodType(func, obj, obj.__class__) - - class Iterator(object): - - def next(self): - return type(self).__next__(self) - - callable = callable -_add_doc(get_unbound_function, - """Get the function out of a possibly unbound function""") - - -get_method_function = operator.attrgetter(_meth_func) -get_method_self = operator.attrgetter(_meth_self) -get_function_closure = operator.attrgetter(_func_closure) -get_function_code = operator.attrgetter(_func_code) -get_function_defaults = operator.attrgetter(_func_defaults) -get_function_globals = operator.attrgetter(_func_globals) - - -def iterkeys(d, **kw): - """Return an iterator over the keys of a dictionary.""" - return iter(getattr(d, _iterkeys)(**kw)) - -def itervalues(d, **kw): - """Return an iterator over the values of a dictionary.""" - return iter(getattr(d, _itervalues)(**kw)) - -def iteritems(d, **kw): - """Return an iterator over the (key, value) pairs of a dictionary.""" - return iter(getattr(d, _iteritems)(**kw)) - -def iterlists(d, **kw): - """Return an iterator over the (key, [values]) pairs of a dictionary.""" - return iter(getattr(d, _iterlists)(**kw)) - - -if PY3: - def b(s): - return s.encode("latin-1") - def u(s): - return s - unichr = chr - if sys.version_info[1] <= 1: - def int2byte(i): - return bytes((i,)) - else: - # This is about 2x faster than the implementation above on 3.2+ - int2byte = operator.methodcaller("to_bytes", 1, "big") - byte2int = operator.itemgetter(0) - indexbytes = operator.getitem - iterbytes = iter - import io - StringIO = io.StringIO - BytesIO = io.BytesIO -else: - def b(s): - return s - def u(s): - return unicode(s, "unicode_escape") - unichr = unichr - int2byte = chr - def byte2int(bs): - return ord(bs[0]) - def indexbytes(buf, i): - return ord(buf[i]) - def 
iterbytes(buf): - return (ord(byte) for byte in buf) - import StringIO - StringIO = BytesIO = StringIO.StringIO -_add_doc(b, """Byte literal""") -_add_doc(u, """Text literal""") - - -if PY3: - import builtins - exec_ = getattr(builtins, "exec") - - - def reraise(tp, value, tb=None): - if value.__traceback__ is not tb: - raise value.with_traceback(tb) - raise value - - - print_ = getattr(builtins, "print") - del builtins - -else: - def exec_(_code_, _globs_=None, _locs_=None): - """Execute code in a namespace.""" - if _globs_ is None: - frame = sys._getframe(1) - _globs_ = frame.f_globals - if _locs_ is None: - _locs_ = frame.f_locals - del frame - elif _locs_ is None: - _locs_ = _globs_ - exec("""exec _code_ in _globs_, _locs_""") - - - exec_("""def reraise(tp, value, tb=None): - raise tp, value, tb -""") - - - def print_(*args, **kwargs): - """The new-style print function.""" - fp = kwargs.pop("file", sys.stdout) - if fp is None: - return - def write(data): - if not isinstance(data, basestring): - data = str(data) - fp.write(data) - want_unicode = False - sep = kwargs.pop("sep", None) - if sep is not None: - if isinstance(sep, unicode): - want_unicode = True - elif not isinstance(sep, str): - raise TypeError("sep must be None or a string") - end = kwargs.pop("end", None) - if end is not None: - if isinstance(end, unicode): - want_unicode = True - elif not isinstance(end, str): - raise TypeError("end must be None or a string") - if kwargs: - raise TypeError("invalid keyword arguments to print()") - if not want_unicode: - for arg in args: - if isinstance(arg, unicode): - want_unicode = True - break - if want_unicode: - newline = unicode("\n") - space = unicode(" ") - else: - newline = "\n" - space = " " - if sep is None: - sep = space - if end is None: - end = newline - for i, arg in enumerate(args): - if i: - write(sep) - write(arg) - write(end) - -_add_doc(reraise, """Reraise an exception.""") - - -def with_metaclass(meta, *bases): - """Create a base class with a metaclass.""" - return meta("NewBase", bases, {}) - -def add_metaclass(metaclass): - """Class decorator for creating a class with a metaclass.""" - def wrapper(cls): - orig_vars = cls.__dict__.copy() - orig_vars.pop('__dict__', None) - orig_vars.pop('__weakref__', None) - for slots_var in orig_vars.get('__slots__', ()): - orig_vars.pop(slots_var) - return metaclass(cls.__name__, cls.__bases__, orig_vars) - return wrapper diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 57bd88a30eb0e..89c1777b8c32c 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -37,8 +37,6 @@ def _more_tags(self): rng = np.random.RandomState(0) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_invalid_input(): clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=None, tol=None) @@ -252,8 +250,6 @@ def test_2d_coef(): assert_array_almost_equal(X_new, X[:, feature_mask]) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_partial_fit(): est = PassiveAggressiveClassifier(random_state=0, shuffle=False, max_iter=5, tol=None) @@ -284,8 +280,6 @@ def test_calling_fit_reinitializes(): assert transformer.estimator_.C == 100 -# 0.23. warning about tol not having its correct default value. 
-@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_prefit(): # Test all possible combinations of the prefit parameter. @@ -325,8 +319,6 @@ def test_threshold_string(): assert_array_almost_equal(X_transform, data[:, mask]) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_threshold_without_refitting(): # Test that the threshold can be set without refitting the model. clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 06d182f7fcbdb..152055a62c662 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -205,7 +205,7 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): >>> y[:4] = rng.uniform(10, 20, 4) >>> huber = HuberRegressor().fit(X, y) >>> huber.score(X, y) - -7.284608623514573 + -7.284... >>> huber.predict(X[:1,]) array([806.7200...]) >>> linear = LinearRegression().fit(X, y) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index a739c876fa77f..ab9594c4d0567 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -229,7 +229,6 @@ def test_lasso_path_return_models_vs_new_return_gives_same_coefficients(): decimal=1) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_enet_path(): # We use a large number of samples and of informative features so that # the l1_ratio selected is more toward ridge than lasso diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index 78fa0f3b1cd14..cb70db88d3d41 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -143,8 +143,6 @@ def test_huber_scaling_invariant(): assert_array_equal(n_outliers_mask_3, n_outliers_mask_1) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_huber_and_sgd_same_results(): # Test they should converge to same coefficients for same parameters diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 5da9883cba369..34fe8334211b4 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -67,8 +67,6 @@ def project(self, X): return np.dot(X, self.w) + self.b -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_classifier_accuracy(): for data in (X, X_csr): for fit_intercept in (True, False): @@ -86,8 +84,6 @@ def test_classifier_accuracy(): assert hasattr(clf, 'standard_coef_') -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_classifier_partial_fit(): classes = np.unique(y) for data in (X, X_csr): @@ -105,8 +101,6 @@ def test_classifier_partial_fit(): assert hasattr(clf, 'standard_coef_') -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_classifier_refit(): # Classifier can be retrained on different labels and features. 
clf = PassiveAggressiveClassifier(max_iter=5).fit(X, y) @@ -116,8 +110,6 @@ def test_classifier_refit(): assert_array_equal(clf.classes_, iris.target_names) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') @pytest.mark.parametrize('loss', ("hinge", "squared_hinge")) def test_classifier_correctness(loss): y_bin = y.copy() @@ -140,8 +132,6 @@ def test_classifier_undefined_methods(): assert_raises(AttributeError, lambda x: getattr(clf, x), meth) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_class_weights(): # Test class weights. X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], @@ -164,16 +154,12 @@ def test_class_weights(): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_partial_fit_weight_class_balanced(): # partial_fit with class_weight='balanced' not supported clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100) assert_raises(ValueError, clf.partial_fit, X, y, classes=np.unique(y)) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_equal_class_weight(): X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] y2 = [0, 0, 1, 1] @@ -195,8 +181,6 @@ def test_equal_class_weight(): assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_wrong_class_weight_label(): # ValueError due to wrong class_weight label. X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], @@ -207,8 +191,6 @@ def test_wrong_class_weight_label(): assert_raises(ValueError, clf.fit, X2, y2) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_wrong_class_weight_format(): # ValueError due to wrong class_weight argument type. X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], @@ -222,8 +204,6 @@ def test_wrong_class_weight_format(): assert_raises(ValueError, clf.fit, X2, y2) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_regressor_mse(): y_bin = y.copy() y_bin[y != 1] = -1 @@ -244,8 +224,6 @@ def test_regressor_mse(): assert hasattr(reg, 'standard_coef_') -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_regressor_partial_fit(): y_bin = y.copy() y_bin[y != 1] = -1 @@ -265,8 +243,6 @@ def test_regressor_partial_fit(): assert hasattr(reg, 'standard_coef_') -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') @pytest.mark.parametrize( 'loss', ("epsilon_insensitive", "squared_epsilon_insensitive")) diff --git a/sklearn/linear_model/tests/test_perceptron.py b/sklearn/linear_model/tests/test_perceptron.py index ffbd844b902f2..6cdd538ca9247 100644 --- a/sklearn/linear_model/tests/test_perceptron.py +++ b/sklearn/linear_model/tests/test_perceptron.py @@ -43,8 +43,6 @@ def predict(self, X): return np.sign(self.project(X)) -# 0.23. 
warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_perceptron_accuracy(): for data in (X, X_csr): clf = Perceptron(max_iter=100, tol=None, shuffle=False) @@ -53,8 +51,6 @@ def test_perceptron_accuracy(): assert score > 0.7 -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_perceptron_correctness(): y_bin = y.copy() y_bin[y != 1] = -1 diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index ae29fb81cdd5d..244ef0114b9bf 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -334,7 +334,6 @@ def test_ransac_min_n_samples(): assert_raises(ValueError, ransac_estimator7.fit, X, y) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_ransac_multi_dimensional_targets(): base_estimator = LinearRegression() @@ -355,7 +354,6 @@ def test_ransac_multi_dimensional_targets(): assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_ransac_residual_loss(): loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index c786b154fcb85..4d17c58ee1176 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -720,7 +720,6 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 @pytest.mark.parametrize( 'test_func', (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f462a1fb4a040..1d7c582c51a7d 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -24,11 +24,6 @@ from sklearn.model_selection import RandomizedSearchCV -# 0.23. warning about tol not having its correct default value. -pytestmark = pytest.mark.filterwarnings( - "ignore:max_iter and tol parameters have been") - - def _update_kwargs(kwargs): if "random_state" not in kwargs: kwargs["random_state"] = 42 diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index fc6183f3a1f0b..056927bee75d0 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1358,7 +1358,6 @@ def test_pickle(): random_search_pickled.predict(X)) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_grid_search_with_multioutput_data(): # Test search with multi-output estimator diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index aaf4f497f1585..c72ac0c1b7a14 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1098,8 +1098,6 @@ def test_learning_curve_incremental_learning_unsupervised(): np.linspace(0.1, 1.0, 10)) -# 0.23. warning about tol not having its correct default value. 
-@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_learning_curve_batch_and_incremental_learning_are_equal(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, @@ -1167,8 +1165,6 @@ def test_learning_curve_with_boolean_indices(): np.linspace(0.1, 1.0, 10)) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_learning_curve_with_shuffle(): # Following test case was designed this way to verify the code # changes made in pull request: #7506. @@ -1411,7 +1407,6 @@ def test_cross_val_predict_with_method(): LogisticRegression(solver="liblinear")) -@pytest.mark.filterwarnings('ignore: max_iter and tol parameters') def test_cross_val_predict_method_checking(): # Regression test for issue #9639. Tests that cross_val_predict does not # check estimator methods (e.g. predict_proba) before fitting diff --git a/sklearn/neural_network/tests/test_mlp.py b/sklearn/neural_network/tests/test_mlp.py index 53f69b79edb40..09a01ad69dbdd 100644 --- a/sklearn/neural_network/tests/test_mlp.py +++ b/sklearn/neural_network/tests/test_mlp.py @@ -345,7 +345,6 @@ def test_multilabel_classification(): mlp.fit(X, y).predict(X) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 def test_multioutput_regression(): # Test that multi-output regression works as expected X, y = make_regression(n_samples=200, n_targets=5) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index ef8b9c6db9e3b..19e21c0862cf7 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2543,7 +2543,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000, ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, - copy="warn"): + copy=True): """Transform features using quantiles information. This method transforms the features to follow a uniform or a normal @@ -2601,18 +2601,13 @@ def quantile_transform(X, axis=0, n_quantiles=1000, by np.random. Note that this is used by subsampling and smoothing noise. - copy : boolean, optional, (default="warn") + copy : boolean, optional, (default=True) Set to False to perform inplace transformation and avoid a copy (if the input is already a numpy array). If True, a copy of `X` is transformed, leaving the original `X` unchanged - .. deprecated:: 0.21 - The default value of parameter `copy` will be changed from False - to True in 0.23. The current default of False is being changed to - make it more consistent with the default `copy` values of other - functions in :mod:`sklearn.preprocessing`. Furthermore, the - current default of False may have unexpected side effects by - modifying the value of `X` inplace + ..versionchnanged:: 0.22 + The default value of `copy` changed from False to True in 0.22. Returns ------- @@ -2649,17 +2644,6 @@ def quantile_transform(X, axis=0, n_quantiles=1000, see :ref:`examples/preprocessing/plot_all_scaling.py `. """ - if copy == "warn": - warnings.warn("The default value of `copy` will change from False to " - "True in 0.23 in order to make it more consistent with " - "the default `copy` values of other functions in " - ":mod:`sklearn.preprocessing` and prevent " - "unexpected side effects by modifying the value of `X` " - "inplace. 
To avoid inplace modifications of `X`, it is " - "recommended to explicitly set `copy=True`", - FutureWarning) - copy = False - n = QuantileTransformer(n_quantiles=n_quantiles, output_distribution=output_distribution, subsample=subsample, diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 060719200fa99..a67c101dec499 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1453,7 +1453,6 @@ def test_quantile_transform_sparse_toy(): assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) -@pytest.mark.filterwarnings("ignore: The default value of `copy`") # 0.23 def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], @@ -1533,18 +1532,6 @@ def test_quantile_transform_nan(): assert not np.isnan(transformer.quantiles_[:, 1:]).any() -def test_deprecated_quantile_transform_copy(): - future_message = ("The default value of `copy` will change from False to " - "True in 0.23 in order to make it more consistent with " - "the default `copy` values of other functions in " - ":mod:`sklearn.preprocessing` and prevent " - "unexpected side effects by modifying the value of `X` " - "inplace. To avoid inplace modifications of `X`, it is " - "recommended to explicitly set `copy=True`") - assert_warns_message(FutureWarning, future_message, quantile_transform, - np.array([[0, 1], [0, 0.5], [1, 0]])) - - def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), @@ -2163,7 +2150,6 @@ def test_fit_cold_start(): scaler.fit_transform(X_2d) -@pytest.mark.filterwarnings("ignore: The default value of `copy`") # 0.23 def test_quantile_transform_valid_axis(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 155dbcaaa1f6c..f480fffda1571 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -490,29 +490,6 @@ def test_tag_inheritance(): assert inherit_diamond_tag_est._get_tags()['allow_nan'] -# XXX: Remove in 0.23 -def test_regressormixin_score_multioutput(): - from sklearn.linear_model import LinearRegression - # no warnings when y_type is continuous - X = [[1], [2], [3]] - y = [1, 2, 3] - reg = LinearRegression().fit(X, y) - assert_no_warnings(reg.score, X, y) - # warn when y_type is continuous-multioutput - y = [[1, 2], [2, 3], [3, 4]] - reg = LinearRegression().fit(X, y) - msg = ("The default value of multioutput (not exposed in " - "score method) will change from 'variance_weighted' " - "to 'uniform_average' in 0.23 to keep consistent " - "with 'metrics.r2_score'. 
To specify the default " - "value manually and avoid the warning, please " - "either call 'metrics.r2_score' directly or make a " - "custom scorer with 'metrics.make_scorer' (the " - "built-in scorer 'r2' uses " - "multioutput='uniform_average').") - assert_warns_message(FutureWarning, msg, reg.score, X, y) - - def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): def __init__(self, param=5): diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 55f3abc77b0de..0d4addb48e64d 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -708,7 +708,6 @@ def test_dummy_regressor_return_std(): assert_array_equal(y_pred_list[1], y_std_expected) -@pytest.mark.filterwarnings('ignore: The default value of multioutput') # 0.23 @pytest.mark.parametrize("y,y_test", [ ([1, 1, 1, 2], [1.25] * 4), (np.array([[2, 2], diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index ef0aa888f2ab9..33eb5da939725 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -76,8 +76,6 @@ def test_ovr_fit_predict(): assert np.mean(iris.target == pred) > 0.65 -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_ovr_partial_fit(): # Test if partial_fit is working as intended X, y = shuffle(iris.data, iris.target, random_state=0) @@ -602,8 +600,6 @@ def test_ovo_gridsearch(): assert best_C in Cs -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_ovo_ties(): # Test that ties are broken using the decision function, # not defaulting to the smallest label @@ -629,8 +625,6 @@ def test_ovo_ties(): assert ovo_prediction[0] == normalized_confidences[0].argmax() -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_ovo_ties2(): # test that ties can not only be won by the first two labels X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]]) diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index cd87ad3fc863d..6256f72a4b0b3 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -50,8 +50,6 @@ def test_multi_target_regression(): assert_almost_equal(references, y_pred) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_multi_target_regression_partial_fit(): X, y = datasets.make_regression(n_targets=3) X_train, y_train = X[:50], y[:50] @@ -113,8 +111,6 @@ def test_multi_target_sample_weights_api(): rgr.fit(X, y, w) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_multi_target_sample_weight_partial_fit(): # weighted regressor X = [[1, 2, 3], [4, 5, 6]] @@ -219,8 +215,6 @@ def custom_scorer(estimator, X, y): multi_target_linear.predict_proba(X) -# 0.23. warning about tol not having its correct default value. 
-@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_multi_output_classification_partial_fit(): # test if multi_target initializes correctly with base estimator and fit # assert predictions work as expected for predict @@ -252,8 +246,6 @@ def test_multi_output_classification_partial_fit(): assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i]) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_multi_output_classification_partial_fit_no_first_classes_exception(): sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5) multi_target_linear = MultiOutputClassifier(sgd_linear_clf) @@ -368,8 +360,6 @@ def test_multi_output_classification_sample_weights(): assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test)) -# 0.23. warning about tol not having its correct default value. -@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been') def test_multi_output_classification_partial_fit_sample_weights(): # weighted classifier Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]] diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 93aca01ffab8b..82abff2b12183 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1086,9 +1086,7 @@ def check_pandas_support(caller_name): ) from e -def all_estimators(include_meta_estimators=None, - include_other=None, type_filter=None, - include_dont_test=None): +def all_estimators(type_filter=None): """Get a list of all estimators from sklearn. This function crawls the module and gets all classes that inherit @@ -1098,20 +1096,6 @@ def all_estimators(include_meta_estimators=None, Parameters ---------- - include_meta_estimators : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_meta_estimators`` has been deprecated and has no effect in - 0.21 and will be removed in 0.23. - - include_other : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_other`` has been deprecated and has not effect in 0.21 and - will be removed in 0.23. - type_filter : string, list of string, or None, default=None Which kind of estimators should be returned. If None, no filter is applied and all estimators are returned. Possible values are @@ -1119,13 +1103,6 @@ def all_estimators(include_meta_estimators=None, estimators only of these specific types, or a list of these to get the estimators that fit at least one of the types. - include_dont_test : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_dont_test`` has been deprecated and has no effect in 0.21 - and will be removed in 0.23. 
- Returns ------- estimators : list of tuples @@ -1144,21 +1121,6 @@ def is_abstract(c): return False return True - if include_other is not None: - warnings.warn("include_other was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - DeprecationWarning) - - if include_dont_test is not None: - warnings.warn("include_dont_test was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - DeprecationWarning) - - if include_meta_estimators is not None: - warnings.warn("include_meta_estimators was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - DeprecationWarning) - all_classes = [] modules_to_ignore = {"tests", "externals", "setup", "conftest"} root = str(Path(__file__).parent.parent) # sklearn package diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 806f302b78288..4e4e6043eae3d 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -438,9 +438,7 @@ def assert_allclose_dense_sparse(x, y, rtol=1e-07, atol=1e-9, err_msg=''): # TODO: Remove in 0.24. This class is now in utils.__init__. -def all_estimators(include_meta_estimators=None, - include_other=None, type_filter=None, - include_dont_test=None): +def all_estimators(type_filter=None): """Get a list of all estimators from sklearn. This function crawls the module and gets all classes that inherit @@ -450,19 +448,6 @@ def all_estimators(include_meta_estimators=None, Parameters ---------- - include_meta_estimators : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_meta_estimators`` has been deprecated and has no effect in - 0.21 and will be removed in 0.23. - - include_other : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_other`` has been deprecated and has not effect in 0.21 and - will be removed in 0.23. type_filter : string, list of string, or None, default=None Which kind of estimators should be returned. If None, no filter is @@ -471,13 +456,6 @@ def all_estimators(include_meta_estimators=None, estimators only of these specific types, or a list of these to get the estimators that fit at least one of the types. - include_dont_test : boolean, default=False - Deprecated, ignored. - - .. deprecated:: 0.21 - ``include_dont_test`` has been deprecated and has no effect in 0.21 - and will be removed in 0.23. 
- Returns ------- estimators : list of tuples @@ -491,21 +469,6 @@ def is_abstract(c): return False return True - if include_other is not None: - warnings.warn("include_other was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - FutureWarning) - - if include_dont_test is not None: - warnings.warn("include_dont_test was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - FutureWarning) - - if include_meta_estimators is not None: - warnings.warn("include_meta_estimators was deprecated in version 0.21," - " has no effect and will be removed in 0.23", - FutureWarning) - all_classes = [] # get parent folder path = sklearn.__path__ diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index bdd31f9c4859f..f121f11658051 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -412,55 +412,18 @@ def test_check_array_dtype_stability(): def test_check_array_dtype_warning(): X_int_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - X_float64 = np.asarray(X_int_list, dtype=np.float64) X_float32 = np.asarray(X_int_list, dtype=np.float32) X_int64 = np.asarray(X_int_list, dtype=np.int64) - X_csr_float64 = sp.csr_matrix(X_float64) X_csr_float32 = sp.csr_matrix(X_float32) X_csc_float32 = sp.csc_matrix(X_float32) X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32) - y = [0, 0, 1] integer_data = [X_int64, X_csc_int32] - float64_data = [X_float64, X_csr_float64] float32_data = [X_float32, X_csr_float32, X_csc_float32] for X in integer_data: X_checked = assert_no_warnings(check_array, X, dtype=np.float64, accept_sparse=True) assert X_checked.dtype == np.float64 - X_checked = assert_warns(DataConversionWarning, check_array, X, - dtype=np.float64, - accept_sparse=True, warn_on_dtype=True) - assert X_checked.dtype == np.float64 - - # Check that the warning message includes the name of the Estimator - X_checked = assert_warns_message(DataConversionWarning, - 'SomeEstimator', - check_array, X, - dtype=[np.float64, np.float32], - accept_sparse=True, - warn_on_dtype=True, - estimator='SomeEstimator') - assert X_checked.dtype == np.float64 - - X_checked, y_checked = assert_warns_message( - DataConversionWarning, 'KNeighborsClassifier', - check_X_y, X, y, dtype=np.float64, accept_sparse=True, - warn_on_dtype=True, estimator=KNeighborsClassifier()) - - assert X_checked.dtype == np.float64 - - for X in float64_data: - with pytest.warns(None) as record: - warnings.simplefilter("ignore", FutureWarning) # 0.23 - X_checked = check_array(X, dtype=np.float64, - accept_sparse=True, warn_on_dtype=True) - assert X_checked.dtype == np.float64 - X_checked = check_array(X, dtype=np.float64, - accept_sparse=True, warn_on_dtype=False) - assert X_checked.dtype == np.float64 - assert len(record) == 0 - for X in float32_data: X_checked = assert_no_warnings(check_array, X, dtype=[np.float64, np.float32], @@ -484,17 +447,6 @@ def test_check_array_dtype_warning(): assert X_checked.format == 'csr' -def test_check_array_warn_on_dtype_deprecation(): - X = np.asarray([[0.0], [1.0]]) - Y = np.asarray([[2.0], [3.0]]) - with pytest.warns(FutureWarning, - match="'warn_on_dtype' is deprecated"): - check_array(X, warn_on_dtype=True) - with pytest.warns(FutureWarning, - match="'warn_on_dtype' is deprecated"): - check_X_y(X, Y, warn_on_dtype=True) - - def test_check_array_accept_sparse_type_exception(): X = [[1, 2], [3, 4]] X_csr = sp.csr_matrix(X) @@ -725,16 +677,6 @@ def test_check_is_fitted(): assert check_is_fitted(ard) 
is None assert check_is_fitted(svr) is None - # to be removed in 0.23 - assert_warns_message( - FutureWarning, - "Passing attributes to check_is_fitted is deprecated", - check_is_fitted, ard, ['coef_']) - assert_warns_message( - FutureWarning, - "Passing all_or_any to check_is_fitted is deprecated", - check_is_fitted, ard, all_or_any=any) - def test_check_consistent_length(): check_consistent_length([1], [2], [3], [4], [5]) @@ -788,44 +730,6 @@ def test_check_array_series(): assert_array_equal(res, np.array(['a', 'b', 'c'], dtype=object)) -def test_check_dataframe_warns_on_dtype(): - # Check that warn_on_dtype also works for DataFrames. - # https://github.com/scikit-learn/scikit-learn/issues/10948 - pd = importorskip("pandas") - - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], dtype=object) - assert_warns_message(DataConversionWarning, - "Data with input dtype object were all converted to " - "float64.", - check_array, df, dtype=np.float64, warn_on_dtype=True) - assert_warns(DataConversionWarning, check_array, df, - dtype='numeric', warn_on_dtype=True) - with pytest.warns(None) as record: - warnings.simplefilter("ignore", FutureWarning) # 0.23 - check_array(df, dtype='object', warn_on_dtype=True) - assert len(record) == 0 - - # Also check that it raises a warning for mixed dtypes in a DataFrame. - df_mixed = pd.DataFrame([['1', 2, 3], ['4', 5, 6]]) - assert_warns(DataConversionWarning, check_array, df_mixed, - dtype=np.float64, warn_on_dtype=True) - assert_warns(DataConversionWarning, check_array, df_mixed, - dtype='numeric', warn_on_dtype=True) - assert_warns(DataConversionWarning, check_array, df_mixed, - dtype=object, warn_on_dtype=True) - - # Even with numerical dtypes, a conversion can be made because dtypes are - # uniformized throughout the array. - df_mixed_numeric = pd.DataFrame([[1., 2, 3], [4., 5, 6]]) - assert_warns(DataConversionWarning, check_array, df_mixed_numeric, - dtype='numeric', warn_on_dtype=True) - with pytest.warns(None) as record: - warnings.simplefilter("ignore", FutureWarning) # 0.23 - check_array(df_mixed_numeric.astype(int), - dtype='numeric', warn_on_dtype=True) - assert len(record) == 0 - - def test_check_dataframe_mixed_float_dtypes(): # pandas dataframe will coerce a boolean into a object, this is a mismatch # with np.result_type which will return a float diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index fb34f3b3cccbd..5502fdd534965 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -339,7 +339,7 @@ def _ensure_no_complex_data(array): def check_array(array, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, - ensure_min_features=1, warn_on_dtype=None, estimator=None): + ensure_min_features=1, estimator=None): """Input validation on an array, list, sparse matrix or similar. @@ -414,14 +414,6 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 disables this check. - warn_on_dtype : boolean or None, optional (default=None) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - .. deprecated:: 0.21 - ``warn_on_dtype`` is deprecated in version 0.21 and will be - removed in 0.23. - estimator : str or estimator instance (default=None) If passed, include the name of the estimator in warning messages. 
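
A minimal sketch of calling the simplified ``check_array`` (illustrative only; toy
array, and it assumes a build where ``warn_on_dtype`` has been removed as in this
patch)::

    import numpy as np
    from sklearn.utils import check_array

    X_int = np.array([[1, 2], [3, 4]], dtype=np.int64)
    # the requested dtype conversion still happens, just without the
    # deprecated DataConversionWarning plumbing
    X_checked = check_array(X_int, dtype=np.float64)
    print(X_checked.dtype)  # float64
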
@@ -430,14 +422,6 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, array_converted : object The converted and validated array. """ - # warn_on_dtype deprecation - if warn_on_dtype is not None: - warnings.warn( - "'warn_on_dtype' is deprecated in version 0.21 and will be " - "removed in 0.23. Don't set `warn_on_dtype` to remove this " - "warning.", - FutureWarning, stacklevel=2) - # store reference to original array to check if copy is needed when # function returns array_orig = array @@ -582,24 +566,9 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, % (n_features, array.shape, ensure_min_features, context)) - if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig: - msg = ("Data with input dtype %s was converted to %s%s." - % (dtype_orig, array.dtype, context)) - warnings.warn(msg, DataConversionWarning, stacklevel=2) - if copy and np.may_share_memory(array, array_orig): array = np.array(array, dtype=dtype, order=order) - if (warn_on_dtype and dtypes_orig is not None and - {array.dtype} != set(dtypes_orig)): - # if there was at the beginning some other types than the final one - # (for instance in a DataFrame that can contain several dtypes) then - # some data must have been converted - msg = ("Data with input dtype %s were all converted to %s%s." - % (', '.join(map(str, sorted(set(dtypes_orig)))), array.dtype, - context)) - warnings.warn(msg, DataConversionWarning, stacklevel=3) - return array @@ -626,7 +595,7 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, - warn_on_dtype=None, estimator=None): + estimator=None): """Input validation for standard estimators. Checks X and y for consistent length, enforces X to be 2D and y 1D. By @@ -711,14 +680,6 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, it is converted to float64. Should only be used for regression algorithms. - warn_on_dtype : boolean or None, optional (default=None) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - .. deprecated:: 0.21 - ``warn_on_dtype`` is deprecated in version 0.21 and will be - removed in 0.23. - estimator : str or estimator instance (default=None) If passed, include the name of the estimator in warning messages. @@ -740,7 +701,6 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, ensure_2d=ensure_2d, allow_nd=allow_nd, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, - warn_on_dtype=warn_on_dtype, estimator=estimator) if multi_output: y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, @@ -890,8 +850,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes='deprecated', msg=None, - all_or_any='deprecated'): +def check_is_fitted(estimator, msg=None): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -903,11 +862,6 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None, estimator : estimator instance. estimator instance for which the check is performed. - attributes : deprecated, ignored - .. deprecated:: 0.22 - `attributes` is deprecated, is currently ignored and will be removed - in 0.23. 
- msg : string The default error message is, "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this @@ -918,11 +872,6 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None, Eg. : "Estimator, %(name)s, must be fitted before sparsifying". - all_or_any : deprecated, ignored - .. deprecated:: 0.21 - `all_or_any` is deprecated, is currently ignored and will be removed - in 0.23. - Returns ------- None @@ -932,14 +881,6 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None, NotFittedError If the attributes are not found. """ - if attributes != 'deprecated': - warnings.warn("Passing attributes to check_is_fitted is deprecated" - " and will be removed in 0.23. The attributes " - "argument is ignored.", FutureWarning) - if all_or_any != 'deprecated': - warnings.warn("Passing all_or_any to check_is_fitted is deprecated" - " and will be removed in 0.23. The any_or_all " - "argument is ignored.", FutureWarning) if isclass(estimator): raise TypeError("{} is a class, not an instance.".format(estimator)) if msg is None: From 54da2f00fcec7458974dfb3d48f8881c715850f8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 9 Dec 2019 15:21:34 +0100 Subject: [PATCH 094/448] ENH do not allocate local arrays in Ridge*CV of store_cv_values is False (#15652) --- doc/whats_new/v0.23.rst | 10 +++- sklearn/linear_model/_ridge.py | 63 ++++++++++++++---------- sklearn/linear_model/tests/test_ridge.py | 21 ++++++-- 3 files changed, 63 insertions(+), 31 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b1edda7900b81..3de4c40e109c7 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -56,7 +56,7 @@ Changelog - |Enhancement| Functions :func:`datasets.make_circles` and :func:`datasets.make_moons` now accept two-element tuple. - :pr:`15707` by :user:`Maciej J Mikulski ` + :pr:`15707` by :user:`Maciej J Mikulski `. :mod:`sklearn.linear_model` ........................... @@ -66,6 +66,13 @@ Changelog the wrapped `base_estimator` during the fitting of the final model. :pr:`15573` by :user:`Jeremy Alexandre `. +- |Efficiency| :class:`linear_model.RidgeCV` and + :class:`linear_model.RidgeClassifierCV` now does not allocate a + potentially large array to store dual coefficients for all hyperparameters + during its `fit`, nor an array to store all error or LOO predictions unless + `store_cv_values` is `True`. + :pr:`15652` by :user:`Jérôme Dockès `. + :mod:`sklearn.preprocessing` ............................ @@ -78,4 +85,3 @@ Changelog - |Fix| :func:`tree.plot_tree` `rotate` parameter was unused and has been deprecated. :pr:`15806` by :user:`Chiara Marmo `. - diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9e1dd7f22085d..1c0407066048c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1054,6 +1054,16 @@ def _matmat(self, v): return res +class _IdentityEstimator: + """Hack to call a scorer when we already have the predictions.""" + + def decision_function(self, y_predict): + return y_predict + + def predict(self, y_predict): + return y_predict + + class _RidgeGCV(LinearModel): """Ridge regression with built-in Generalized Cross-Validation @@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel): looe = y - loov = c / diag(G^-1) + The best score (negative mean squared error or user-provided scoring) is + stored in the `best_score_` attribute, and the selected hyperparameter in + `alpha_`. 
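
As a rough usage sketch of what this selection means for the public ``RidgeCV``
estimator (synthetic data; illustrative only, not part of the patched docstring)::

    import numpy as np
    from sklearn.linear_model import RidgeCV

    rng = np.random.RandomState(0)
    X = rng.randn(60, 5)
    y = X @ rng.randn(5) + 0.1 * rng.randn(60)

    # leave-one-out GCV over the alpha grid; per-alpha arrays are only kept
    # when store_cv_values=True (the memory saving described in the changelog)
    reg = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X, y)
    print(reg.alpha_)  # hyperparameter selected by generalized cross-validation
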
+ References ---------- http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf @@ -1462,43 +1476,40 @@ def fit(self, X, y, sample_weight=None): else: sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) + X_mean, *decomposition = decompose(X, y, sqrt_sw) + scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas)), - dtype=X.dtype) - C = [] - X_mean, *decomposition = decompose(X, y, sqrt_sw) + + if self.store_cv_values: + self.cv_values_ = np.empty( + (n_samples * n_y, len(self.alphas)), dtype=X.dtype) + + best_coef, best_score, best_alpha = None, None, None + for i, alpha in enumerate(self.alphas): G_inverse_diag, c = solve( float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: squared_errors = (c / G_inverse_diag) ** 2 - cv_values[:, i] = squared_errors.ravel() + alpha_score = -squared_errors.mean() + if self.store_cv_values: + self.cv_values_[:, i] = squared_errors.ravel() else: predictions = y - (c / G_inverse_diag) - cv_values[:, i] = predictions.ravel() - C.append(c) + alpha_score = scorer( + _IdentityEstimator(), predictions.ravel(), y.ravel()) + if self.store_cv_values: + self.cv_values_[:, i] = predictions.ravel() - if error: - best = cv_values.mean(axis=0).argmin() - else: - # The scorer want an object that will make the predictions but - # they are already computed efficiently by _RidgeGCV. This - # identity_estimator will just return them - def identity_estimator(): - pass - identity_estimator.decision_function = lambda y_predict: y_predict - identity_estimator.predict = lambda y_predict: y_predict - - # signature of scorer is (estimator, X, y) - out = [scorer(identity_estimator, cv_values[:, i], y.ravel()) - for i in range(len(self.alphas))] - best = np.argmax(out) - - self.alpha_ = self.alphas[best] - self.dual_coef_ = C[best] + if (best_score is None) or (alpha_score > best_score): + best_coef, best_score, best_alpha = c, alpha_score, alpha + + self.alpha_ = best_alpha + self.best_score_ = best_score + self.dual_coef_ = best_coef self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) X_offset += X_mean * X_scale @@ -1509,7 +1520,7 @@ def identity_estimator(): cv_values_shape = n_samples, len(self.alphas) else: cv_values_shape = n_samples, n_y, len(self.alphas) - self.cv_values_ = cv_values.reshape(cv_values_shape) + self.cv_values_ = self.cv_values_.reshape(cv_values_shape) return self diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 4d17c58ee1176..ab45a093500df 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -34,6 +34,7 @@ from sklearn.linear_model._ridge import _check_gcv_mode from sklearn.linear_model._ridge import _X_CenterStackOp from sklearn.datasets import make_regression +from sklearn.datasets import make_classification from sklearn.model_selection import GridSearchCV from sklearn.model_selection import KFold, GroupKFold, cross_val_predict @@ -661,6 +662,19 @@ def _test_ridge_cv(filter_): assert type(ridge_cv.intercept_) == np.float64 +@pytest.mark.parametrize( + "ridge, make_dataset", + [(RidgeCV(), make_regression), + (RidgeClassifierCV(), make_classification)] +) +def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset): + # Check that `cv_values_` is not stored when store_cv_values is False + X, y = make_dataset(n_samples=6, random_state=42) + ridge.set_params(store_cv_values=False) + ridge.fit(X, y) + assert not 
hasattr(ridge, "cv_values_") + + def _test_ridge_diabetes(filter_): ridge = Ridge(fit_intercept=False) ridge.fit(filter_(X_diabetes), y_diabetes) @@ -817,7 +831,8 @@ def test_class_weights_cv(): assert_array_equal(reg.predict([[-.2, 2]]), np.array([-1])) -def test_ridgecv_store_cv_values(): +@pytest.mark.parametrize("scoring", [None, 'neg_mean_squared_error']) +def test_ridgecv_store_cv_values(scoring): rng = np.random.RandomState(42) n_samples = 8 @@ -826,7 +841,7 @@ def test_ridgecv_store_cv_values(): alphas = [1e-1, 1e0, 1e1] n_alphas = len(alphas) - r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True) + r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True, scoring=scoring) # with len(y.shape) == 1 y = rng.randn(n_samples) @@ -839,7 +854,7 @@ def test_ridgecv_store_cv_values(): r.fit(x, y) assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) - r = RidgeCV(cv=3, store_cv_values=True) + r = RidgeCV(cv=3, store_cv_values=True, scoring=scoring) assert_raises_regex(ValueError, 'cv!=None and store_cv_values', r.fit, x, y) From 677f2c6208cae031e8bd771005d745fe9b8f35db Mon Sep 17 00:00:00 2001 From: SylvainLan Date: Tue, 10 Dec 2019 13:58:42 +0100 Subject: [PATCH 095/448] DOC correct url for preprocessing (#15853) --- sklearn/utils/optimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 3534d85f1edef..fa682e8c2d97d 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -239,7 +239,7 @@ def _check_optimize_result(solver, result, max_iter=None, "Increase the number of iterations (max_iter) " "or scale the data as shown in:\n" " https://scikit-learn.org/stable/modules/" - "preprocessing.html." + "preprocessing.html" ).format(solver, result.status, result.message.decode("latin1")) if extra_warning_msg is not None: warning_msg += "\n" + extra_warning_msg From 7ee0ae861b4e75b9887f7af1d73860c17f79cde5 Mon Sep 17 00:00:00 2001 From: "Santiago M. Mola" Date: Tue, 10 Dec 2019 14:01:30 +0100 Subject: [PATCH 096/448] ENH CountVectorizer: sort features after pruning by frequency (#15834) --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/feature_extraction/text.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 3de4c40e109c7..a926d86896e7d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -58,6 +58,14 @@ Changelog :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski `. +:mod:`sklearn.feature_extraction` +................................. + +- |Efficiency| :class:`feature_extraction.text.CountVectorizer` now sorts + features after pruning them by document frequency. This improves performances + for datasets with large vocabularies combined with ``min_df`` or ``max_df``. + :pr:`15834` by :user:`Santiago M. Mola `. + :mod:`sklearn.linear_model` ........................... 
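
A small sketch of the ``CountVectorizer`` change recorded in this changelog
(made-up corpus; the visible result is unchanged, only the internal order of
pruning and sorting differs)::

    from sklearn.feature_extraction.text import CountVectorizer

    corpus = ["the cat sat", "the dog sat", "the cat ran"]
    vec = CountVectorizer(min_df=2)  # prune terms seen in fewer than 2 documents
    vec.fit_transform(corpus)
    print(vec.get_feature_names())  # ['cat', 'sat', 'the'] -- still sorted
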
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index afc8ee4118cdc..2cd86a29cf4fa 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1223,8 +1223,6 @@ def fit_transform(self, raw_documents, y=None): X.data.fill(1) if not self.fixed_vocabulary_: - X = self._sort_features(X, vocabulary) - n_doc = X.shape[0] max_doc_count = (max_df if isinstance(max_df, numbers.Integral) @@ -1240,6 +1238,8 @@ def fit_transform(self, raw_documents, y=None): min_doc_count, max_features) + X = self._sort_features(X, vocabulary) + self.vocabulary_ = vocabulary return X From 5aa28afc46ff5ac4ef62d21e1fdc5001f038b09d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 08:03:36 -0500 Subject: [PATCH 097/448] MNT avoid generating too many cross links in examples (#15844) --- doc/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 7959a0862f547..0386f7676e0be 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -304,7 +304,9 @@ def __call__(self, directory): 'branch': binder_branch, 'dependencies': './binder/requirements.txt', 'use_jupyter_lab': True - } + }, + # avoid generating too many cross links + 'inspect_global_variables': False, } From 49b03befdad891ffbc581c20af14571a39242e27 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 10 Dec 2019 21:07:22 +0800 Subject: [PATCH 098/448] DOC Correct wrong doc in precision_recall_fscore_support (#15833) --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 75ce29428cbe9..c7101f7a38eeb 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1352,7 +1352,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, fbeta_score : float (if average is not None) or array of float, shape =\ [n_unique_labels] - support : int (if average is not None) or array of int, shape =\ + support : None (if average is not None) or array of int, shape =\ [n_unique_labels] The number of occurrences of each label in ``y_true``. 
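
To make the corrected ``support`` description concrete, a brief sketch with toy
labels (illustrative only)::

    from sklearn.metrics import precision_recall_fscore_support

    y_true = [0, 1, 1, 0, 1, 2]
    y_pred = [0, 0, 1, 0, 1, 2]

    # average=None: support is an array with the count of each label in y_true
    *_, support = precision_recall_fscore_support(y_true, y_pred, average=None)
    print(support)  # [2 3 1]

    # with any averaging, support is None, as the fixed docstring states
    *_, support = precision_recall_fscore_support(y_true, y_pred, average='macro')
    print(support)  # None
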
From 62b1e266ef08acf5d3cf9267f33beb48e3820703 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 10 Dec 2019 05:14:00 -0800 Subject: [PATCH 099/448] DOC add comment in check_pca_float_dtype_preservation (#15819) Documenting the changes in https://github.com/scikit-learn/scikit-learn/pull/15775 --- sklearn/decomposition/tests/test_pca.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index d2c5452c10461..65624215b1158 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -532,6 +532,9 @@ def check_pca_float_dtype_preservation(svd_solver): assert pca_64.transform(X_64).dtype == np.float64 assert pca_32.transform(X_32).dtype == np.float32 + # the rtol is set such that the test passes on all platforms tested on + # conda-forge: PR#15775 + # see: https://github.com/conda-forge/scikit-learn-feedstock/pull/113 assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4) From 64750e561b5e05a8969f9673f00798715bbbbbc0 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 10 Dec 2019 21:46:57 +0800 Subject: [PATCH 100/448] DOC correct indents in docstring _split.py (#15843) --- sklearn/model_selection/_split.py | 66 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index ff3a3ba5bf365..9a85b4049a3c3 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -135,10 +135,10 @@ class LeaveOneOut(BaseCrossValidator): >>> print(loo) LeaveOneOut() >>> for train_index, test_index in loo.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [1] TEST: [0] [[3 4]] [[1 2]] [2] [1] TRAIN: [0] TEST: [1] @@ -222,9 +222,9 @@ class LeavePOut(BaseCrossValidator): >>> print(lpo) LeavePOut(p=2) >>> for train_index, test_index in lpo.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [2 3] TEST: [0 1] TRAIN: [1 3] TEST: [0 2] TRAIN: [1 2] TEST: [0 3] @@ -398,9 +398,9 @@ class KFold(_BaseKFold): >>> print(kf) KFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in kf.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [2 3] TEST: [0 1] TRAIN: [0 1] TEST: [2 3] @@ -604,9 +604,9 @@ class StratifiedKFold(_BaseKFold): >>> print(skf) StratifiedKFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in skf.split(X, y): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... 
X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [1 3] TEST: [0 2] TRAIN: [0 2] TEST: [1 3] @@ -769,9 +769,9 @@ class TimeSeriesSplit(_BaseKFold): >>> print(tscv) TimeSeriesSplit(max_train_size=None, n_splits=5) >>> for train_index, test_index in tscv.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [0] TEST: [1] TRAIN: [0 1] TEST: [2] TRAIN: [0 1 2] TEST: [3] @@ -861,10 +861,10 @@ class LeaveOneGroupOut(BaseCrossValidator): >>> print(logo) LeaveOneGroupOut() >>> for train_index, test_index in logo.split(X, y, groups): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [2 3] TEST: [0 1] [[5 6] [7 8]] [[1 2] @@ -980,10 +980,10 @@ class LeavePGroupsOut(BaseCrossValidator): >>> print(lpgo) LeavePGroupsOut(n_groups=2) >>> for train_index, test_index in lpgo.split(X, y, groups): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [2] TEST: [0 1] [[5 6]] [[1 2] [3 4]] [1] [1 2] @@ -1405,7 +1405,7 @@ class ShuffleSplit(BaseShuffleSplit): >>> print(rs) ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) >>> for train_index, test_index in rs.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) + ... print("TRAIN:", train_index, "TEST:", test_index) TRAIN: [1 3 0 4] TEST: [5 2] TRAIN: [4 0 2 5] TEST: [1 3] TRAIN: [1 2 4 0] TEST: [3 5] @@ -1414,7 +1414,7 @@ class ShuffleSplit(BaseShuffleSplit): >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, ... random_state=0) >>> for train_index, test_index in rs.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) + ... print("TRAIN:", train_index, "TEST:", test_index) TRAIN: [1 3 0] TEST: [5 2] TRAIN: [4 0 2] TEST: [1 3] TRAIN: [1 2 4] TEST: [3 5] @@ -1508,7 +1508,7 @@ class GroupShuffleSplit(ShuffleSplit): >>> gss.get_n_splits() 2 >>> for train_idx, test_idx in gss.split(X, y, groups): - ... print("TRAIN:", train_idx, "TEST:", test_idx) + ... print("TRAIN:", train_idx, "TEST:", test_idx) TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' @@ -1620,9 +1620,9 @@ class StratifiedShuffleSplit(BaseShuffleSplit): >>> print(sss) StratifiedShuffleSplit(n_splits=5, random_state=0, ...) >>> for train_index, test_index in sss.split(X, y): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... 
X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [5 2 3] TEST: [4 1 0] TRAIN: [5 1 4] TEST: [0 2 3] TRAIN: [5 0 2] TEST: [4 3 1] @@ -1837,9 +1837,9 @@ class PredefinedSplit(BaseCrossValidator): >>> print(ps) PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) >>> for train_index, test_index in ps.split(): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [1 2 3] TEST: [0] TRAIN: [0 2] TEST: [1 3] """ From d5c6c966c4786dfef00831e0a6864a9238e4b903 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 08:56:17 -0700 Subject: [PATCH 101/448] DOC fix docstring of KMeans based on sklearn guideline (#15754) --- sklearn/cluster/_k_means.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py index 52f2b5fee4dac..f470d61423b2c 100644 --- a/sklearn/cluster/_k_means.py +++ b/sklearn/cluster/_k_means.py @@ -654,11 +654,12 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Parameters ---------- - n_clusters : int, optional, default: 8 + n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. - init : {'k-means++', 'random' or an ndarray} + init : {'k-means++', 'random'} or ndarray of shape \ + (n_clusters, n_features), default='k-means++' Method for initialization, defaults to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean @@ -671,19 +672,19 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - n_init : int, default: 10 + n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. - max_iter : int, default: 300 + max_iter : int, default=300 Maximum number of iterations of the k-means algorithm for a single run. - tol : float, default: 1e-4 + tol : float, default=1e-4 Relative tolerance with regards to inertia to declare convergence. - precompute_distances : {'auto', True, False} + precompute_distances : 'auto' or bool, default='auto' Precompute distances (faster but takes more memory). 'auto' : do not precompute distances if n_samples * n_clusters > 12 @@ -694,15 +695,15 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): False : never precompute distances. - verbose : int, default 0 + verbose : int, default=0 Verbosity mode. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. - copy_x : bool, optional + copy_x : bool, default=True When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True (default), then the original data is not modified, ensuring X is C-contiguous. 
If False, the original data @@ -711,7 +712,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): the data mean, in this case it will also not ensure that data is C-contiguous which may cause a significant slowdown. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. @@ -719,7 +720,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. - algorithm : "auto", "full" or "elkan", default="auto" + algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". The "elkan" variation is more efficient by using the triangle inequality, but currently doesn't support sparse data. "auto" chooses @@ -727,12 +728,12 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Attributes ---------- - cluster_centers_ : array, [n_clusters, n_features] + cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. If the algorithm stops before fully converging (see ``tol`` and ``max_iter``), these will not be consistent with ``labels_``. - labels_ : array, shape (n_samples,) + labels_ : ndarray of shape (n_samples,) Labels of each point inertia_ : float From 893a4d45595d7bcc30dc56bd450a3cc3d8c920bc Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Tue, 10 Dec 2019 09:05:18 -0700 Subject: [PATCH 102/448] DOC fix docstring of AgglomerativeClustering based on sklearn guideline (#15764) --- sklearn/cluster/_hierarchical.py | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index 9cb80747fbc20..745fb66874005 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -683,23 +683,23 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): Parameters ---------- - n_clusters : int or None, optional (default=2) + n_clusters : int or None, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default: "euclidean" + affinity : str or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed". If linkage is "ward", only "euclidean" is accepted. If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - connectivity : array-like or callable, optional + connectivity : array-like or callable, default=None Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms @@ -707,17 +707,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. - compute_full_tree : bool or 'auto' (optional) - Stop early the construction of the tree at n_clusters. 
This is - useful to decrease computation time if the number of clusters is - not small compared to the number of samples. This option is - useful only when specifying a connectivity matrix. Note also that - when varying the number of clusters and using caching, it may - be advantageous to compute the full tree. It must be ``True`` if - ``distance_threshold`` is not ``None``. - - linkage : {"ward", "complete", "average", "single"}, optional \ - (default="ward") + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at n_clusters. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of samples. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is + equivalent to `False`. + + linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -730,7 +732,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. - distance_threshold : float, optional (default=None) + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. @@ -744,7 +746,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): ``distance_threshold=None``, it will be equal to the given ``n_clusters``. - labels_ : array [n_samples] + labels_ : ndarray of shape (n_samples) cluster labels for each point n_leaves_ : int @@ -753,7 +755,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): n_connected_components_ : int The estimated number of connected components in the graph. - children_ : array-like, shape (n_samples-1, 2) + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. A node `i` greater than or equal to `n_samples` is a non-leaf From 10eb17cd9aac943e08a62e427ea38ae1aab65c1e Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 09:07:39 -0700 Subject: [PATCH 103/448] DOC fix docstring of AffinityPropagation based on sklearn guideline (#15777) --- sklearn/cluster/_affinity_propagation.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 3393e0686bd02..eaba9ccf1ec20 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -242,51 +242,51 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): Parameters ---------- - damping : float, optional, default: 0.5 + damping : float, default=0.5 Damping factor (between 0.5 and 1) is the extent to which the current value is maintained relative to incoming values (weighted 1 - damping). 
This in order to avoid numerical oscillations when updating these values (messages). - max_iter : int, optional, default: 200 + max_iter : int, default=200 Maximum number of iterations. - convergence_iter : int, optional, default: 15 + convergence_iter : int, default=15 Number of iterations with no change in the number of estimated clusters that stops the convergence. - copy : boolean, optional, default: True + copy : bool, default=True Make a copy of input data. - preference : array-like, shape (n_samples,) or float, optional + preference : array-like of shape (n_samples,) or float, default=None Preferences for each point - points with larger values of preferences are more likely to be chosen as exemplars. The number of exemplars, ie of clusters, is influenced by the input preferences value. If the preferences are not passed as arguments, they will be set to the median of the input similarities. - affinity : string, optional, default=``euclidean`` - Which affinity to use. At the moment ``precomputed`` and - ``euclidean`` are supported. ``euclidean`` uses the + affinity : {'euclidean', 'precomputed'}, default='euclidean' + Which affinity to use. At the moment 'precomputed' and + ``euclidean`` are supported. 'euclidean' uses the negative squared euclidean distance between points. - verbose : boolean, optional, default: False + verbose : bool, default=False Whether to be verbose. Attributes ---------- - cluster_centers_indices_ : array, shape (n_clusters,) + cluster_centers_indices_ : ndarray of shape (n_clusters,) Indices of cluster centers - cluster_centers_ : array, shape (n_clusters, n_features) + cluster_centers_ : ndarray of shape (n_clusters, n_features) Cluster centers (if affinity != ``precomputed``). - labels_ : array, shape (n_samples,) + labels_ : ndarray of shape (n_samples,) Labels of each point - affinity_matrix_ : array, shape (n_samples, n_samples) + affinity_matrix_ : ndarray of shape (n_samples, n_samples) Stores the affinity matrix used in ``fit``. n_iter_ : int From 64ac463702638f60b99dfc373f96bad496893159 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 09:09:45 -0700 Subject: [PATCH 104/448] DOC fixed SpectralCoclustering and SpectralBiclustering docstrings following sklearn guideline (#15778) --- sklearn/cluster/_bicluster.py | 70 ++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 5bfd335549012..cced1674e167b 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -191,10 +191,10 @@ class SpectralCoclustering(BaseSpectral): Parameters ---------- - n_clusters : integer, optional, default: 3 + n_clusters : int, default=3 The number of biclusters to find. - svd_method : string, optional, default: 'randomized' + svd_method : {'randomized', 'arpack'}, default='randomized' Selects the algorithm for finding singular vectors. May be 'randomized' or 'arpack'. If 'randomized', use :func:`sklearn.utils.extmath.randomized_svd`, which may be faster @@ -202,20 +202,21 @@ class SpectralCoclustering(BaseSpectral): :func:`scipy.sparse.linalg.svds`, which is more accurate, but possibly slower in some cases. - n_svd_vecs : int, optional, default: None + n_svd_vecs : int, default=None Number of vectors to use in calculating the SVD. Corresponds to `ncv` when `svd_method=arpack` and `n_oversamples` when `svd_method` is 'randomized`. 
-    mini_batch : bool, optional, default: False
+    mini_batch : bool, default=False
         Whether to use mini-batch k-means, which is faster but may get
         different results.
 
-    init : {'k-means++', 'random' or an ndarray}
-         Method for initialization of k-means algorithm; defaults to
-         'k-means++'.
+    init : {'k-means++', 'random'} or ndarray of shape \
+            (n_clusters, n_features), default='k-means++'
+        Method for initialization of k-means algorithm; defaults to
+        'k-means++'.
 
-    n_init : int, optional, default: 10
+    n_init : int, default=10
         Number of random initializations that are tried with the
         k-means algorithm.
 
@@ -223,7 +224,7 @@ class SpectralCoclustering(BaseSpectral):
         chosen and the algorithm runs once. Otherwise, the algorithm
         is run for each initialization and the best solution chosen.
 
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int, default=None
         The number of jobs to use for the computation. This works by breaking
         down the pairwise matrix into n_jobs even slices and computing them in
         parallel.
 
@@ -232,24 +233,24 @@ class SpectralCoclustering(BaseSpectral):
         ``-1`` means using all processors. See :term:`Glossary ` for more details.
 
-    random_state : int, RandomState instance or None (default)
+    random_state : int, RandomState instance, default=None
         Used for randomizing the singular value decomposition and the k-means
         initialization. Use an int to make the randomness deterministic.
         See :term:`Glossary `.
 
     Attributes
     ----------
-    rows_ : array-like, shape (n_row_clusters, n_rows)
+    rows_ : array-like of shape (n_row_clusters, n_rows)
         Results of the clustering. `rows[i, r]` is True if
         cluster `i` contains row `r`. Available only after calling ``fit``.
 
-    columns_ : array-like, shape (n_column_clusters, n_columns)
+    columns_ : array-like of shape (n_column_clusters, n_columns)
         Results of the clustering, like `rows`.
 
-    row_labels_ : array-like, shape (n_rows,)
+    row_labels_ : array-like of shape (n_rows,)
         The bicluster label of each row.
 
-    column_labels_ : array-like, shape (n_cols,)
+    column_labels_ : array-like of shape (n_cols,)
         The bicluster label of each column.
 
     Examples
@@ -319,26 +320,28 @@ class SpectralBiclustering(BaseSpectral):
 
     Parameters
     ----------
-    n_clusters : integer or tuple (n_row_clusters, n_column_clusters)
+    n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
         The number of row and column clusters in the
         checkerboard structure.
 
-    method : string, optional, default: 'bistochastic'
+    method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
         Method of normalizing and converting singular vectors into
         biclusters. May be one of 'scale', 'bistochastic', or 'log'.
         The authors recommend using 'log'. If the data is sparse,
         however, log normalization will not work, which is why the
-        default is 'bistochastic'. CAUTION: if `method='log'`, the
-        data must not be sparse.
+        default is 'bistochastic'.
 
-    n_components : integer, optional, default: 6
+        .. warning::
+           if `method='log'`, the data must not be sparse.
+
+    n_components : int, default=6
         Number of singular vectors to check.
 
-    n_best : integer, optional, default: 3
+    n_best : int, default=3
         Number of best singular vectors to which to project the data
        for clustering.
 
-    svd_method : string, optional, default: 'randomized'
+    svd_method : {'randomized', 'arpack'}, default='randomized'
        Selects the algorithm for finding singular vectors. May be
        'randomized' or 'arpack'.
If 'randomized', uses :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster @@ -346,20 +349,21 @@ class SpectralBiclustering(BaseSpectral): `scipy.sparse.linalg.svds`, which is more accurate, but possibly slower in some cases. - n_svd_vecs : int, optional, default: None + n_svd_vecs : int, default=None Number of vectors to use in calculating the SVD. Corresponds to `ncv` when `svd_method=arpack` and `n_oversamples` when `svd_method` is 'randomized`. - mini_batch : bool, optional, default: False + mini_batch : bool, default=False Whether to use mini-batch k-means, which is faster but may get different results. - init : {'k-means++', 'random' or an ndarray} - Method for initialization of k-means algorithm; defaults to - 'k-means++'. + init : {'k-means++', 'random'} or ndarray of (n_clusters, n_features), \ + default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. - n_init : int, optional, default: 10 + n_init : int, default=10 Number of random initializations that are tried with the k-means algorithm. @@ -367,7 +371,7 @@ class SpectralBiclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by breaking down the pairwise matrix into n_jobs even slices and computing them in parallel. @@ -376,24 +380,24 @@ class SpectralBiclustering(BaseSpectral): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. Attributes ---------- - rows_ : array-like, shape (n_row_clusters, n_rows) + rows_ : array-like of shape (n_row_clusters, n_rows) Results of the clustering. `rows[i, r]` is True if cluster `i` contains row `r`. Available only after calling ``fit``. - columns_ : array-like, shape (n_column_clusters, n_columns) + columns_ : array-like of shape (n_column_clusters, n_columns) Results of the clustering, like `rows`. - row_labels_ : array-like, shape (n_rows,) + row_labels_ : array-like of shape (n_rows,) Row partition labels. - column_labels_ : array-like, shape (n_cols,) + column_labels_ : array-like of shape (n_cols,) Column partition labels. Examples From e504ea7dd275fdd4539b93c1530471135750fb51 Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Tue, 10 Dec 2019 09:36:39 -0700 Subject: [PATCH 105/448] DOC fix FeatureAgglomeration and MiniBatchKMeans docstring following sklearn guideline (#15809) --- sklearn/cluster/_hierarchical.py | 44 +++++++++++++++++--------------- sklearn/cluster/_k_means.py | 29 +++++++++++---------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index 745fb66874005..9883f7e809d48 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -716,8 +716,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. By default `compute_full_tree` is "auto", which is equivalent to `True` when `distance_threshold` is not `None` or that `n_clusters` - is inferior to 100 or `0.02 * n_samples`. 
Otherwise, "auto" is - equivalent to `False`. + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. The linkage criterion determines which @@ -924,21 +924,21 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): Parameters ---------- - n_clusters : int or None, optional (default=2) + n_clusters : int, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default "euclidean" + affinity : str or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - connectivity : array-like or callable, optional + connectivity : array-like or callable, default=None Connectivity matrix. Defines for each feature the neighboring features following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms @@ -946,17 +946,19 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. - compute_full_tree : bool or 'auto', optional, default "auto" - Stop early the construction of the tree at n_clusters. This is - useful to decrease computation time if the number of clusters is - not small compared to the number of features. This option is - useful only when specifying a connectivity matrix. Note also that - when varying the number of clusters and using caching, it may - be advantageous to compute the full tree. It must be ``True`` if - ``distance_threshold`` is not ``None``. + compute_full_tree : 'auto' or bool, optional, default='auto' + Stop early the construction of the tree at n_clusters. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of features. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. - linkage : {"ward", "complete", "average", "single"}, optional\ - (default="ward") + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -969,12 +971,12 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): - single uses the minimum of the distances between all observations of the two sets. 
- pooling_func : callable, default np.mean + pooling_func : callable, default=np.mean This combines the values of agglomerated features into a single value, and should accept an array of shape [M, N] and the keyword argument `axis=1`, and reduce it to an array of size [M]. - distance_threshold : float, optional (default=None) + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. @@ -988,7 +990,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): ``distance_threshold=None``, it will be equal to the given ``n_clusters``. - labels_ : array-like, (n_features,) + labels_ : array-like of (n_features,) cluster labels for each feature. n_leaves_ : int @@ -997,7 +999,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_connected_components_ : int The estimated number of connected components in the graph. - children_ : array-like, shape (n_nodes-1, 2) + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. A node `i` greater than or equal to `n_features` is a non-leaf @@ -1005,7 +1007,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_features + i` - distances_ : array-like, shape (n_nodes-1,) + distances_ : array-like of shape (n_nodes-1,) Distances between nodes in the corresponding place in `children_`. Only computed if distance_threshold is not None. diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py index f470d61423b2c..71be86a087629 100644 --- a/sklearn/cluster/_k_means.py +++ b/sklearn/cluster/_k_means.py @@ -1336,12 +1336,13 @@ class MiniBatchKMeans(KMeans): Parameters ---------- - n_clusters : int, optional, default: 8 + n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. - init : {'k-means++', 'random' or an ndarray}, default: 'k-means++' - Method for initialization, defaults to 'k-means++': + init : {'k-means++', 'random'} or ndarray of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section @@ -1353,26 +1354,26 @@ class MiniBatchKMeans(KMeans): If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - max_iter : int, optional + max_iter : int, default=100 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, optional, default: 100 + batch_size : int, default=100 Size of the mini batches. - verbose : bool, optional + verbose : int, default=0 Verbosity mode. compute_labels : bool, default=True Compute label assignment and inertia for the complete dataset once the minibatch optimization has converged in fit. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization and random reassignment. Use an int to make the randomness deterministic. See :term:`Glossary `. 
- tol : float, default: 0.0 + tol : float, default=0.0 Control early stopping based on the relative center changes as measured by a smoothed, variance-normalized of the mean center squared position changes. This early stopping heuristics is @@ -1383,25 +1384,27 @@ class MiniBatchKMeans(KMeans): To disable convergence detection based on normalized center change, set tol to 0.0 (default). - max_no_improvement : int, default: 10 + max_no_improvement : int, default=10 Control early stopping based on the consecutive number of mini batches that does not yield an improvement on the smoothed inertia. To disable convergence detection based on inertia, set max_no_improvement to None. - init_size : int, optional, default: 3 * batch_size + init_size : int, default=None Number of samples to randomly sample for speeding up the initialization (sometimes at the expense of accuracy): the only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. + If `None`, `init_size= 3 * batch_size`. + n_init : int, default=3 Number of random initializations that are tried. In contrast to KMeans, the algorithm is only run once, using the best of the ``n_init`` initializations as measured by inertia. - reassignment_ratio : float, default: 0.01 + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more easily reassigned, which means that the @@ -1411,10 +1414,10 @@ class MiniBatchKMeans(KMeans): Attributes ---------- - cluster_centers_ : array, [n_clusters, n_features] + cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers - labels_ : + labels_ : int Labels of each point (if compute_labels is set to True). inertia_ : float From a07974b3c05f5f5b957aea99fe8c199e29485f5b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Dec 2019 20:29:55 +0100 Subject: [PATCH 106/448] FIX add best_score_ to Ridge*CV estimators (#15655) --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/linear_model/_ridge.py | 10 +++++++++- sklearn/linear_model/tests/test_ridge.py | 20 +++++++++++++++++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index a926d86896e7d..e7b9fdd24989d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -81,6 +81,10 @@ Changelog `store_cv_values` is `True`. :pr:`15652` by :user:`Jérôme Dockès `. +- |Fix| add `best_score_` attribute to :class:`linear_model.RidgeCV` and + :class:`linear_model.RidgeClassifierCV`. + :pr:`15653` by :user:`Jérôme Dockès `. + :mod:`sklearn.preprocessing` ............................ 
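A minimal sketch of the new attribute in use, on the bundled diabetes data (printed values depend on the data and the alpha grid)::

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV

    X, y = load_diabetes(return_X_y=True)
    reg = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
    print(reg.alpha_)       # alpha selected by cross-validation
    print(reg.best_score_)  # score of the base estimator for that alpha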
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1c0407066048c..3b54a4eb5b3cb 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1576,6 +1576,7 @@ def fit(self, X, y, sample_weight=None): store_cv_values=self.store_cv_values) estimator.fit(X, y, sample_weight=sample_weight) self.alpha_ = estimator.alpha_ + self.best_score_ = estimator.best_score_ if self.store_cv_values: self.cv_values_ = estimator.cv_values_ else: @@ -1591,6 +1592,7 @@ def fit(self, X, y, sample_weight=None): gs.fit(X, y, sample_weight=sample_weight) estimator = gs.best_estimator_ self.alpha_ = gs.best_estimator_.alpha + self.best_score_ = gs.best_score_ self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ @@ -1693,6 +1695,9 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): alpha_ : float Estimated regularization parameter. + best_score_ : float + Mean cross-validated score of the estimator with the best alpha found. + Examples -------- >>> from sklearn.datasets import load_diabetes @@ -1795,7 +1800,10 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): ``fit_intercept = False``. alpha_ : float - Estimated regularization parameter + Estimated regularization parameter. + + best_score_ : float + Mean cross-validated score of the estimator with the best alpha found. classes_ : array of shape (n_classes,) The classes labels. diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index ab45a093500df..a92e830aba66e 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -664,17 +664,31 @@ def _test_ridge_cv(filter_): @pytest.mark.parametrize( "ridge, make_dataset", - [(RidgeCV(), make_regression), - (RidgeClassifierCV(), make_classification)] + [(RidgeCV(store_cv_values=False), make_regression), + (RidgeClassifierCV(store_cv_values=False), make_classification)] ) def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset): # Check that `cv_values_` is not stored when store_cv_values is False X, y = make_dataset(n_samples=6, random_state=42) - ridge.set_params(store_cv_values=False) ridge.fit(X, y) assert not hasattr(ridge, "cv_values_") +@pytest.mark.parametrize( + "ridge, make_dataset", + [(RidgeCV(), make_regression), + (RidgeClassifierCV(), make_classification)] +) +@pytest.mark.parametrize("cv", [None, 3]) +def test_ridge_best_score(ridge, make_dataset, cv): + # check that the best_score_ is store + X, y = make_dataset(n_samples=6, random_state=42) + ridge.set_params(store_cv_values=False, cv=cv) + ridge.fit(X, y) + assert hasattr(ridge, "best_score_") + assert isinstance(ridge.best_score_, float) + + def _test_ridge_diabetes(filter_): ridge = Ridge(fit_intercept=False) ridge.fit(filter_(X_diabetes), y_diabetes) From c7fccc697e29173b8fdb24dccf8816d5e4114933 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 11 Dec 2019 18:22:51 +0800 Subject: [PATCH 107/448] DOC Correct best_score_ docstring in RidgeCV (#15859) --- sklearn/linear_model/_ridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 3b54a4eb5b3cb..2a24fba4675a5 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1696,7 +1696,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Estimated regularization parameter. best_score_ : float - Mean cross-validated score of the estimator with the best alpha found. 
+ Score of base estimator with best alpha. Examples -------- @@ -1803,7 +1803,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Estimated regularization parameter. best_score_ : float - Mean cross-validated score of the estimator with the best alpha found. + Score of base estimator with best alpha. classes_ : array of shape (n_classes,) The classes labels. From 6c582888abfc1c2affe42554c02f86bbc4470df0 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 11 Dec 2019 18:33:53 +0800 Subject: [PATCH 108/448] TST Specify random_state in test_cv_iterable_wrapper (#15829) --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5d3c41900472e..eb5eb192a6921 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1338,7 +1338,7 @@ def test_cv_iterable_wrapper(): list(kf_iter_wrapped.split(X, y))) # If the splits are randomized, successive calls to split yields different # results - kf_randomized_iter = KFold(shuffle=True).split(X, y) + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) # numpy's assert_array_equal properly compares nested lists np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)), From e0a8f7155a6bdf15831c196159882dabce8055af Mon Sep 17 00:00:00 2001 From: Sambhav Kothari Date: Fri, 13 Dec 2019 00:44:59 +0000 Subject: [PATCH 109/448] DOC Include LinearSV{C, R} in models that support sample_weights (#15871) --- doc/modules/svm.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 03020cfd2252c..c5da9f09fa720 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -267,10 +267,11 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``. :scale: 75 -:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR` and -:class:`OneClassSVM` implement also weights for individual samples in method -``fit`` through keyword ``sample_weight``. Similar to ``class_weight``, these -set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``. +:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`, :class:`LinearSVC`, +:class:`LinearSVR` and :class:`OneClassSVM` implement also weights for +individual samples in method ``fit`` through keyword ``sample_weight``. Similar +to ``class_weight``, these set the parameter ``C`` for the i-th example to +``C * sample_weight[i]``. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_weighted_samples_001.png From dc111489a1a13b025e0d787c6a0369c1786654dc Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Fri, 13 Dec 2019 11:51:17 +0800 Subject: [PATCH 110/448] DOC correct some indents (#15875) --- sklearn/exceptions.py | 4 ++-- sklearn/utils/_testing.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 590fa416dd807..c3da8e8d7986f 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -62,8 +62,8 @@ class ConvergenceWarning(UserWarning): ... [1, 0], ... [1, 0]]) # last point is duplicated >>> with warnings.catch_warnings(record=True) as w: - ... km = KMeans(n_clusters=4).fit(X) - ... print(w[-1].message) + ... km = KMeans(n_clusters=4).fit(X) + ... print(w[-1].message) Number of distinct clusters (3) found smaller than n_clusters (4). 
Possibly due to duplicate points in X. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 4e4e6043eae3d..d5ad9d262da93 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -276,8 +276,8 @@ def ignore_warnings(obj=None, category=Warning): ... warnings.warn('buhuhuhu') >>> def nasty_warn(): - ... warnings.warn('buhuhuhu') - ... print(42) + ... warnings.warn('buhuhuhu') + ... print(42) >>> ignore_warnings(nasty_warn)() 42 From 42e17b34876101bec79786b42206d4f2e946c278 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 13 Dec 2019 15:05:51 -0500 Subject: [PATCH 111/448] MNT Removed deprecated attributes and parameters -- ctnd (#15804) --- doc/modules/ensemble.rst | 2 +- sklearn/decomposition/_nmf.py | 17 +++------- sklearn/decomposition/tests/test_nmf.py | 4 --- sklearn/discriminant_analysis.py | 19 +++-------- sklearn/ensemble/_forest.py | 20 +++++------ sklearn/ensemble/_gb.py | 8 ++--- .../ensemble/tests/test_gradient_boosting.py | 5 +-- sklearn/metrics/_classification.py | 19 +---------- sklearn/metrics/tests/test_classification.py | 5 --- sklearn/metrics/tests/test_common.py | 2 -- sklearn/preprocessing/_data.py | 21 ++++-------- sklearn/preprocessing/tests/test_data.py | 18 ---------- sklearn/tests/test_discriminant_analysis.py | 17 +++------- sklearn/tree/_classes.py | 34 +++++++++---------- sklearn/tree/tests/test_tree.py | 2 +- sklearn/utils/_testing.py | 14 ++------ sklearn/utils/estimator_checks.py | 3 -- 17 files changed, 59 insertions(+), 151 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c2897ed518509..8a414e5371511 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1323,7 +1323,7 @@ computationally expensive. StackingRegressor(...) >>> print('R2 score: {:.2f}' ... .format(multi_layer_regressor.score(X_test, y_test))) - R2 score: 0.82 + R2 score: 0.83 .. topic:: References diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 9d335eb775d8b..6d5509611cefd 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -842,7 +842,7 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', def non_negative_factorization(X, W=None, H=None, n_components=None, - init='warn', update_H=True, solver='cd', + init=None, update_H=True, solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, alpha=0., l1_ratio=0., regularization=None, random_state=None, @@ -891,10 +891,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' Method used to initialize the procedure. - Default: 'random'. - - The default value will change from 'random' to None in version 0.23 - to make it consistent with decomposition.NMF. + Default: None. Valid options: @@ -915,6 +912,9 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, - 'custom': use custom matrices W and H + .. versionchanged:: 0.23 + The default value of `init` changed from 'random' to None in 0.23. + update_H : boolean, default: True Set to True, both W and H will be estimated from initial guesses. Set to False, only W will be estimated. 
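Callers who relied on the old default can keep the previous behaviour by passing ``init`` explicitly; a small sketch on synthetic non-negative data::

    import numpy as np
    from sklearn.decomposition import non_negative_factorization

    X = np.abs(np.random.RandomState(0).randn(6, 5))
    W, H, n_iter = non_negative_factorization(
        X, n_components=2, init='random', random_state=0)
    print(W.shape, H.shape)   # (6, 2) (2, 5)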
@@ -1028,13 +1028,6 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol=%r)" % tol) - if init == "warn": - if n_components < n_features: - warnings.warn("The default value of init will change from " - "random to None in 0.23 to make it consistent " - "with decomposition.NMF.", FutureWarning) - init = "random" - # check W and H, or initialize them if init == 'custom' and update_H: _check_init(H, (n_components, n_features), "NMF (input H)") diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index d98ad551513e7..4fd21ffbf5b1d 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -224,10 +224,6 @@ def test_non_negative_factorization_checking(): A = np.ones((2, 2)) # Test parameters checking is public function nnmf = non_negative_factorization - msg = ("The default value of init will change from " - "random to None in 0.23 to make it consistent " - "with decomposition.NMF.") - assert_warns_message(FutureWarning, msg, nnmf, A, A, A, np.int64(1)) msg = ("Number of components must be a positive integer; " "got (n_components=1.5)") assert_raise_message(ValueError, msg, nnmf, A, A, A, 1.5, 'random') diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 4492d0868994d..1495d00620911 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -423,7 +423,6 @@ def fit(self, X, y): y : array, shape (n_samples,) Target values. """ - # FIXME: Future warning to be removed in 0.23 X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) @@ -455,21 +454,11 @@ def fit(self, X, y): self._max_components = max_components else: if self.n_components > max_components: - warnings.warn( + raise ValueError( "n_components cannot be larger than min(n_features, " - "n_classes - 1). Using min(n_features, " - "n_classes - 1) = min(%d, %d - 1) = %d components." - % (X.shape[1], len(self.classes_), max_components), - ChangedBehaviorWarning) - future_msg = ("In version 0.23, setting n_components > min(" - "n_features, n_classes - 1) will raise a " - "ValueError. You should set n_components to None" - " (default), or a value smaller or equal to " - "min(n_features, n_classes - 1).") - warnings.warn(future_msg, FutureWarning) - self._max_components = max_components - else: - self._max_components = self.n_components + "n_classes - 1)." + ) + self._max_components = self.n_components if self.solver == 'svd': if self.shrinkage is not None: diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index eba59c232531b..7e88f0c2f189a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -935,14 +935,14 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. @@ -1253,14 +1253,14 @@ class RandomForestRegressor(ForestRegressor): .. 
versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. bootstrap : boolean, optional (default=True) @@ -1530,14 +1530,14 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. bootstrap : boolean, optional (default=False) @@ -1840,14 +1840,14 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. bootstrap : boolean, optional (default=False) @@ -2078,14 +2078,14 @@ class RandomTreesEmbedding(BaseForest): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. sparse_output : bool, optional (default=True) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c3971e019a088..667a526e486a9 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -868,14 +868,14 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. 
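For code migrating off the deprecated parameter, a minimal sketch passing ``min_impurity_decrease`` instead of ``min_impurity_split`` (synthetic data, values purely illustrative)::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_classification(n_samples=100, random_state=0)
    clf = GradientBoostingClassifier(n_estimators=10, min_impurity_decrease=1e-7,
                                     random_state=0).fit(X, y)
    print(clf.score(X, y))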
init : estimator or 'zero', optional (default=None) @@ -1340,14 +1340,14 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. init : estimator or 'zero', optional (default=None) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 5fe9dee573d1d..a28c69d0f7cc5 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1170,9 +1170,10 @@ def test_non_uniform_weights_toy_edge_case_clf(): def check_sparse_input(EstimatorClass, X, X_sparse, y): dense = EstimatorClass(n_estimators=10, random_state=0, - max_depth=2).fit(X, y) + max_depth=2, min_impurity_decrease=1e-7).fit(X, y) sparse = EstimatorClass(n_estimators=10, random_state=0, - max_depth=2).fit(X_sparse, y) + max_depth=2, + min_impurity_decrease=1e-7).fit(X_sparse, y) assert_array_almost_equal(sparse.apply(X), dense.apply(X)) assert_array_almost_equal(sparse.predict(X), dense.predict(X)) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index c7101f7a38eeb..322ac3409722f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1986,7 +1986,7 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): +def hamming_loss(y_true, y_pred, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. @@ -2001,17 +2001,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. - labels : array, shape = [n_labels], optional (default='deprecated') - Integer array of labels. If not provided, labels will be inferred - from y_true and y_pred. - - .. versionadded:: 0.18 - .. deprecated:: 0.21 - This parameter ``labels`` is deprecated in version 0.21 and will - be removed in version 0.23. Hamming loss uses ``y_true.shape[1]`` - for the number of labels when y_true is binary label indicators, - so it is unnecessary for the user to specify. - sample_weight : array-like of shape (n_samples,), default=None Sample weights. @@ -2071,12 +2060,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): y_type, y_true, y_pred = _check_targets(y_true, y_pred) check_consistent_length(y_true, y_pred, sample_weight) - if labels is not None: - warnings.warn("The labels parameter is unused. It was" - " deprecated in version 0.21 and" - " will be removed in version 0.23", - FutureWarning) - if sample_weight is None: weight_average = 1. 
else: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 66ea486f955b7..4c1db4b55bb16 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1176,11 +1176,6 @@ def test_multilabel_hamming_loss(): assert hamming_loss(y1, np.zeros_like(y1), sample_weight=w) == 2. / 3 # sp_hamming only works with 1-D arrays assert hamming_loss(y1[0], y2[0]) == sp_hamming(y1[0], y2[0]) - assert_warns_message(FutureWarning, - "The labels parameter is unused. It was" - " deprecated in version 0.21 and" - " will be removed in version 0.23", - hamming_loss, y1, y2, labels=[0, 1]) def test_jaccard_score_validation(): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 991af61537012..331bcf197dccb 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -351,8 +351,6 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "roc_curve", "precision_recall_curve", - "hamming_loss", - "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", "jaccard_score", diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 19e21c0862cf7..cae75be2e591c 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2606,8 +2606,8 @@ def quantile_transform(X, axis=0, n_quantiles=1000, input is already a numpy array). If True, a copy of `X` is transformed, leaving the original `X` unchanged - ..versionchnanged:: 0.22 - The default value of `copy` changed from False to True in 0.22. + ..versionchnanged:: 0.23 + The default value of `copy` changed from False to True in 0.23. Returns ------- @@ -3008,7 +3008,7 @@ def _more_tags(self): return {'allow_nan': True} -def power_transform(X, method='warn', standardize=True, copy=True): +def power_transform(X, method='yeo-johnson', standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for @@ -3032,15 +3032,15 @@ def power_transform(X, method='warn', standardize=True, copy=True): X : array-like, shape (n_samples, n_features) The data to be transformed using a power transformation. - method : str + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' The power transform method. Available methods are: - 'yeo-johnson' [1]_, works with positive and negative values - 'box-cox' [2]_, only works with strictly positive values - The default method will be changed from 'box-cox' to 'yeo-johnson' - in version 0.23. To suppress the FutureWarning, explicitly set the - parameter. + .. versionchanged:: 0.23 + The default value of the `method` parameter changed from + 'box-cox' to 'yeo-johnson' in 0.23. standardize : boolean, default=True Set to True to apply zero-mean, unit-variance normalization to the @@ -3092,12 +3092,5 @@ def power_transform(X, method='warn', standardize=True, copy=True): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). """ - if method == 'warn': - warnings.warn("The default value of 'method' will change from " - "'box-cox' to 'yeo-johnson' in version 0.23. 
Set " - "the 'method' argument explicitly to silence this " - "warning in the meantime.", - FutureWarning) - method = 'box-cox' pt = PowerTransformer(method=method, standardize=standardize, copy=copy) return pt.fit_transform(X) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a67c101dec499..9a8e31d468f1c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2452,21 +2452,3 @@ def test_power_transformer_copy_False(method, standardize): X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is X_inv_trans - - -def test_power_transform_default_method(): - X = np.abs(X_2d) - - future_warning_message = ( - "The default value of 'method' " - "will change from 'box-cox'" - ) - assert_warns_message(FutureWarning, future_warning_message, - power_transform, X) - - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - X_trans_default = power_transform(X) - - X_trans_boxcox = power_transform(X, method='box-cox') - assert_array_equal(X_trans_boxcox, X_trans_default) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 7b3e94bea793c..dcd4009a47a2d 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -332,7 +332,6 @@ def test_lda_store_covariance(): @pytest.mark.parametrize('n_features', [3, 5]) @pytest.mark.parametrize('n_classes', [5, 3]) def test_lda_dimension_warning(n_classes, n_features): - # FIXME: Future warning to be removed in 0.23 rng = check_random_state(0) n_samples = 10 X = rng.randn(n_samples, n_features) @@ -348,22 +347,14 @@ def test_lda_dimension_warning(n_classes, n_features): for n_components in [max_components + 1, max(n_features, n_classes - 1) + 1]: - # if n_components > min(n_classes - 1, n_features), raise warning + # if n_components > min(n_classes - 1, n_features), raise error. # We test one unit higher than max_components, and then something # larger than both n_features and n_classes - 1 to ensure the test # works for any value of n_component lda = LinearDiscriminantAnalysis(n_components=n_components) - msg = ("n_components cannot be larger than min(n_features, " - "n_classes - 1). Using min(n_features, " - "n_classes - 1) = min(%d, %d - 1) = %d components." % - (n_features, n_classes, max_components)) - assert_warns_message(ChangedBehaviorWarning, msg, lda.fit, X, y) - future_msg = ("In version 0.23, setting n_components > min(" - "n_features, n_classes - 1) will raise a " - "ValueError. You should set n_components to None" - " (default), or a value smaller or equal to " - "min(n_features, n_classes - 1).") - assert_warns_message(FutureWarning, future_msg, lda.fit, X, y) + msg = "n_components cannot be larger than " + with pytest.raises(ValueError, match=msg): + lda.fit(X, y) @pytest.mark.parametrize("data_type, expected_type", [ diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index ea43716e20ae6..4eb02464e786f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -293,19 +293,19 @@ def fit(self, X, y, sample_weight=None, check_input=True, min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) - if self.min_impurity_split is not None: + min_impurity_split = self.min_impurity_split + if min_impurity_split is not None: warnings.warn("The min_impurity_split parameter is deprecated. 
" - "Its default value will change from 1e-7 to 0 in " + "Its default value has changed from 1e-7 to 0 in " "version 0.23, and it will be removed in 0.25. " "Use the min_impurity_decrease parameter instead.", FutureWarning) - min_impurity_split = self.min_impurity_split - else: - min_impurity_split = 1e-7 - if min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") + if min_impurity_split < 0.: + raise ValueError("min_impurity_split must be greater than " + "or equal to 0") + else: + min_impurity_split = 0 if self.min_impurity_decrease < 0.: raise ValueError("min_impurity_decrease must be greater than " @@ -679,14 +679,14 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.19 - min_impurity_split : float, default=1e-7 + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. class_weight : dict, list of dicts, "balanced" or None, default=None @@ -1061,14 +1061,14 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. presort : deprecated, default='deprecated' @@ -1349,14 +1349,14 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. class_weight : dict, list of dicts, "balanced" or None, default=None @@ -1573,14 +1573,14 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.19 - min_impurity_split : float, (default=1e-7) + min_impurity_split : float, (default=0) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. deprecated:: 0.19 ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of - ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it + ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. 
max_leaf_nodes : int or None, optional (default=None) @@ -1645,7 +1645,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg = BaggingRegressor(extra_tree, random_state=0).fit( ... X_train, y_train) >>> reg.score(X_test, y_test) - 0.7823... + 0.7788... """ def __init__(self, criterion="mse", diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index dcd9d4c01a8ec..9f65ad7f68e83 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -803,7 +803,7 @@ def test_min_impurity_split(): est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0) assert est.min_impurity_split is None, ( - "Failed, min_impurity_split = {0} > 1e-7".format( + "Failed, min_impurity_split = {0} != None".format( est.min_impurity_split)) try: assert_warns(FutureWarning, est.fit, X, y) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index d5ad9d262da93..707c1dbd1b82d 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -53,8 +53,8 @@ __all__ = ["assert_equal", "assert_not_equal", "assert_raises", - "assert_raises_regexp", "assert_true", - "assert_false", "assert_almost_equal", "assert_array_equal", + "assert_raises_regexp", + "assert_almost_equal", "assert_array_equal", "assert_array_almost_equal", "assert_array_less", "assert_less", "assert_less_equal", "assert_greater", "assert_greater_equal", @@ -85,16 +85,6 @@ # the old name for now assert_raises_regexp = assert_raises_regex -deprecation_message = "'assert_true' is deprecated in version 0.21 " \ - "and will be removed in version 0.23. " \ - "Please use 'assert' instead." -assert_true = deprecated(deprecation_message)(_dummy.assertTrue) - -deprecation_message = "'assert_false' is deprecated in version 0.21 " \ - "and will be removed in version 0.23. " \ - "Please use 'assert' instead." -assert_false = deprecated(deprecation_message)(_dummy.assertFalse) - def assert_warns(warning_class, func, *args, **kw): """Test that a certain warning occurs. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 30c668237b371..b8471daf5deab 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -479,9 +479,6 @@ def _set_checking_parameters(estimator): # K-Means estimator.set_params(n_init=2) - if hasattr(estimator, "n_components"): - estimator.n_components = 2 - if name == 'TruncatedSVD': # TruncatedSVD doesn't run with n_components = n_features # This is ugly :-/ From 41747f6ecd6d3fe5d25229838554eae3d3083957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Fri, 13 Dec 2019 23:10:17 +0100 Subject: [PATCH 112/448] DOC Fix documentation of default values in tree classes (#15870) --- sklearn/tree/_classes.py | 194 +++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4eb02464e786f..9e45edd6bb063 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -244,9 +244,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: - raise ValueError( - 'Invalid value for max_features. Allowed string ' - 'values are "auto", "sqrt" or "log2".') + raise ValueError("Invalid value for max_features. 
" + "Allowed string values are 'auto', " + "'sqrt' or 'log2'.") elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, numbers.Integral): @@ -401,12 +401,12 @@ def predict(self, X, check_input=True): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. @@ -445,8 +445,7 @@ def predict(self, X, check_input=True): return proba[:, :, 0] def apply(self, X, check_input=True): - """ - Return the index of the leaf that each sample is predicted as. + """Return the index of the leaf that each sample is predicted as. .. versionadded:: 0.17 @@ -457,13 +456,13 @@ def apply(self, X, check_input=True): ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- - X_leaves : array_like, shape = [n_samples,] + X_leaves : array-like of shape (n_samples,) For each datapoint x in X, return the index of the leaf x ends up in. Leaves are numbered within ``[0; self.tree_.node_count)``, possibly with gaps in the @@ -485,14 +484,14 @@ def decision_path(self, X, check_input=True): ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- - indicator : sparse csr array, shape = [n_samples, n_nodes] - Return a node indicator matrix where non zero elements + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator CSR matrix where non zero elements indicates that the samples goes through the nodes. """ X = self._validate_X_predict(X, check_input) @@ -570,7 +569,7 @@ def feature_importances_(self): Returns ------- - feature_importances_ : array, shape = [n_features] + feature_importances_ : ndarray of shape (n_features,) Normalized total reduction of critera by feature (Gini importance). """ check_is_fitted(self) @@ -589,21 +588,21 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): Parameters ---------- - criterion : str, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. - splitter : str, optional (default="best") + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. 
- min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -614,7 +613,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -629,12 +628,12 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default=None) + max_features : int, float or {"auto", "sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -650,18 +649,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -689,9 +688,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - class_weight : dict, list of dicts, "balanced" or None, default=None + class_weight : dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. For + If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. @@ -715,7 +714,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. deprecated:: 0.22 - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. 
See @@ -725,7 +724,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): Attributes ---------- - classes_ : array of shape (n_classes,) or a list of such arrays + classes_ : ndarray of shape (n_classes,) or list of ndarray The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). @@ -735,10 +734,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. - max_features_ : int, + max_features_ : int The inferred value of max_features. - n_classes_ : int or list + n_classes_ : int or list of int The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). @@ -749,7 +748,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -838,7 +837,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. @@ -853,11 +852,12 @@ def fit(self, X, y, sample_weight=None, check_input=True, ignored if they would result in any single class carrying a negative weight in either child node. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : array-like of shape (n_samples, n_features), optional + X_idx_sorted : array-like of shape (n_samples, n_features), \ + default=None The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. @@ -865,7 +865,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Returns ------- - self : object + self : DecisionTreeClassifier Fitted estimator. """ @@ -882,24 +882,21 @@ def predict_proba(self, X, check_input=True): The predicted class probability is the fraction of samples of the same class in a leaf. - check_input : boolean, (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool - Run check_array on X. + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. Returns ------- - proba : array of shape (n_samples, n_classes), or a list of n_outputs \ - such arrays if n_outputs > 1. + proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ + such arrays if n_outputs > 1 The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. 
""" @@ -932,15 +929,15 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- - proba : array of shape (n_samples, n_classes), or a list of n_outputs \ - such arrays if n_outputs > 1. + proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ + such arrays if n_outputs > 1 The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -963,7 +960,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : str, optional (default="mse") + criterion : {"mse", "friedman_mse", "mae"}, default="mse" The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss @@ -975,17 +972,17 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - splitter : str, optional (default="best") + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -996,7 +993,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1011,12 +1008,12 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default=None) + max_features : int, float or {"auto", "sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1032,18 +1029,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. 
- random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1076,7 +1073,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. deprecated:: 0.22 - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1093,7 +1090,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. - max_features_ : int, + max_features_ : int The inferred value of max_features. n_features_ : int @@ -1102,7 +1099,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1189,7 +1186,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. @@ -1203,11 +1200,12 @@ def fit(self, X, y, sample_weight=None, check_input=True, that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : array-like of shape (n_samples, n_features), optional + X_idx_sorted : array-like of shape (n_samples, n_features), \ + default=None The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. @@ -1215,7 +1213,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Returns ------- - self : object + self : DecisionTreeRegressor Fitted estimator. """ @@ -1259,21 +1257,21 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Parameters ---------- - criterion : str, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. 
- splitter : str, optional (default="random") + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1284,7 +1282,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1299,12 +1297,12 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default="auto") + max_features : int, float, {"auto", "sqrt", "log2"} or None, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1320,18 +1318,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1359,9 +1357,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - class_weight : dict, list of dicts, "balanced" or None, default=None + class_weight : dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. For + If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. 
@@ -1380,7 +1378,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1390,14 +1388,14 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Attributes ---------- - classes_ : array of shape (n_classes,) or a list of such arrays + classes_ : ndarray of shape (n_classes,) or list of ndarray The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). - max_features_ : int, + max_features_ : int The inferred value of max_features. - n_classes_ : int or list + n_classes_ : int or list of int The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). @@ -1412,7 +1410,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1420,8 +1418,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier): See Also -------- - ExtraTreeRegressor, sklearn.ensemble.ExtraTreesClassifier, - sklearn.ensemble.ExtraTreesRegressor + ExtraTreeRegressor : An extremely randomized tree regressor. + sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier. + sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor. Notes ----- @@ -1483,7 +1482,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Parameters ---------- - criterion : str, optional (default="mse") + criterion : {"mse", "friedman_mse", "mae"}, default="mse" The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean @@ -1492,17 +1491,17 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - splitter : str, optional (default="random") + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1513,7 +1512,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. 
A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1528,12 +1527,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default="auto") + max_features : int, float, {"auto", "sqrt", "log2"} or None, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1549,13 +1548,13 @@ class ExtraTreeRegressor(DecisionTreeRegressor): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1583,12 +1582,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1598,7 +1597,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Attributes ---------- - max_features_ : int, + max_features_ : int The inferred value of max_features. n_features_ : int @@ -1607,7 +1606,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1615,8 +1614,9 @@ class ExtraTreeRegressor(DecisionTreeRegressor): See Also -------- - ExtraTreeClassifier, sklearn.ensemble.ExtraTreesClassifier, - sklearn.ensemble.ExtraTreesRegressor + ExtraTreeClassifier : An extremely randomized tree classifier. + sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier. + sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor. 
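Because a single extremely randomized tree is a high-variance learner, the classes cross-referenced above are the usual way to consume it; a sketch of wrapping ``ExtraTreeRegressor`` in a bagging ensemble (the diabetes data and the ``BaggingRegressor`` wrapper are illustrative choices, not mandated by the patch)::

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import BaggingRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.tree import ExtraTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Averaging many extremely randomized trees reduces the variance.
    reg = BaggingRegressor(ExtraTreeRegressor(random_state=0),
                           random_state=0).fit(X_train, y_train)
    print(reg.score(X_test, y_test))
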
Notes ----- From 6478e4bf6dce06f4864fda1c9fe6071a123ccd9b Mon Sep 17 00:00:00 2001 From: wenliwyan <12013376+wenliwyan@users.noreply.github.com> Date: Sat, 14 Dec 2019 19:52:54 +0800 Subject: [PATCH 113/448] DOC fix typo in docstring (#15887) --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3a1fdadfb94b7..258440d20c836 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -558,7 +558,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Examples -------- - In the following example, we construct a NeighborsClassifier + In the following example, we construct a NearestNeighbors class from an array representing our data set and ask who's the closest point to [1,1,1] From 43827c998d00fc8aa75cb530b88d67ad7917d434 Mon Sep 17 00:00:00 2001 From: shivamgargsya Date: Sun, 15 Dec 2019 01:37:31 +0530 Subject: [PATCH 114/448] DOC FIX default value for xticks_rotation in plot_confusion_matrix (#15890) --- sklearn/metrics/_plot/confusion_matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index be59c8dd9a847..fd9cd48489029 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -56,7 +56,7 @@ def plot(self, include_values=True, cmap='viridis', Colormap recognized by matplotlib. xticks_rotation : {'vertical', 'horizontal'} or float, \ - default='vertical' + default='horizontal' Rotation of xtick labels. values_format : str, default=None @@ -160,7 +160,7 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, Includes values in confusion matrix. xticks_rotation : {'vertical', 'horizontal'} or float, \ - default='vertical' + default='horizontal' Rotation of xtick labels. values_format : str, default=None From a05a5bc2b3d3ec77889125d82a88cad0b4c72a72 Mon Sep 17 00:00:00 2001 From: Gregory Morse Date: Sat, 14 Dec 2019 21:32:59 +0100 Subject: [PATCH 115/448] ENH Yield stack trace information in resilient mode model_selection warnings (#15622) --- doc/whats_new/v0.23.rst | 9 +++++++++ sklearn/exceptions.py | 5 ++++- sklearn/model_selection/_validation.py | 4 ++-- sklearn/model_selection/tests/test_validation.py | 10 ++++++++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e7b9fdd24989d..30cf098eee768 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -85,6 +85,15 @@ Changelog :class:`linear_model.RidgeClassifierCV`. :pr:`15653` by :user:`Jérôme Dockès `. +:mod:`sklearn.model_selection` +.............................. + +- |Enhancement| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` yields stack trace information + in fit failed warning messages in addition to previously emitted + type and details. + :pr:`15622` by :user:`Gregory Morse `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index c3da8e8d7986f..4acf7863dd682 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -139,7 +139,10 @@ class FitFailedWarning(RuntimeWarning): ... print(repr(w[-1].message)) FitFailedWarning('Estimator fit failed. The score on this train-test partition for these parameters will be set to 0.000000. - Details: \\nValueError: Penalty term must be positive; got (C=-2)\\n'...) 
+ Details: + \\nTraceback (most recent call last):...\\nValueError: + Penalty term must be positive; got (C=-2)\\n') + .. versionchanged:: 0.18 Moved from sklearn.cross_validation. diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 88eb5d49c4d0f..67a30c6416031 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -13,7 +13,7 @@ import warnings import numbers import time -from traceback import format_exception_only +from traceback import format_exc from contextlib import suppress import numpy as np @@ -532,7 +532,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, warnings.warn("Estimator fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%s" % - (error_score, format_exception_only(type(e), e)[0]), + (error_score, format_exc()), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index c72ac0c1b7a14..d1c67930fac77 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1632,8 +1632,14 @@ def test_fit_and_score_failing(): "partition for these parameters will be set to %f. " "Details: \n%s" % (fit_and_score_kwargs['error_score'], error_message)) - # check if the same warning is triggered - assert_warns_message(FitFailedWarning, warning_message, _fit_and_score, + + def test_warn_trace(msg): + assert 'Traceback (most recent call last):\n' in msg + split = msg.splitlines() # note: handles more than '\n' + mtb = split[0] + '\n' + split[-1] + return warning_message in mtb + # check traceback is included + assert_warns_message(FitFailedWarning, test_warn_trace, _fit_and_score, *fit_and_score_args, **fit_and_score_kwargs) fit_and_score_kwargs = {'error_score': 'raise'} From 078e3ef296f29b6a601b075ebd43c2792e8c620f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 17 Dec 2019 02:28:31 -0500 Subject: [PATCH 116/448] [MRG] MNT Fix assert run python script (#15905) --- build_tools/azure/install.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 2566ba4f4f3aa..b8b71fb0eec58 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -25,7 +25,7 @@ IF "%PYTHON_ARCH%"=="64" ( pip install numpy scipy cython pytest wheel pillow joblib ) if "%COVERAGE%" == "true" ( - pip install coverage codecov pytest-cov + pip install coverage==4.5.3 codecov pytest-cov ) python --version pip --version From 13134a884b92f8c601162ce4f125c2fc17c6ed4a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 17 Dec 2019 21:09:49 -0500 Subject: [PATCH 117/448] Fix imports in pip3 ubuntu by suffixing affected files (#15891) --- sklearn/_build_utils/deprecated_modules.py | 8 ++++---- sklearn/cluster/__init__.py | 6 +++--- sklearn/cluster/{_hierarchical.py => _agglomerative.py} | 6 +++--- sklearn/cluster/{_k_means.py => _kmeans.py} | 0 sklearn/cluster/_spectral.py | 2 +- sklearn/cluster/tests/test_hierarchical.py | 5 +++-- sklearn/cluster/tests/test_k_means.py | 8 ++++---- sklearn/decomposition/__init__.py | 2 +- sklearn/decomposition/{_online_lda.py => _lda.py} | 0 sklearn/decomposition/tests/test_online_lda.py | 4 ++-- sklearn/feature_extraction/__init__.py | 2 +- sklearn/feature_extraction/{_hashing.py => _hash.py} | 0 
sklearn/feature_extraction/text.py | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) rename sklearn/cluster/{_hierarchical.py => _agglomerative.py} (99%) rename sklearn/cluster/{_k_means.py => _kmeans.py} (100%) rename sklearn/decomposition/{_online_lda.py => _lda.py} (100%) rename sklearn/feature_extraction/{_hashing.py => _hash.py} (100%) diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 9ff7c7f224710..5f11ac1714022 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -47,9 +47,9 @@ 'SpectralBiclustering'), ('_birch', 'sklearn.cluster.birch', 'sklearn.cluster', 'Birch'), ('_dbscan', 'sklearn.cluster.dbscan_', 'sklearn.cluster', 'DBSCAN'), - ('_hierarchical', 'sklearn.cluster.hierarchical', 'sklearn.cluster', + ('_agglomerative', 'sklearn.cluster.hierarchical', 'sklearn.cluster', 'FeatureAgglomeration'), - ('_k_means', 'sklearn.cluster.k_means_', 'sklearn.cluster', 'KMeans'), + ('_kmeans', 'sklearn.cluster.k_means_', 'sklearn.cluster', 'KMeans'), ('_mean_shift', 'sklearn.cluster.mean_shift_', 'sklearn.cluster', 'MeanShift'), ('_optics', 'sklearn.cluster.optics_', 'sklearn.cluster', 'OPTICS'), @@ -101,7 +101,7 @@ ('_kernel_pca', 'sklearn.decomposition.kernel_pca', 'sklearn.decomposition', 'KernelPCA'), ('_nmf', 'sklearn.decomposition.nmf', 'sklearn.decomposition', 'NMF'), - ('_online_lda', 'sklearn.decomposition.online_lda', + ('_lda', 'sklearn.decomposition.online_lda', 'sklearn.decomposition', 'LatentDirichletAllocation'), ('_online_lda_fast', 'sklearn.decomposition.online_lda_fast', 'sklearn.decomposition', 'mean_change'), @@ -140,7 +140,7 @@ ('_dict_vectorizer', 'sklearn.feature_extraction.dict_vectorizer', 'sklearn.feature_extraction', 'DictVectorizer'), - ('_hashing', 'sklearn.feature_extraction.hashing', + ('_hash', 'sklearn.feature_extraction.hashing', 'sklearn.feature_extraction', 'FeatureHasher'), ('_stop_words', 'sklearn.feature_extraction.stop_words', 'sklearn.feature_extraction.text', 'ENGLISH_STOP_WORDS'), diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 2cdf4b074e1c3..5f3cc58507576 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -7,9 +7,9 @@ from ._mean_shift import (mean_shift, MeanShift, estimate_bandwidth, get_bin_seeds) from ._affinity_propagation import affinity_propagation, AffinityPropagation -from ._hierarchical import (ward_tree, AgglomerativeClustering, linkage_tree, - FeatureAgglomeration) -from ._k_means import k_means, KMeans, MiniBatchKMeans +from ._agglomerative import (ward_tree, AgglomerativeClustering, + linkage_tree, FeatureAgglomeration) +from ._kmeans import k_means, KMeans, MiniBatchKMeans from ._dbscan import dbscan, DBSCAN from ._optics import (OPTICS, cluster_optics_dbscan, compute_optics_graph, cluster_optics_xi) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_agglomerative.py similarity index 99% rename from sklearn/cluster/_hierarchical.py rename to sklearn/cluster/_agglomerative.py index 9883f7e809d48..c64b9b03f8d84 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_agglomerative.py @@ -25,7 +25,7 @@ from ._feature_agglomeration import AgglomerationTransform from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false -from ..utils import deprecated + ############################################################################### # For non fully-connected graphs @@ -249,8 +249,8 @@ def ward_tree(X, 
connectivity=None, n_clusters=None, return_distance=False): else: if n_clusters > n_samples: raise ValueError('Cannot provide more clusters than samples. ' - '%i n_clusters was asked, and there are %i samples.' - % (n_clusters, n_samples)) + '%i n_clusters was asked, and there are %i ' + 'samples.' % (n_clusters, n_samples)) n_nodes = 2 * n_samples - n_clusters # create inertia matrix diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_kmeans.py similarity index 100% rename from sklearn/cluster/_k_means.py rename to sklearn/cluster/_kmeans.py diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 78cdcc5073ccc..b6c5586e75b47 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,7 +15,7 @@ from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding -from ._k_means import k_means +from ._kmeans import k_means def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 49d102a57e4f3..49966b4338999 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -22,8 +22,9 @@ from sklearn.cluster import ward_tree from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration -from sklearn.cluster._hierarchical import (_hc_cut, _TREE_BUILDERS, - linkage_tree, _fix_connectivity) +from sklearn.cluster._agglomerative import (_hc_cut, _TREE_BUILDERS, + linkage_tree, + _fix_connectivity) from sklearn.feature_extraction.image import grid_to_graph from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index ea7e5b7825437..50c91382d3117 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -22,8 +22,8 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._k_means import _labels_inertia -from sklearn.cluster._k_means import _mini_batch_step +from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step from sklearn.datasets import make_blobs from io import StringIO from sklearn.metrics.cluster import homogeneity_score @@ -734,7 +734,7 @@ def test_k_means_function(): def test_x_squared_norms_init_centroids(): # Test that x_squared_norms can be None in _init_centroids - from sklearn.cluster._k_means import _init_centroids + from sklearn.cluster._kmeans import _init_centroids X_norms = np.sum(X**2, axis=1) precompute = _init_centroids( @@ -921,7 +921,7 @@ def test_sample_weight_length(): def test_check_normalize_sample_weight(): - from sklearn.cluster._k_means import _check_normalize_sample_weight + from sklearn.cluster._kmeans import _check_normalize_sample_weight sample_weight = None checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) assert _num_samples(X) == _num_samples(checked_sample_weight) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index 93a51b04f38d2..c8b93ea2e4f9c 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -16,7 +16,7 @@ MiniBatchDictionaryLearning, SparseCoder) from ._factor_analysis import FactorAnalysis from ..utils.extmath 
import randomized_svd -from ._online_lda import LatentDirichletAllocation +from ._lda import LatentDirichletAllocation __all__ = ['DictionaryLearning', 'FastICA', diff --git a/sklearn/decomposition/_online_lda.py b/sklearn/decomposition/_lda.py similarity index 100% rename from sklearn/decomposition/_online_lda.py rename to sklearn/decomposition/_lda.py diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index fdf993f2759bf..ca8392616e761 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -8,8 +8,8 @@ import pytest from sklearn.decomposition import LatentDirichletAllocation -from sklearn.decomposition._online_lda import (_dirichlet_expectation_1d, - _dirichlet_expectation_2d) +from sklearn.decomposition._lda import (_dirichlet_expectation_1d, + _dirichlet_expectation_2d) from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal diff --git a/sklearn/feature_extraction/__init__.py b/sklearn/feature_extraction/__init__.py index 2103fc67589c3..4591bfc6980c8 100644 --- a/sklearn/feature_extraction/__init__.py +++ b/sklearn/feature_extraction/__init__.py @@ -5,7 +5,7 @@ """ from ._dict_vectorizer import DictVectorizer -from ._hashing import FeatureHasher +from ._hash import FeatureHasher from .image import img_to_graph, grid_to_graph from . import text diff --git a/sklearn/feature_extraction/_hashing.py b/sklearn/feature_extraction/_hash.py similarity index 100% rename from sklearn/feature_extraction/_hashing.py rename to sklearn/feature_extraction/_hash.py diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2cd86a29cf4fa..2d8f7d840c55b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -27,7 +27,7 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import normalize -from ._hashing import FeatureHasher +from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils import _IS_32BIT, deprecated From 4ad4cc632ec37f7b5e05bde66dd675dd4ef7c545 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 18 Dec 2019 15:40:03 +0800 Subject: [PATCH 118/448] MNT Raise erorr when normalize is invalid in confusion_matrix (#15888) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/metrics/_classification.py | 4 ++++ sklearn/metrics/_plot/confusion_matrix.py | 4 ---- sklearn/metrics/tests/test_classification.py | 7 +++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index af08b832e9f6f..19a8327783b20 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,6 +15,13 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- +:mod:`sklearn.metrics` +...................... + +- |Fix| :func:`metrics.plot_confusion_matrix` now raises error when `normalize` + is invalid. Previously, it runs fine with no normalization. + :pr:`15888` by `Hanmin Qin`_. + :mod:`sklearn.utils` .................... 
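The change described in the what's new entry above tightens the accepted values of ``normalize``; after this patch the behaviour should look roughly like the following (the toy labels are arbitrary)::

    from sklearn.metrics import confusion_matrix

    y_true = [0, 0, 0, 1, 1, 1]
    y_pred = [0, 0, 1, 1, 1, 1]

    # Accepted values: 'true', 'pred', 'all' or None.
    print(confusion_matrix(y_true, y_pred, normalize="true"))

    # Anything else (e.g. True) now raises instead of being silently ignored.
    try:
        confusion_matrix(y_true, y_pred, normalize=True)
    except ValueError as exc:
        print(exc)   # normalize must be one of {'true', 'pred', 'all', None}
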
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 322ac3409722f..343e63b6c0ae9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -283,6 +283,10 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, check_consistent_length(y_true, y_pred, sample_weight) + if normalize not in ['true', 'pred', 'all', None]: + raise ValueError("normalize must be one of {'true', 'pred', " + "'all', None}") + n_labels = labels.size label_to_ind = {y: x for x, y in enumerate(labels)} # convert yt, yp into index diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index fd9cd48489029..1c3fc2715ffb3 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -184,10 +184,6 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, if not is_classifier(estimator): raise ValueError("plot_confusion_matrix only supports classifiers") - if normalize not in {'true', 'pred', 'all', None}: - raise ValueError("normalize must be one of {'true', 'pred', " - "'all', None}") - y_pred = estimator.predict(X) cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 4c1db4b55bb16..c33c3a829cc16 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -526,6 +526,13 @@ def test_confusion_matrix_normalize(normalize, cm_dtype, expected_results): assert cm.dtype.kind == cm_dtype +def test_confusion_matrix_normalize_wrong_option(): + y_test = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 0, 0, 0, 0, 0, 0, 0] + with pytest.raises(ValueError, match='normalize must be one of'): + confusion_matrix(y_test, y_pred, normalize=True) + + def test_confusion_matrix_normalize_single_class(): y_test = [0, 0, 0, 0, 1, 1, 1, 1] y_pred = [0, 0, 0, 0, 0, 0, 0, 0] From 7c58c2037f1238f4098db533ad7c519c0d553ad4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 18 Dec 2019 07:02:15 -0500 Subject: [PATCH 119/448] [MRG] MNT Adds comment regarding coverage pinning (#15912) * MNT Adds comment regarding coverage pinning * CLN Improves english --- build_tools/azure/install.cmd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index b8b71fb0eec58..387b555af62f5 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -25,6 +25,10 @@ IF "%PYTHON_ARCH%"=="64" ( pip install numpy scipy cython pytest wheel pillow joblib ) if "%COVERAGE%" == "true" ( + @rem Using coverage 5.0 will trigger relpath between 2 windows + @rem paths from different drives. Pinning can be removed when + @rem https://github.com/scikit-learn/scikit-learn/issues/15908 + @rem is resolved. 
pip install coverage==4.5.3 codecov pytest-cov ) python --version From b1a336782e92d77d81a85db088b704e08d3945d7 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 18 Dec 2019 07:21:50 -0500 Subject: [PATCH 120/448] [MRG] DOC Increases search results for API object results (#15574) --- .../static/js/searchtools.js | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/themes/scikit-learn-modern/static/js/searchtools.js b/doc/themes/scikit-learn-modern/static/js/searchtools.js index ca53abe4f0038..0d4ca2328b079 100644 --- a/doc/themes/scikit-learn-modern/static/js/searchtools.js +++ b/doc/themes/scikit-learn-modern/static/js/searchtools.js @@ -11,7 +11,9 @@ * - Removes ajax call to get context for each result * - Adjusts Search.query to remove duplicates in search results. * - Adjusts Scorer to rank objects higher. - * - Adds Search._total_results to limit the number of search results. + * - Adds Search._total_non_object_results to limit the number of search non + * object results. Object results do not perform another GET resquest, so they + * are cheap to display. */ if (!Scorer) { @@ -63,10 +65,10 @@ var Search = { _index: null, _queued_query: null, _pulse_status: -1, - _total_results: 10, + _total_non_object_results: 10, htmlToText: function (htmlString) { - var htmlString = htmlString.replace(//g, ""); + var htmlString = htmlString.replace(//g, ""); var htmlElement = document.createElement("span"); htmlElement.innerHTML = htmlString; $(htmlElement) @@ -218,22 +220,23 @@ var Search = { objectterms.slice(i + 1, objectterms.length) ); - if (results.length < this._total_results) { - results = $u.uniq(results.concat( - this.performObjectSearch(objectterms[i], others) - ), false, function (item) {return item[1]}); - } + results = $u.uniq(results.concat( + this.performObjectSearch(objectterms[i], others) + ), false, function (item) {return item[1]}); } - if (results.length < this._total_results) { - // lookup as search terms in fulltext - results = results.concat( - this.performTermsSearch(searchterms, excluded, terms, titleterms) - ); - } + var total_object_results = results.length; + + // lookup as search terms in fulltext + results = results.concat( + this.performTermsSearch(searchterms, excluded, terms, titleterms) + ); - if (results.length > this._total_results) { - results = results.slice(0, this._total_results); + // Only have _total_non_object_results results above the number of + // total number of object results + var results_limit = total_object_results + this._total_non_object_results + if (results.length > results_limit) { + results = results.slice(0, results_limit); } // let the scorer override scores with a custom scoring function From 2d687bfd0206cd5f3758ea8a6d24f8dffc8ef3f2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 18 Dec 2019 15:23:02 -0500 Subject: [PATCH 121/448] MNT Ignores warning in pyamg for deprecated scipy.random (#15914) --- sklearn/cluster/tests/test_spectral.py | 4 ++++ sklearn/manifold/tests/test_spectral_embedding.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index dc79f427afcdf..f5591c7348ebe 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,6 +191,10 @@ def test_discretize(n_samples): assert adjusted_rand_score(y_true, y_pred) > 0.8 +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# 
https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_clustering_with_arpack_amg_solvers(): # Test that spectral_clustering is the same for arpack and amg solver # Based on toy example from plot_segmentation_toy.py diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 295367a422f04..f99eae3783c05 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -181,6 +181,10 @@ def test_spectral_embedding_callable_affinity(X, seed=36): assert _check_with_col_sign_flipping(embed_rbf, embed_callable, 0.05) +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_embedding_amg_solver(seed=36): # Test spectral embedding with amg solver pytest.importorskip('pyamg') @@ -211,6 +215,10 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 1e-5) +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_embedding_amg_solver_failure(seed=36): # Test spectral embedding with amg solver failure, see issue #13393 pytest.importorskip('pyamg') From a5b67f972b941c4a5039033fd82fb9c5b90e8c78 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Dec 2019 21:29:16 +0100 Subject: [PATCH 122/448] DOC Instructions to troubleshoot Windows path length limit (#15916) --- doc/install.rst | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/install.rst b/doc/install.rst index 1e6ed734e1085..886ed272a65ba 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -126,7 +126,7 @@ If you have not installed NumPy or SciPy yet, you can also install these using conda or pip. When using pip, please ensure that *binary wheels* are used, and NumPy and SciPy are not recompiled from source, which can happen when using particular configurations of operating system and hardware (such as Linux on -a Raspberry Pi). +a Raspberry Pi). If you must install scikit-learn and its dependencies with pip, you can install it as ``scikit-learn[alldeps]``. @@ -255,3 +255,38 @@ WinPython for Windows The `WinPython `_ project distributes scikit-learn as an additional plugin. + + +Troubleshooting +=============== + +.. _windows_longpath: + +Error caused by file path length limit on Windows +------------------------------------------------- + +It can happen that pip fails to install packages when reaching the default path +size limit of Windows if Python is installed in a nested location such as the +`AppData` folder structure under the user home directory, for instance:: + + C:\Users\username>C:\Users\username\AppData\Local\Microsoft\WindowsApps\python.exe -m pip install scikit-learn + Collecting scikit-learn + ... 
+ Installing collected packages: scikit-learn + ERROR: Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'C:\\Users\\username\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python37\\site-packages\\sklearn\\datasets\\tests\\data\\openml\\292\\api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz' + +In this case it is possible to lift that limit in the Windows registry by +using the ``regedit`` tool: + +#. Type "regedit" in the Windows start menu to launch ``regedit``. + +#. Go to the + ``Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem`` + key. + +#. Edit the value of the ``LongPathsEnabled`` property of that key and set + it to 1. + +#. Reinstall scikit-learn (ignoring the previous broken installation):: + + pip install --exists-action=i scikit-learn From cc8d2d26b8f023e5ac122bfc35c7dd174a5d467d Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 19 Dec 2019 08:07:03 -0500 Subject: [PATCH 123/448] DOC add versionadded directive to some estimators (#15849) --- sklearn/cross_decomposition/_pls.py | 6 ++++++ sklearn/feature_extraction/image.py | 3 +++ sklearn/model_selection/_split.py | 2 ++ sklearn/pipeline.py | 2 ++ sklearn/preprocessing/_data.py | 2 ++ 5 files changed, 15 insertions(+) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 72ee5d4af6ba6..125c5946b1562 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -530,6 +530,8 @@ class PLSRegression(_PLS): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, (default 2) @@ -668,6 +670,8 @@ class PLSCanonical(_PLS): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, (default 2). @@ -810,6 +814,8 @@ class PLSSVD(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, default 2 diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 2cec6739e7f98..d0da784c526d7 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -298,6 +298,7 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1): patches = as_strided(arr, shape=shape, strides=strides) return patches + @deprecated("The function feature_extraction.image.extract_patches has been " "deprecated in 0.22 and will be removed in 0.24.") def extract_patches(arr, patch_shape=8, extraction_step=1): @@ -483,6 +484,8 @@ class PatchExtractor(BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.9 + Parameters ---------- patch_size : tuple of ints (patch_height, patch_width) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9a85b4049a3c3..bb0643e8c8edb 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1816,6 +1816,8 @@ class PredefinedSplit(BaseCrossValidator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.16 + Parameters ---------- test_fold : array-like, shape (n_samples,) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e4ac9007fe8e5..453ce2228d50d 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -47,6 +47,8 @@ class Pipeline(_BaseComposition): Read more in the :ref:`User Guide `. + .. 
versionadded:: 0.5 + Parameters ---------- steps : list diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index cae75be2e591c..9227efa958b43 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2679,6 +2679,8 @@ class PowerTransformer(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- method : str, (default='yeo-johnson') From 2e90b897768fd360ef855cb46e0b37f2b6faaf72 Mon Sep 17 00:00:00 2001 From: Oliver Urs Lenz Date: Thu, 19 Dec 2019 14:44:44 +0100 Subject: [PATCH 124/448] DOC clarify doc-string of roc_auc_score and add references (#15293) --- doc/modules/model_evaluation.rst | 16 +++-- sklearn/metrics/_ranking.py | 86 +++++++++++++++++---------- sklearn/metrics/tests/test_ranking.py | 2 +- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7af1e46578de6..24bf9541ebab4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1348,8 +1348,8 @@ the one-vs-rest algorithm computes the average of the ROC AUC scores for each class against all other classes. In both cases, the predicted labels are provided in an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular -class. The OvO and OvR algorithms supports weighting uniformly -(``average='macro'``) and weighting by the prevalence (``average='weighted'``). +class. The OvO and OvR algorithms support weighting uniformly +(``average='macro'``) and by prevalence (``average='weighted'``). **One-vs-one Algorithm**: Computes the average AUC of all possible pairwise combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted @@ -1380,10 +1380,10 @@ the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to ``'weighted'``. The ``'weighted'`` option returns a prevalence-weighted average as described in [FC2009]_. -**One-vs-rest Algorithm**: Computes the AUC of each class against the rest. -The algorithm is functionally the same as the multilabel case. To enable this -algorithm set the keyword argument ``multiclass`` to ``'ovr'``. Similar to -OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and +**One-vs-rest Algorithm**: Computes the AUC of each class against the rest +[PD2000]_. The algorithm is functionally the same as the multilabel case. To +enable this algorithm set the keyword argument ``multiclass`` to ``'ovr'``. +Like OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and ``'weighted'`` [F2001]_. In applications where a high false positive rate is not tolerable the parameter @@ -1422,6 +1422,10 @@ to the given limit. `_ Pattern Recognition Letters. 30. 27-38. + .. [PD2000] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper #IS-00-04, + Stern School of Business, New York University. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. `_ Pattern Recognition Letters, 27(8), pp. 861-874. diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 71731025e5649..e525539c0d706 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -248,27 +248,32 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. 
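To make the multiclass usage that this docstring rewrite describes concrete, a sketch along these lines should work (iris and ``LogisticRegression`` are arbitrary stand-ins; the scores must be probability estimates that sum to 1, and ``multi_class`` has to be passed explicitly)::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X, y = load_iris(return_X_y=True)
    proba = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)

    # One-vs-rest with the default macro average ...
    print(roc_auc_score(y, proba, multi_class="ovr"))

    # ... and one-vs-one with a prevalence-weighted average.
    print(roc_auc_score(y, proba, multi_class="ovo", average="weighted"))
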
- Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). Read more in the :ref:`User Guide `. Parameters ---------- - y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. - The multiclass case expects shape = [n_samples] and labels - with values in ``range(n_classes)``. - - y_score : array, shape = [n_samples] or [n_samples, n_classes] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. The multiclass case expects shape = [n_samples, n_classes] - where the scores correspond to probability estimates. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. In the binary and multilabel cases, these can be either + probability estimates or non-thresholded decision values (as returned + by `decision_function` on some classifiers). In the multiclass case, + these must be probability estimates which sum to 1. The binary + case expects a shape (n_samples,), and the scores must be the scores of + the class with the greater label. The multiclass and multilabel + cases expect a shape (n_samples, n_classes). In the multiclass case, + the order of the class scores must correspond to the order of + ``labels``, if provided, or else to the numerical or lexicographical + order of the labels in ``y_true``. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: Note: multiclass ROC AUC currently only handles the 'macro' and @@ -291,26 +296,32 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, sample_weight : array-like of shape (n_samples,), default=None Sample weights. - max_fpr : float > 0 and <= 1, optional - If not ``None``, the standardized partial AUC [3]_ over the range + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. - multi_class : string, 'ovr' or 'ovo', optional(default='raise') - Determines the type of multiclass configuration to use. - ``multi_class`` must be provided when ``y_true`` is multiclass. + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Multiclass only. Determines the type of configuration to use. The + default value raises an error, so either ``'ovr'`` or ``'ovo'`` must be + passed explicitly. ``'ovr'``: - Calculate metrics for the multiclass case using the one-vs-rest - approach. + Computes the AUC of each class against the rest [3]_ [4]_. 
This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. ``'ovo'``: - Calculate metrics for the multiclass case using the one-vs-one - approach. + Computes the average AUC of all possible pairwise combinations of + classes [5]_. Insensitive to class imbalance when + ``average == 'macro'``. - labels : array, shape = [n_classes] or None, optional (default=None) - List of labels to index ``y_score`` used for multiclass. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. + labels : array-like of shape (n_classes,), default=None + Multiclass only. List of labels that index the classes in ``y_score``. + If ``None``, the numerical or lexicographical order of the labels in + ``y_true`` is used. Returns ------- @@ -321,12 +332,22 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, .. [1] `Wikipedia entry for the Receiver operating characteristic `_ - .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition - Letters, 2006, 27(8):861-874. - - .. [3] `Analyzing a portion of the ROC curve. McClish, 1989 + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 `_ + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. 
+ `_ + See also -------- average_precision_score : Area under the precision-recall curve @@ -341,7 +362,6 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> roc_auc_score(y_true, y_scores) 0.75 - """ y_type = type_of_target(y_true) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 7ce7cf3e3814e..4542b8e2a2964 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -554,7 +554,7 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): result_unweighted) # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm - # on the same input (Provost & Domingos, 2001) + # on the same input (Provost & Domingos, 2000) result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 assert_almost_equal( roc_auc_score( From 88b6322877a557f94919bed34bed806efc82aa9c Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 09:51:22 -0500 Subject: [PATCH 125/448] MNT Adds skip lint to azure pipeline CI (#15904) --- azure-pipelines.yml | 10 ++++++++-- doc/developers/contributing.rst | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5f84a1ae94857..bf0cd41e1d366 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,8 +12,14 @@ jobs: - bash: conda create --name flake8_env --yes flake8 displayName: Install flake8 - bash: | - source activate flake8_env - ./build_tools/circle/linting.sh + if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then + # skip linting + echo "Skipping linting" + exit 0 + else + source activate flake8_env + ./build_tools/circle/linting.sh + fi displayName: Run linting diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 8d12187ade00b..6cca31e81813b 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -437,6 +437,7 @@ message, the following actions are taken. ---------------------- ------------------- [scipy-dev] Add a Travis build with our dependencies (numpy, scipy, etc ...) 
development builds [ci skip] CI is skipped completely + [lint skip] Azure pipeline skips linting [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots [doc build] Docs built including example gallery plots From a6c07f2b0862885850d455a3a8b995d57f7d5648 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 09:56:12 -0500 Subject: [PATCH 126/448] BLD Fixes bug when building with NO_MATHJAX=1 (#15892) --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index 0386f7676e0be..c4d7e578216fd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,6 +51,7 @@ if os.environ.get('NO_MATHJAX'): extensions.append('sphinx.ext.imgmath') imgmath_image_format = 'svg' + mathjax_path = '' else: extensions.append('sphinx.ext.mathjax') mathjax_path = ('https://cdn.jsdelivr.net/npm/mathjax@3/es5/' From 556eb9246d1290ea248f2e3b3581dc53e75b8061 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 12:05:17 -0500 Subject: [PATCH 127/448] [MRG] BUG Checks to number of axes in passed in ax more generically (#15760) --- doc/whats_new/v0.22.rst | 7 +++++ sklearn/inspection/_partial_dependence.py | 18 ++++++----- .../tests/test_plot_partial_dependence.py | 31 +++++++++++-------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 19a8327783b20..ae9cbbd74e313 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -28,6 +28,13 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`inspection.plot_partial_dependence` and + :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks + the number of axes passed in. :pr:`15760` by `Thomas Fan`_. + .. 
_changes_0_22: Version 0.22.0 diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 40a7f073ca818..12233a766969c 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -655,10 +655,12 @@ def convert_feature(fx): features = tmp_features - if isinstance(ax, list): - if len(ax) != len(features): - raise ValueError("Expected len(ax) == len(features), " - "got len(ax) = {}".format(len(ax))) + # Early exit if the axes does not have the correct number of axes + if ax is not None and not isinstance(ax, plt.Axes): + axes = np.asarray(ax, dtype=object) + if axes.size != len(features): + raise ValueError("Expected ax to have {} axes, got {}".format( + len(features), axes.size)) for i in chain.from_iterable(features): if i >= len(feature_names): @@ -886,16 +888,16 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): axes_ravel[i] = self.figure_.add_subplot(spec) else: # array-like - ax = check_array(ax, dtype=object, ensure_2d=False) + ax = np.asarray(ax, dtype=object) + if ax.size != n_features: + raise ValueError("Expected ax to have {} axes, got {}" + .format(n_features, ax.size)) if ax.ndim == 2: n_cols = ax.shape[1] else: n_cols = None - if ax.ndim == 1 and ax.shape[0] != n_features: - raise ValueError("Expected len(ax) == len(features), " - "got len(ax) = {}".format(len(ax))) self.bounding_ax_ = None self.figure_ = ax.ravel()[0].figure self.axes_ = ax diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 222ab0fc45ccd..abae91d4d2642 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -222,26 +222,31 @@ def test_plot_partial_dependence_passing_numpy_axes(pyplot, clf_boston, assert len(disp2.axes_[0, 1].get_lines()) == 2 +@pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)]) def test_plot_partial_dependence_incorrent_num_axes(pyplot, clf_boston, - boston): - grid_resolution = 25 - fig, (ax1, ax2, ax3) = pyplot.subplots(1, 3) + boston, nrows, ncols): + grid_resolution = 5 + fig, axes = pyplot.subplots(nrows, ncols) + axes_formats = [list(axes.ravel()), tuple(axes.ravel()), axes] - msg = r"Expected len\(ax\) == len\(features\), got len\(ax\) = 3" - with pytest.raises(ValueError, match=msg): - plot_partial_dependence(clf_boston, boston.data, - ['CRIM', ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names, - ax=[ax1, ax2, ax3]) + msg = "Expected ax to have 2 axes, got {}".format(nrows * ncols) disp = plot_partial_dependence(clf_boston, boston.data, - ['CRIM', ('CRIM', 'ZN')], + ['CRIM', 'ZN'], grid_resolution=grid_resolution, feature_names=boston.feature_names) - with pytest.raises(ValueError, match=msg): - disp.plot(ax=[ax1, ax2, ax3]) + for ax_format in axes_formats: + with pytest.raises(ValueError, match=msg): + plot_partial_dependence(clf_boston, boston.data, + ['CRIM', 'ZN'], + grid_resolution=grid_resolution, + feature_names=boston.feature_names, + ax=ax_format) + + # with axes object + with pytest.raises(ValueError, match=msg): + disp.plot(ax=ax_format) def test_plot_partial_dependence_with_same_axes(pyplot, clf_boston, boston): From a4db10152b4fc1b976a54eca83965b6aa0121597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 19 Dec 2019 21:52:06 +0100 Subject: [PATCH 128/448] EXA Minor fixes in plot_sparse_logistic_regression_20newsgroups.py (#15925) 
--- .../plot_sparse_logistic_regression_20newsgroups.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py index 78fdc64684550..7bfad99d991c5 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py +++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py @@ -1,7 +1,7 @@ """ -===================================================== -Multiclass sparse logisitic regression on newgroups20 -===================================================== +==================================================== +Multiclass sparse logistic regression on 20newgroups +==================================================== Comparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression to classify documents from the newgroups20 dataset. Multinomial logistic @@ -42,7 +42,6 @@ # Turn down for faster run time n_samples = 10000 -# Memorized fetch_rcv1 for faster access X, y = fetch_20newsgroups_vectorized('all', return_X_y=True) X = X[:n_samples] y = y[:n_samples] From 0d3de54c688a19ebe147ca7c6d78fa717de77a71 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 19 Dec 2019 22:10:56 +0100 Subject: [PATCH 129/448] BUG Do not shadow public functions with deprecated modules (#15846) --- sklearn/decomposition/__init__.py | 38 +++++++++++++------- sklearn/inspection/__init__.py | 21 ++++++++--- sklearn/tests/test_import_deprecations.py | 6 ++++ sklearn/utils/tests/test_deprecated_utils.py | 35 ++++++++++++++++++ 4 files changed, 84 insertions(+), 16 deletions(-) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index c8b93ea2e4f9c..42f661171eafe 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -4,19 +4,33 @@ this module can be regarded as dimensionality reduction techniques. """ -from ._nmf import NMF, non_negative_factorization -from ._pca import PCA -from ._incremental_pca import IncrementalPCA -from ._kernel_pca import KernelPCA -from ._sparse_pca import SparsePCA, MiniBatchSparsePCA -from ._truncated_svd import TruncatedSVD -from ._fastica import FastICA, fastica -from ._dict_learning import (dict_learning, dict_learning_online, +# TODO: remove me in 0.24 (as well as the noqa markers) and +# import the dict_learning func directly from the ._dict_learning +# module instead. +# Pre-cache the import of the deprecated module so that import +# sklearn.decomposition.dict_learning returns the function as in +# 0.21, instead of the module. 
+# https://github.com/scikit-learn/scikit-learn/issues/15842 +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from .dict_learning import dict_learning + + +from ._nmf import NMF, non_negative_factorization # noqa +from ._pca import PCA # noqa +from ._incremental_pca import IncrementalPCA # noqa +from ._kernel_pca import KernelPCA # noqa +from ._sparse_pca import SparsePCA, MiniBatchSparsePCA # noqa +from ._truncated_svd import TruncatedSVD # noqa +from ._fastica import FastICA, fastica # noqa +from ._dict_learning import (dict_learning_online, sparse_encode, DictionaryLearning, - MiniBatchDictionaryLearning, SparseCoder) -from ._factor_analysis import FactorAnalysis -from ..utils.extmath import randomized_svd -from ._lda import LatentDirichletAllocation + MiniBatchDictionaryLearning, SparseCoder) # noqa +from ._factor_analysis import FactorAnalysis # noqa +from ..utils.extmath import randomized_svd # noqa +from ._lda import LatentDirichletAllocation # noqa + __all__ = ['DictionaryLearning', 'FastICA', diff --git a/sklearn/inspection/__init__.py b/sklearn/inspection/__init__.py index 04d9d84ecaf02..904d16d74b016 100644 --- a/sklearn/inspection/__init__.py +++ b/sklearn/inspection/__init__.py @@ -1,8 +1,21 @@ """The :mod:`sklearn.inspection` module includes tools for model inspection.""" -from ._partial_dependence import partial_dependence -from ._partial_dependence import plot_partial_dependence -from ._partial_dependence import PartialDependenceDisplay -from ._permutation_importance import permutation_importance + +# TODO: remove me in 0.24 (as well as the noqa markers) and +# import the partial_dependence func directly from the +# ._partial_dependence module instead. +# Pre-cache the import of the deprecated module so that import +# sklearn.inspection.partial_dependence returns the function as in +# 0.21, instead of the module +# https://github.com/scikit-learn/scikit-learn/issues/15842 +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from .partial_dependence import partial_dependence + +from ._partial_dependence import plot_partial_dependence # noqa +from ._partial_dependence import PartialDependenceDisplay # noqa +from ._permutation_importance import permutation_importance # noqa + __all__ = [ 'partial_dependence', diff --git a/sklearn/tests/test_import_deprecations.py b/sklearn/tests/test_import_deprecations.py index 13b31e89b2862..29c4259fe1e5a 100644 --- a/sklearn/tests/test_import_deprecations.py +++ b/sklearn/tests/test_import_deprecations.py @@ -24,6 +24,12 @@ def test_import_is_deprecated(deprecated_path, importee): # TODO: remove in 0.24 + # Special case for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + if deprecated_path in ("sklearn.decomposition.dict_learning", + "sklearn.inspection.partial_dependence"): + pytest.skip("No warning can be raised for " + deprecated_path) + expected_message = ( "The {deprecated_path} module is deprecated in version " "0.22 and will be removed in version 0.24. 
" diff --git a/sklearn/utils/tests/test_deprecated_utils.py b/sklearn/utils/tests/test_deprecated_utils.py index da41e7e44ddb3..08bd95aacc284 100644 --- a/sklearn/utils/tests/test_deprecated_utils.py +++ b/sklearn/utils/tests/test_deprecated_utils.py @@ -1,7 +1,10 @@ import pytest +import types import numpy as np +import warnings from sklearn.dummy import DummyClassifier +from sklearn.utils import all_estimators from sklearn.utils.estimator_checks import choose_check_classifiers_labels from sklearn.utils.estimator_checks import NotAnArray from sklearn.utils.estimator_checks import enforce_estimator_tags_y @@ -94,3 +97,35 @@ def test_safe_indexing(): with pytest.warns(FutureWarning, match="removed in version 0.24"): safe_indexing([1, 2], 0) + + +# TODO: remove in 0.24 +def test_partial_dependence_no_shadowing(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from sklearn.inspection.partial_dependence import partial_dependence as _ # noqa + + # Calling all_estimators() also triggers a recursive import of all + # submodules, including deprecated ones. + all_estimators() + + from sklearn.inspection import partial_dependence + assert isinstance(partial_dependence, types.FunctionType) + + +# TODO: remove in 0.24 +def test_dict_learning_no_shadowing(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from sklearn.decomposition.dict_learning import dict_learning as _ # noqa + + # Calling all_estimators() also triggers a recursive import of all + # submodules, including deprecated ones. + all_estimators() + + from sklearn.decomposition import dict_learning + assert isinstance(dict_learning, types.FunctionType) From 2f86cdb6fe46052315137f7e8403fa70a6d20773 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 20 Dec 2019 01:23:42 +0100 Subject: [PATCH 130/448] Import sklearn._distributor_init first (#15929) --- sklearn/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 970d37480a370..7186df8e948ba 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -70,12 +70,18 @@ # We are not importing the rest of scikit-learn during the build # process, as it may not be compiled yet else: - from . import __check_build + # `_distributor_init` allows distributors to run custom init code. + # For instance, for the Windows wheel, this is used to pre-load the + # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs + # sub-folder. + # It is necessary to do this prior to importing show_versions as the + # later is linked to the OpenMP runtime to make it possible to introspect + # it and importing it first would fail if the OpenMP dll cannot be found. + from . import _distributor_init # noqa: F401 + from . import __check_build # noqa: F401 from .base import clone from .utils._show_versions import show_versions - __check_build # avoid flakes unused variable error - __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'experimental', 'externals', 'feature_extraction', @@ -90,9 +96,6 @@ 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] - # Allow distributors to run custom init code - from . 
import _distributor_init # noqa: F401 - def setup_module(module): """Fixture for the tests to assure globally controllable seeding of RNGs""" From 0e10b3a3cbeb0dcadbfc33bdd8b601ccfaf65353 Mon Sep 17 00:00:00 2001 From: Brian Wignall Date: Thu, 19 Dec 2019 19:26:02 -0500 Subject: [PATCH 131/448] DOC Fix typos, via a Levenshtein-style corrector (#15923) --- azure-pipelines.yml | 2 +- benchmarks/bench_plot_randomized_svd.py | 2 +- benchmarks/bench_text_vectorizers.py | 2 +- build_tools/azure/install.sh | 2 +- doc/developers/advanced_installation.rst | 2 +- doc/developers/contributing.rst | 2 +- doc/developers/maintainer.rst | 2 +- doc/glossary.rst | 2 +- doc/modules/clustering.rst | 2 +- doc/themes/scikit-learn-modern/javascript.html | 2 +- examples/manifold/plot_t_sne_perplexity.py | 2 +- examples/model_selection/plot_roc.py | 2 +- examples/plot_changed_only_pprint_parameter.py | 2 +- examples/plot_roc_curve_visualization_api.py | 2 +- sklearn/decomposition/_incremental_pca.py | 2 +- sklearn/decomposition/_lda.py | 2 +- sklearn/dummy.py | 2 +- sklearn/ensemble/_gb.py | 2 +- sklearn/ensemble/_hist_gradient_boosting/loss.py | 2 +- sklearn/ensemble/_stacking.py | 2 +- sklearn/ensemble/tests/test_weight_boosting.py | 2 +- sklearn/externals/joblib/numpy_pickle.py | 2 +- sklearn/metrics/_plot/precision_recall_curve.py | 2 +- sklearn/metrics/_regression.py | 2 +- sklearn/neighbors/_quad_tree.pyx | 2 +- sklearn/pipeline.py | 2 +- sklearn/svm/src/liblinear/liblinear_helper.c | 2 +- sklearn/svm/src/libsvm/svm.cpp | 4 ++-- sklearn/svm/src/libsvm/svm.h | 4 ++-- sklearn/tree/_classes.py | 3 ++- sklearn/utils/_testing.py | 2 +- sklearn/utils/deprecation.py | 2 +- sklearn/utils/graph_shortest_path.pyx | 2 +- 33 files changed, 36 insertions(+), 35 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bf0cd41e1d366..e2ff71802ce72 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,7 +74,7 @@ jobs: # It runs tests requiring pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # FIXME: pinned until SciPy wheels are available for Pyhon 3.8 + # FIXME: pinned until SciPy wheels are available for Python 3.8 PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index 0fe050eaf7e30..e322cda8e87e9 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -104,7 +104,7 @@ # in case the reconstructed (dense) matrix is too large MAX_MEMORY = np.int(2e9) -# The following datasets can be dowloaded manually from: +# The following datasets can be downloaded manually from: # CIFAR 10: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz # SVHN: http://ufldl.stanford.edu/housenumbers/train_32x32.mat CIFAR_FOLDER = "./cifar-10-batches-py/" diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py index 196e677e9b49c..96dbc04312291 100644 --- a/benchmarks/bench_text_vectorizers.py +++ b/benchmarks/bench_text_vectorizers.py @@ -32,7 +32,7 @@ def f(): text = fetch_20newsgroups(subset='train').data[:1000] print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n') -print("Using a subset of the 20 newsrgoups dataset ({} documents)." +print("Using a subset of the 20 newsgroups dataset ({} documents)." 
.format(len(text))) print("This benchmarks runs in ~1 min ...") diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index f4f60df8a9626..aa2707bb03837 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -13,7 +13,7 @@ make_conda() { version_ge() { # The two version numbers are separated with a new line is piped to sort # -rV. The -V activates for version number sorting and -r sorts in - # decending order. If the first argument is the top element of the sort, it + # descending order. If the first argument is the top element of the sort, it # is greater than or equal to the second argument. test "$(printf "${1}\n${2}" | sort -rV | head -n 1)" == "$1" } diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index c58eb14e828d2..d52dfba48c0e1 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -49,7 +49,7 @@ feature, code or documentation improvement). If you plan on submitting a pull-request, you should clone from your fork instead. -#. Install a compiler with OpenMP_ support for your platform. See intructions +#. Install a compiler with OpenMP_ support for your platform. See instructions for :ref:`compiler_windows`, :ref:`compiler_macos`, :ref:`compiler_linux` and :ref:`compiler_freebsd`. diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 6cca31e81813b..16adf4a607d90 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -377,7 +377,7 @@ complies with the following rules before marking a PR as ``[MRG]``. The methods available in scikit-learn. 10. New features often need to be illustrated with narrative documentation in - the user guide, with small code snipets. If relevant, please also add + the user guide, with small code snippets. If relevant, please also add references in the literature, with PDF links when possible. 11. The user guide should also include expected time and space complexity diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index e91f01999b12e..66d5250af1644 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -62,7 +62,7 @@ Making a release 2. On the branch for releasing, update the version number in sklearn/__init__.py, the ``__version__`` variable by removing ``dev*`` only when ready to release. - On master, increment the verson in the same place (when branching for + On master, increment the version in the same place (when branching for release). 3. Create the tag and push it:: diff --git a/doc/glossary.rst b/doc/glossary.rst index 3a01b76a45781..e259fa69745bc 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1161,7 +1161,7 @@ Methods TODO: `This gist `_ - higlights the use of the different formats for multilabel. + highlights the use of the different formats for multilabel. multioutput classification A list of 2d arrays, corresponding to each multiclass decision function. diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index ed79304fcbdee..5649c3f5237da 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -775,7 +775,7 @@ core sample, and is at least ``eps`` in distance from any core sample, is considered an outlier by the algorithm. 
While the parameter ``min_samples`` primarily controls how tolerant the -algorithm is towards noise (on noisy and large data sets it may be desiable +algorithm is towards noise (on noisy and large data sets it may be desirable to increase this parameter), the parameter ``eps`` is *crucial to choose appropriately* for the data set and distance function and usually cannot be left at the default value. It controls the local neighborhood of the points. diff --git a/doc/themes/scikit-learn-modern/javascript.html b/doc/themes/scikit-learn-modern/javascript.html index bdeab8abb9f42..fc0dca1040e03 100644 --- a/doc/themes/scikit-learn-modern/javascript.html +++ b/doc/themes/scikit-learn-modern/javascript.html @@ -114,7 +114,7 @@ prevScrollpos = lastScrollTop; }; - /*** high preformance scroll event listener***/ + /*** high performance scroll event listener***/ var raf = window.requestAnimationFrame || window.webkitRequestAnimationFrame || window.mozRequestAnimationFrame || diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py index cda936cf72142..dd7b4d1f21a09 100644 --- a/examples/manifold/plot_t_sne_perplexity.py +++ b/examples/manifold/plot_t_sne_perplexity.py @@ -6,7 +6,7 @@ An illustration of t-SNE on the two concentric circles and the S-curve datasets for different perplexity values. -We observe a tendency towards clearer shapes as the preplexity value increases. +We observe a tendency towards clearer shapes as the perplexity value increases. The size, the distance and the shape of clusters may vary upon initialization, perplexity values and does not always convey a meaning. diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index d995c5c653ce4..d32ab06f7bf25 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -151,7 +151,7 @@ # ......................................... # The :func:`sklearn.metrics.roc_auc_score` function can be used for # multi-class classification. The multi-class One-vs-One scheme compares every -# unique pairwise combination of classes. In this section, we calcuate the AUC +# unique pairwise combination of classes. In this section, we calculate the AUC # using the OvR and OvO schemes. We report a macro average, and a # prevalence-weighted average. y_prob = classifier.predict_proba(X_test) diff --git a/examples/plot_changed_only_pprint_parameter.py b/examples/plot_changed_only_pprint_parameter.py index 1a687cff046d8..a35471105b6c1 100644 --- a/examples/plot_changed_only_pprint_parameter.py +++ b/examples/plot_changed_only_pprint_parameter.py @@ -5,7 +5,7 @@ This example illustrates the use of the print_changed_only global parameter. -Setting print_changed_only to True will alterate the representation of +Setting print_changed_only to True will alternate the representation of estimators to only show the parameters that have been set to non-default values. This can be used to have more compact representations. """ diff --git a/examples/plot_roc_curve_visualization_api.py b/examples/plot_roc_curve_visualization_api.py index 55dec5649beeb..67592c12ec845 100644 --- a/examples/plot_roc_curve_visualization_api.py +++ b/examples/plot_roc_curve_visualization_api.py @@ -44,7 +44,7 @@ # We train a random forest classifier and create a plot comparing it to the SVC # ROC curve. Notice how `svc_disp` uses # :func:`~sklearn.metrics.RocCurveDisplay.plot` to plot the SVC ROC curve -# without recomputing the values of the roc curve itself. 
Futhermore, we +# without recomputing the values of the roc curve itself. Furthermore, we # pass `alpha=0.8` to the plot functions to adjust the alpha values of the # curves. rfc = RandomForestClassifier(n_estimators=10, random_state=42) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 9fc0936b880cc..fe7c57c61999a 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -270,7 +270,7 @@ def partial_fit(self, X, y=None, check_input=True): self.mean_ = .0 self.var_ = .0 - # Update stats - they are 0 if this is the fisrt step + # Update stats - they are 0 if this is the first step col_mean, col_var, n_total_samples = \ _incremental_mean_and_var( X, last_mean=self.mean_, last_variance=self.var_, diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index c10bad994d9cf..8fcb51896d190 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -193,7 +193,7 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): evaluate_every : int, optional (default=0) How often to evaluate perplexity. Only used in `fit` method. - set it to 0 or negative number to not evalute perplexity in + set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 04322f0fc3bd1..3c7d3286dd85b 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -50,7 +50,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): .. versionchanged:: 0.22 The default value of `strategy` will change to "prior" in version 0.24. Starting from version 0.22, a warning will be raised if - `strategy` is not explicity set. + `strategy` is not explicitly set. .. versionadded:: 0.17 Dummy Classifier now supports prior fitting strategy using diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 667a526e486a9..7b7c0e465e45d 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -604,7 +604,7 @@ def _make_estimator(self, append=True): raise NotImplementedError() def _raw_predict_init(self, X): - """Check input and compute raw predictions of the init estimtor.""" + """Check input and compute raw predictions of the init estimator.""" self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) if X.shape[1] != self.n_features_: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index bcfec023b5571..dae85f57134e4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -154,7 +154,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, class LeastAbsoluteDeviation(BaseLoss): - """Least asbolute deviation, for regression. + """Least absolute deviation, for regression. For a given sample x_i, the loss is defined as:: diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index c18291b9f4461..2fe284253ccc9 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -63,7 +63,7 @@ def _concatenate_predictions(self, X, predictions): and `self.passthrough` is True, the output of `transform` will be sparse. 
- This helper is in charge of ensuring the preditions are 2D arrays and + This helper is in charge of ensuring the predictions are 2D arrays and it will drop one of the probability column when using probabilities in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1) """ diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index d225f2c78c2b1..c71329be9ec71 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -526,7 +526,7 @@ def test_adaboostregressor_sample_weight(): X[-1] *= 10 y[-1] = 10000 - # random_state=0 ensure that the underlying boostrap will use the outlier + # random_state=0 ensure that the underlying bootstrap will use the outlier regr_no_outlier = AdaBoostRegressor( base_estimator=LinearRegression(), n_estimators=1, random_state=0 ) diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py index 7a4a2885c9f15..e79a0e1c5c056 100644 --- a/sklearn/externals/joblib/numpy_pickle.py +++ b/sklearn/externals/joblib/numpy_pickle.py @@ -1,3 +1,3 @@ -# Import necessary to preserve backward compatibliity of pickles +# Import necessary to preserve backward compatibility of pickles from joblib.numpy_pickle import * diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index d515b9aa86b1d..b16fc96e857cd 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -97,7 +97,7 @@ def plot(self, ax=None, name=None, **kwargs): def plot_precision_recall_curve(estimator, X, y, sample_weight=None, response_method="auto", name=None, ax=None, **kwargs): - """Plot Precision Recall Curve for binary classifers. + """Plot Precision Recall Curve for binary classifiers. Extra keyword arguments will be passed to matplotlib's `plot`. diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6c3c83a0c0c7c..7c115928a1340 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -801,7 +801,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): Gamma deviance is equivalent to the Tweedie deviance with the power parameter `p=2`. It is invariant to scaling of - the target variable, and mesures relative errors. + the target variable, and measures relative errors. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/neighbors/_quad_tree.pyx b/sklearn/neighbors/_quad_tree.pyx index af510c317d639..5623799124f7c 100644 --- a/sklearn/neighbors/_quad_tree.pyx +++ b/sklearn/neighbors/_quad_tree.pyx @@ -97,7 +97,7 @@ cdef class _QuadTree: return self._get_cell_ndarray()['is_leaf'][:self.cell_count] def build_tree(self, X): - """Build a tree from an arary of points X.""" + """Build a tree from an array of points X.""" cdef: int i DTYPE_t[3] pt diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 453ce2228d50d..af2feed1a861e 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -307,7 +307,7 @@ def _fit(self, X, y=None, **fit_params): cloned_transformer = clone(transformer) else: cloned_transformer = clone(transformer) - # Fit or load from cache the current transfomer + # Fit or load from cache the current transformer X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, y, None, message_clsname='Pipeline', diff --git a/sklearn/svm/src/liblinear/liblinear_helper.c b/sklearn/svm/src/liblinear/liblinear_helper.c index ffcbe86e01035..86d88e7da9273 100644 --- a/sklearn/svm/src/liblinear/liblinear_helper.c +++ b/sklearn/svm/src/liblinear/liblinear_helper.c @@ -172,7 +172,7 @@ struct problem * csr_set_problem (char *X, int double_precision_X, } -/* Create a paramater struct with and return it */ +/* Create a parameter struct with and return it */ struct parameter *set_parameter(int solver_type, double eps, double C, npy_intp nr_weight, char *weight_label, char *weight, int max_iter, unsigned seed, diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index 8bf3aa42ed488..9321340acaaed 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -923,7 +923,7 @@ int Solver::select_working_set(int &out_i, int &out_j) // return i,j such that // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) // j: minimizes the decrease of obj value - // (if quadratic coefficeint <= 0, replace it with tau) + // (if quadratic coefficient <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) double Gmax = -INF; @@ -1166,7 +1166,7 @@ int Solver_NU::select_working_set(int &out_i, int &out_j) // return i,j such that y_i = y_j and // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) // j: minimizes the decrease of obj value - // (if quadratic coefficeint <= 0, replace it with tau) + // (if quadratic coefficient <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) double Gmaxp = -INF; diff --git a/sklearn/svm/src/libsvm/svm.h b/sklearn/svm/src/libsvm/svm.h index 2187e3df2916f..4002a77c93ac4 100644 --- a/sklearn/svm/src/libsvm/svm.h +++ b/sklearn/svm/src/libsvm/svm.h @@ -79,7 +79,7 @@ struct svm_model int *sv_ind; /* index of support vectors */ double *rho; /* constants in decision functions (rho[k*(k-1)/2]) */ - double *probA; /* pariwise probability information */ + double *probA; /* pairwise probability information */ double *probB; /* for classification only */ @@ -104,7 +104,7 @@ struct svm_csr_model int *sv_ind; /* index of support vectors */ double *rho; /* constants in decision functions (rho[k*(k-1)/2]) */ - double *probA; /* pariwise probability information */ + double *probA; /* pairwise probability information */ double *probB; /* for classification only */ diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 9e45edd6bb063..e5b6b97922054 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -570,7 +570,8 @@ def feature_importances_(self): 
Returns ------- feature_importances_ : ndarray of shape (n_features,) - Normalized total reduction of critera by feature (Gini importance). + Normalized total reduction of criteria by feature + (Gini importance). """ check_is_fitted(self) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 707c1dbd1b82d..fc9acde7e7d84 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -450,7 +450,7 @@ def all_estimators(type_filter=None): ------- estimators : list of tuples List of (name, class), where ``name`` is the class name as string - and ``class`` is the actuall type of the class. + and ``class`` is the actual type of the class. """ def is_abstract(c): if not(hasattr(c, '__abstractmethods__')): diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index 2a33f34dfd2b8..c14968cafde32 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -114,7 +114,7 @@ def _update_doc(self, olddoc): def _is_deprecated(func): - """Helper to check if func is wraped by our deprecated decorator""" + """Helper to check if func is wrapped by our deprecated decorator""" closures = getattr(func, '__closure__', []) if closures is None: closures = [] diff --git a/sklearn/utils/graph_shortest_path.pyx b/sklearn/utils/graph_shortest_path.pyx index 30cbec1d5d471..7d2e74127f153 100644 --- a/sklearn/utils/graph_shortest_path.pyx +++ b/sklearn/utils/graph_shortest_path.pyx @@ -215,7 +215,7 @@ cdef np.ndarray dijkstra(dist_matrix, graph, &heap, nodes) else: #use the csr -> csc sparse matrix conversion to quickly get - # both directions of neigbors + # both directions of neighbors dist_matrix_T = dist_matrix.T.tocsr() distances2 = np.asarray(dist_matrix_T.data, From 06e9a3fe39e0f5bcd46c4b8a65323a53286f8374 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 20 Dec 2019 13:29:07 +1100 Subject: [PATCH 132/448] =?UTF-8?q?DOC=20in=20canned=20comment,=20mention?= =?UTF-8?q?=20that=20PR=20title=20becomes=20commit=20me=E2=80=A6=20(#15935?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/developers/tips.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index 4c313701b4aa6..ed049285c36f3 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -181,10 +181,10 @@ Issue/Comment: Linking to comments Please use links to comments, which make it a lot easier to see what you are referring to, rather than just linking to the issue. See [this](https://stackoverflow.com/questions/25163598/how-do-i-reference-a-specific-issue-comment-on-github) for more details. -PR-NEW: Better description +PR-NEW: Better description and title :: - Thanks for the pull request! Please make the title of the PR descriptive so that we can easily recall the issue it is resolving. You should state what issue (or PR) it fixes/resolves in the description (see [here](http://scikit-learn.org/dev/developers/contributing.html#contributing-pull-requests)). + Thanks for the pull request! Please make the title of the PR more descriptive. The title will become the commit message when this is merged. You should state what issue (or PR) it fixes/resolves in the description using the syntax described [here](http://scikit-learn.org/dev/developers/contributing.html#contributing-pull-requests). 
PR-NEW: Fix # :: From 613a3338f37a32577573055fdb5178359b50e9ce Mon Sep 17 00:00:00 2001 From: Ritchie Ng Date: Fri, 20 Dec 2019 16:34:58 +0800 Subject: [PATCH 133/448] DOC/EXA Correct spelling of "Classification" (#15938) --- examples/linear_model/plot_sparse_logistic_regression_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 56b5457c6a27e..ab3749fb5e7f8 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -1,6 +1,6 @@ """ ===================================================== -MNIST classfification using multinomial logistic + L1 +MNIST classification using multinomial logistic + L1 ===================================================== Here we fit a multinomial logistic regression with L1 penalty on a subset of From a5542e9490c63e44e5611583b5e8764149b230c8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Dec 2019 17:11:21 +0100 Subject: [PATCH 134/448] BUG fix pip3 ubuntu update by suffixing file (#15928) --- sklearn/_build_utils/deprecated_modules.py | 2 +- sklearn/datasets/__init__.py | 6 +++--- sklearn/datasets/_rcv1.py | 2 +- .../{_svmlight_format.py => _svmlight_format_io.py} | 6 ++++-- sklearn/tests/test_common.py | 2 +- sklearn/tests/test_docstring_parameters.py | 2 +- sklearn/utils/_testing.py | 2 +- 7 files changed, 12 insertions(+), 10 deletions(-) rename sklearn/datasets/{_svmlight_format.py => _svmlight_format_io.py} (99%) diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 5f11ac1714022..045dc3d297be0 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -133,7 +133,7 @@ 'sklearn.datasets', 'make_classification'), ('_species_distributions', 'sklearn.datasets.species_distributions', 'sklearn.datasets', 'fetch_species_distributions'), - ('_svmlight_format', 'sklearn.datasets.svmlight_format', + ('_svmlight_format_io', 'sklearn.datasets.svmlight_format', 'sklearn.datasets', 'load_svmlight_file'), ('_twenty_newsgroups', 'sklearn.datasets.twenty_newsgroups', 'sklearn.datasets', 'strip_newsgroup_header'), diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 2377de2bfd189..e7c93bb180567 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -42,9 +42,9 @@ from ._samples_generator import make_gaussian_quantiles from ._samples_generator import make_biclusters from ._samples_generator import make_checkerboard -from ._svmlight_format import load_svmlight_file -from ._svmlight_format import load_svmlight_files -from ._svmlight_format import dump_svmlight_file +from ._svmlight_format_io import load_svmlight_file +from ._svmlight_format_io import load_svmlight_files +from ._svmlight_format_io import dump_svmlight_file from ._olivetti_faces import fetch_olivetti_faces from ._species_distributions import fetch_species_distributions from ._california_housing import fetch_california_housing diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 53edc9a3407d8..887a8271eae5e 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -23,7 +23,7 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ._base import _refresh_cache -from ._svmlight_format import load_svmlight_files +from ._svmlight_format_io import 
load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch diff --git a/sklearn/datasets/_svmlight_format.py b/sklearn/datasets/_svmlight_format_io.py similarity index 99% rename from sklearn/datasets/_svmlight_format.py rename to sklearn/datasets/_svmlight_format_io.py index d344b310be995..91bb35ff2ec75 100644 --- a/sklearn/datasets/_svmlight_format.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -453,8 +453,10 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, Xval = check_array(X, accept_sparse='csr') if Xval.shape[0] != yval.shape[0]: - raise ValueError("X.shape[0] and y.shape[0] should be the same, got" - " %r and %r instead." % (Xval.shape[0], yval.shape[0])) + raise ValueError( + "X.shape[0] and y.shape[0] should be the same, got" + " %r and %r instead." % (Xval.shape[0], yval.shape[0]) + ) # We had some issues with CSR matrices with unsorted indices (e.g. #1501), # so sort them here, but first make sure we don't modify the user's X. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 9fc3075c5fe28..08388289bd043 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -179,7 +179,7 @@ def test_import_all_consistency(): for modname in submods + ['sklearn']: if ".tests." in modname: continue - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue package = __import__(modname, fromlist="dummy") diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 31c7d268737b9..28af419195813 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -140,7 +140,7 @@ def test_tabs(): for importer, modname, ispkg in walk_packages(sklearn.__path__, prefix='sklearn.'): - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index fc9acde7e7d84..b4a747b1df7ce 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -466,7 +466,7 @@ def is_abstract(c): path=path, prefix='sklearn.', onerror=lambda x: None): if ".tests." in modname or "externals" in modname: continue - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue # Ignore deprecation warnings triggered at import time. From 9626583e8feacb1ea8394f6fe8e6f37fee2d0ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 20 Dec 2019 17:14:07 +0100 Subject: [PATCH 135/448] ENH Improve column_or_1d error message (#15926) --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/metrics/tests/test_classification.py | 2 +- sklearn/preprocessing/tests/test_label.py | 4 ++-- sklearn/utils/validation.py | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 30cf098eee768..cc7d334b9519a 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -106,3 +106,9 @@ Changelog - |Fix| :func:`tree.plot_tree` `rotate` parameter was unused and has been deprecated. :pr:`15806` by :user:`Chiara Marmo `. + +:mod:`sklearn.utils` +.................... + +- |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. + :pr:`15926` by :user:`Loïc Estève `. 
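As a minimal sketch of the situation this entry targets (illustrative only;
the toy array is arbitrary and not part of the patch), the improved message is
raised when a 2d array is passed where 1d labels are expected::

    import numpy as np
    from sklearn.utils.validation import column_or_1d

    y = np.ones((5, 2))
    try:
        column_or_1d(y)
    except ValueError as exc:
        # After this change the message names the offending shape, e.g. (5, 2)
        print(exc)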
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index c33c3a829cc16..197749d0ff2dd 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -486,7 +486,7 @@ def test_multilabel_confusion_matrix_errors(): # Bad sample_weight with pytest.raises(ValueError, match="inconsistent numbers of samples"): multilabel_confusion_matrix(y_true, y_pred, sample_weight=[1, 2]) - with pytest.raises(ValueError, match="bad input shape"): + with pytest.raises(ValueError, match="should be a 1d array"): multilabel_confusion_matrix(y_true, y_pred, sample_weight=[[1, 2, 3], [2, 3, 4], diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 69c025fcc76e3..6cdb198182a20 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -222,7 +222,7 @@ def test_label_encoder_negative_ints(): def test_label_encoder_str_bad_shape(dtype): le = LabelEncoder() le.fit(np.array(["apple", "orange"], dtype=dtype)) - msg = "bad input shape" + msg = "should be a 1d array" with pytest.raises(ValueError, match=msg): le.transform("apple") @@ -245,7 +245,7 @@ def test_label_encoder_errors(): le.inverse_transform([-2, -3, -4]) # Fail on inverse_transform("") - msg = "bad input shape ()" + msg = r"should be a 1d array.+shape \(\)" with pytest.raises(ValueError, match=msg): le.inverse_transform("") diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5502fdd534965..e08495de30af5 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -743,7 +743,9 @@ def column_or_1d(y, warn=False): DataConversionWarning, stacklevel=2) return np.ravel(y) - raise ValueError("bad input shape {0}".format(shape)) + raise ValueError( + "y should be a 1d array, " + "got an array of shape {} instead.".format(shape)) def check_random_state(seed): From 1b55e2f5cae77c50d0205a7ddfbce0fb2425de26 Mon Sep 17 00:00:00 2001 From: inderjeet <43402782+inder128@users.noreply.github.com> Date: Fri, 20 Dec 2019 22:05:59 +0530 Subject: [PATCH 136/448] [MRG] Ways to compute center_shift_total were different in "full" and "elkan" algorithms. (#15930) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/cluster/_k_means_elkan.pyx | 4 ++-- sklearn/cluster/tests/test_k_means.py | 15 +++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ae9cbbd74e313..068ef5ed5ecab 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,6 +15,13 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- +:mod:`sklearn.cluster` +...................... + +- |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping + criterion as with the default ``algorithm="full"``. :pr:`15930` by + :user:`inder128`. + :mod:`sklearn.metrics` ...................... 
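The behaviour targeted by this entry can be sketched as follows (illustrative
only; the blob data and tolerance are arbitrary and not part of the patch):
with the same ``tol``, the "elkan" and "full" algorithms now stop on the same
criterion, so their results should closely agree::

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1000, centers=5, random_state=0)
    km_full = KMeans(n_clusters=5, algorithm="full", tol=1e-4,
                     n_init=1, random_state=0).fit(X)
    km_elkan = KMeans(n_clusters=5, algorithm="elkan", tol=1e-4,
                      n_init=1, random_state=0).fit(X)
    # After the fix, iteration counts and inertia are expected to be close.
    print(km_full.n_iter_, km_elkan.n_iter_)
    print(km_full.inertia_, km_elkan.inertia_)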
diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index abf2ea8aeac8d..87d32d1e47858 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -246,8 +246,8 @@ def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, print('Iteration %i, inertia %s' % (iteration, np.sum((X_ - centers_[labels]) ** 2 * sample_weight[:,np.newaxis]))) - center_shift_total = np.sum(center_shift) - if center_shift_total ** 2 < tol: + center_shift_total = np.sum(center_shift ** 2) + if center_shift_total < tol: if verbose: print("center shift %e within tolerance %e" % (center_shift_total, tol)) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 50c91382d3117..5a6125d28ced5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -69,17 +69,19 @@ def test_kmeans_results(representation, algo, dtype): @pytest.mark.parametrize('distribution', ['normal', 'blobs']) -def test_elkan_results(distribution): +@pytest.mark.parametrize('tol', [1e-2, 1e-4, 1e-8]) +def test_elkan_results(distribution, tol): # check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(0) if distribution == 'normal': - X = rnd.normal(size=(50, 10)) + X = rnd.normal(size=(5000, 10)) else: X, _ = make_blobs(random_state=rnd) - km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1) + km_full = KMeans(algorithm='full', n_clusters=5, + random_state=0, n_init=1, tol=tol) km_elkan = KMeans(algorithm='elkan', n_clusters=5, - random_state=0, n_init=1) + random_state=0, n_init=1, tol=tol) km_full.fit(X) km_elkan.fit(X) @@ -87,6 +89,11 @@ def test_elkan_results(distribution): km_full.cluster_centers_) assert_array_equal(km_elkan.labels_, km_full.labels_) + # The number of iterations and inertia should be close but not + # necessarily exactly the same because of rounding errors. 
+ assert km_elkan.n_iter_ == pytest.approx(km_full.n_iter_, rel=0.01) + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold From 3a5aced32b72bc742217fb183c6fc75440cbff28 Mon Sep 17 00:00:00 2001 From: scibol Date: Fri, 20 Dec 2019 18:43:30 +0100 Subject: [PATCH 137/448] TST Fixes integer test for train and test indices (#15941) --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index eb5eb192a6921..253593968ad24 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -175,7 +175,7 @@ def test_cross_validator_with_default_params(): # Test that train, test indices returned are integers for train, test in cv.split(X, y, groups): assert np.asarray(train).dtype.kind == 'i' - assert np.asarray(train).dtype.kind == 'i' + assert np.asarray(test).dtype.kind == 'i' # Test if the repr works without any errors assert cv_repr == repr(cv) From a68ba97309512f691feb25e6a07af7561fc56e3b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Dec 2019 19:14:49 +0100 Subject: [PATCH 138/448] BUG ensure that parallel/sequential give the same permutation importances (#15933) --- doc/whats_new/v0.22.rst | 13 ++ sklearn/inspection/_permutation_importance.py | 61 ++++----- .../tests/test_permutation_importance.py | 126 ++++++++++++++++++ 3 files changed, 168 insertions(+), 32 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 068ef5ed5ecab..be5688d3a32ae 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,12 +15,25 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- + :mod:`sklearn.cluster` ...................... - |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping criterion as with the default ``algorithm="full"``. :pr:`15930` by :user:`inder128`. + +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`inspection.permutation_importance` will return the same + `importances` when a `random_state` is given for both `n_jobs=1` or + `n_jobs>1` both with shared memory backends (thread-safety) and + isolated memory, process-based backends. + Also avoid casting the data as object dtype and avoid read-only error + on large dataframes with `n_jobs>1` as reported in :issue:`15810`. + Follow-up of :pr:`15898` by :user:`Shivam Gargsya `. + :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_. :mod:`sklearn.metrics` ...................... 
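A minimal sketch of the property restored by this fix (illustrative only; the
regression data and estimator are arbitrary and mirror the test added below):
with a fixed ``random_state``, the importances should not depend on ``n_jobs``::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.inspection import permutation_importance
    from sklearn.linear_model import LinearRegression

    X, y = make_regression(n_samples=200, n_features=5, random_state=0)
    lr = LinearRegression().fit(X, y)

    r_seq = permutation_importance(lr, X, y, n_repeats=5,
                                   random_state=0, n_jobs=1)
    r_par = permutation_importance(lr, X, y, n_repeats=5,
                                   random_state=0, n_jobs=2)
    # Sequential and parallel runs now shuffle columns identically.
    np.testing.assert_allclose(r_seq.importances, r_par.importances)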
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index d71d5fd3f3a68..80bf4d2e2a62c 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -4,41 +4,36 @@ from joblib import delayed from ..metrics import check_scoring +from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array -from ..utils import Bunch - - -def _safe_column_setting(X, col_idx, values): - """Set column on X using `col_idx`""" - if hasattr(X, "iloc"): - X.iloc[:, col_idx] = values - else: - X[:, col_idx] = values - - -def _safe_column_indexing(X, col_idx): - """Return column from X using `col_idx`""" - if hasattr(X, "iloc"): - return X.iloc[:, col_idx].values - else: - return X[:, col_idx] def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, n_repeats, scorer): """Calculate score when `col_idx` is permuted.""" - original_feature = _safe_column_indexing(X, col_idx).copy() - temp = original_feature.copy() + random_state = check_random_state(random_state) + # Work on a copy of X to to ensure thread-safety in case of threading based + # parallelism. Furthermore, making a copy is also useful when the joblib + # backend is 'loky' (default) or the old 'multiprocessing': in those cases, + # if X is large it will be automatically be backed by a readonly memory map + # (memmap). X.copy() on the other hand is always guaranteed to return a + # writable data-structure whose columns can be shuffled inplace. + X_permuted = X.copy() scores = np.zeros(n_repeats) + shuffling_idx = np.arange(X.shape[0]) for n_round in range(n_repeats): - random_state.shuffle(temp) - _safe_column_setting(X, col_idx, temp) - feature_score = scorer(estimator, X, y) + random_state.shuffle(shuffling_idx) + if hasattr(X_permuted, "iloc"): + col = X_permuted.iloc[shuffling_idx, col_idx] + col.index = X_permuted.index + X_permuted.iloc[:, col_idx] = col + else: + X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] + feature_score = scorer(estimator, X_permuted, y) scores[n_round] = feature_score - _safe_column_setting(X, col_idx, original_feature) return scores @@ -104,20 +99,22 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. https://doi.org/10.1023/A:1010933404324 """ - if hasattr(X, "iloc"): - X = X.copy() # Dataframe - else: - X = check_array(X, force_all_finite='allow-nan', dtype=np.object, - copy=True) - + if not hasattr(X, "iloc"): + X = check_array(X, force_all_finite='allow-nan', dtype=None) + + # Precompute random seed from the random state to be used + # to get a fresh independent RandomState instance for each + # parallel call to _calculate_permutation_scores, irrespective of + # the fact that variables are shared or not depending on the active + # joblib backend (sequential, thread-based or process-based). 
random_state = check_random_state(random_state) - scorer = check_scoring(estimator, scoring=scoring) + random_seed = random_state.randint(np.iinfo(np.int32).max + 1) + scorer = check_scoring(estimator, scoring=scoring) baseline_score = scorer(estimator, X, y) - scores = np.zeros((X.shape[1], n_repeats)) scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)( - estimator, X, y, col_idx, random_state, n_repeats, scorer + estimator, X, y, col_idx, random_seed, n_repeats, scorer ) for col_idx in range(X.shape[1])) importances = baseline_score - np.array(scores) diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index 671a1e11b1fec..2a31a031f2938 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -6,7 +6,9 @@ from sklearn.compose import ColumnTransformer from sklearn.datasets import load_boston from sklearn.datasets import load_iris +from sklearn.datasets import make_classification from sklearn.datasets import make_regression +from sklearn.dummy import DummyClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression @@ -14,9 +16,13 @@ from sklearn.impute import SimpleImputer from sklearn.inspection import permutation_importance from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import scale +from sklearn.utils import parallel_backend +from sklearn.utils._testing import _convert_container + @pytest.mark.parametrize("n_jobs", [1, 2]) def test_permutation_importance_correlated_feature_regression(n_jobs): @@ -150,3 +156,123 @@ def test_permutation_importance_linear_regresssion(): scoring='neg_mean_squared_error') assert_allclose(expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6) + + +def test_permutation_importance_equivalence_sequential_parallel(): + # regression test to make sure that sequential and parallel calls will + # output the same results. 
+ X, y = make_regression(n_samples=500, n_features=10, random_state=0) + lr = LinearRegression().fit(X, y) + + importance_sequential = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=1 + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_sequential['importances'].min() + imp_max = importance_sequential['importances'].max() + assert imp_max - imp_min > 0.3 + + # The actually check that parallelism does not impact the results + # either with shared memory (threading) or without isolated memory + # via process-based parallelism using the default backend + # ('loky' or 'multiprocessing') depending on the joblib version: + + # process-based parallelism (by default): + importance_processes = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2) + assert_allclose( + importance_processes['importances'], + importance_sequential['importances'] + ) + + # thread-based parallelism: + with parallel_backend("threading"): + importance_threading = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2 + ) + assert_allclose( + importance_threading['importances'], + importance_sequential['importances'] + ) + + +@pytest.mark.parametrize("n_jobs", [None, 1, 2]) +def test_permutation_importance_equivalence_array_dataframe(n_jobs): + # This test checks that the column shuffling logic has the same behavior + # both a dataframe and a simple numpy array. + pd = pytest.importorskip('pandas') + + # regression test to make sure that sequential and parallel calls will + # output the same results. + X, y = make_regression(n_samples=100, n_features=5, random_state=0) + X_df = pd.DataFrame(X) + + # Add a categorical feature that is statistically linked to y: + binner = KBinsDiscretizer(n_bins=3, encode="ordinal") + cat_column = binner.fit_transform(y.reshape(-1, 1)) + + # Concatenate the extra column to the numpy array: integers will be + # cast to float values + X = np.hstack([X, cat_column]) + assert X.dtype.kind == "f" + + # Insert extra column as a non-numpy-native dtype (while keeping backward + # compat for old pandas versions): + if hasattr(pd, "Categorical"): + cat_column = pd.Categorical(cat_column.ravel()) + else: + cat_column = cat_column.ravel() + new_col_idx = len(X_df.columns) + X_df[new_col_idx] = cat_column + assert X_df[new_col_idx].dtype == cat_column.dtype + + # Stich an aribtrary index to the dataframe: + X_df.index = np.arange(len(X_df)).astype(str) + + rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0) + rf.fit(X, y) + + n_repeats = 3 + importance_array = permutation_importance( + rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_array['importances'].min() + imp_max = importance_array['importances'].max() + assert imp_max - imp_min > 0.3 + + # Now check that importances computed on dataframe matche the values + # of those computed on the array with the same data. 
+ importance_dataframe = permutation_importance( + rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs + ) + assert_allclose( + importance_array['importances'], + importance_dataframe['importances'] + ) + + +@pytest.mark.parametrize("input_type", ["array", "dataframe"]) +def test_permutation_importance_large_memmaped_data(input_type): + # Smoke, non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15810 + n_samples, n_features = int(5e4), 4 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + random_state=0) + assert X.nbytes > 1e6 # trigger joblib memmaping + + X = _convert_container(X, input_type) + clf = DummyClassifier(strategy='prior').fit(X, y) + + # Actual smoke test: should not raise any error: + n_repeats = 5 + r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2) + + # Auxiliary check: DummyClassifier is feature independent: + # permutating feature should not change the predictions + expected_importances = np.zeros((n_features, n_repeats)) + assert_allclose(expected_importances, r.importances) From fa4646749ce47cf4fe8d15575c448948b5625209 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 21 Dec 2019 00:56:38 +0100 Subject: [PATCH 139/448] Formatting fixes in changelog (#15944) --- doc/whats_new/v0.22.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index be5688d3a32ae..00c957448550f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -19,10 +19,10 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping - criterion as with the default ``algorithm="full"``. :pr:`15930` by +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now uses the same + stopping criterion as with the default ``algorithm="full"``. :pr:`15930` by :user:`inder128`. - + :mod:`sklearn.inspection` ......................... @@ -51,7 +51,7 @@ Changelog :mod:`sklearn.inspection` ......................... -- |Fix| :func:`inspection.plot_partial_dependence` and +- |Fix| :func:`inspection.plot_partial_dependence` and :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks the number of axes passed in. :pr:`15760` by `Thomas Fan`_. From 0c0b6dad69eed6bf42e98ef5c865718ec49bb932 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Sat, 21 Dec 2019 22:00:37 +0530 Subject: [PATCH 140/448] MRG FIX: order of values of self.quantiles_ in QuantileTransformer (#15751) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/preprocessing/_data.py | 10 ++++++++++ sklearn/preprocessing/tests/test_data.py | 21 +++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 00c957448550f..d1cfe0d42239c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -817,6 +817,10 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.model_selection` .............................. 
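A minimal sketch of the ``quantiles_`` guarantee described above, assuming a build that includes this fix: after fitting on data with many tied values, the learned quantile thresholds are non-decreasing::

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    # Repeated values of this kind used to expose the floating-point
    # issue in np.nanpercentile (numpy issue 14685).
    X = 0.1 * np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5,
                        1, 1, 9, 9, 9, 8, 8, 7] * 10)
    X = X.reshape(-1, 1)

    qt = QuantileTransformer(n_quantiles=100).fit(X)

    # With the fix, the estimated quantile thresholds are monotonic.
    assert np.all(np.diff(qt.quantiles_[:, 0]) >= 0)
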
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 9227efa958b43..b047908842b38 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,6 +2262,11 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. @@ -2305,6 +2310,11 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def fit(self, X, y=None): """Compute the quantiles used for transforming. diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 9a8e31d468f1c..cdff446cb336c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -25,6 +25,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import _convert_container from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -1532,6 +1533,26 @@ def test_quantile_transform_nan(): assert not np.isnan(transformer.quantiles_[:, 1:]).any() +@pytest.mark.parametrize("array_type", ['array', 'sparse']) +def test_quantile_transformer_sorted_quantiles(array_type): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15733 + # Taken from upstream bug report: + # https://github.com/numpy/numpy/issues/14685 + X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) + X = 0.1 * X.reshape(-1, 1) + X = _convert_container(X, array_type) + + n_quantiles = 100 + qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) + + # Check that the estimated quantile threasholds are monotically + # increasing: + quantiles = qt.quantiles_[:, 0] + assert len(quantiles) == 100 + assert all(np.diff(quantiles) >= 0) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), From b42125bec308cccf4c911db65123870b04c08158 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 21 Dec 2019 12:07:54 -0500 Subject: [PATCH 141/448] [MRG] BUG Fixes constrast in plot_confusion_matrix (#15936) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/metrics/_plot/confusion_matrix.py | 2 +- .../_plot/tests/test_plot_confusion_matrix.py | 18 ++++++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index d1cfe0d42239c..f7b5fda459a09 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -42,6 +42,10 @@ Changelog is invalid. Previously, it runs fine with no normalization. :pr:`15888` by `Hanmin Qin`_. 
+- |Fix| :func:`metrics.plot_confusion_matrix` now colors the label color + correctly to maximize contrast with its background. :pr:`15936` by + `Thomas Fan`_ and :user:`DizietAsahi`. + :mod:`sklearn.utils` .................... diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 1c3fc2715ffb3..f759c4d5c1c3d 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -93,7 +93,7 @@ def plot(self, include_values=True, cmap='viridis', values_format = '.2g' # print text with appropriate color depending on background - thresh = (cm.max() - cm.min()) / 2. + thresh = (cm.max() + cm.min()) / 2.0 for i, j in product(range(n_classes), range(n_classes)): color = cmap_max if cm[i, j] < thresh else cmap_min self.text_[i, j] = ax.text(j, i, diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index dbd515f3527cc..2d53e6bf24dc0 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -200,7 +200,7 @@ def test_confusion_matrix_contrast(pyplot): assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) - # oof-diagonal text is white + # off-diagonal text is white assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) @@ -209,10 +209,24 @@ def test_confusion_matrix_contrast(pyplot): assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) - # oof-diagonal text is black + # off-diagonal text is black assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + # Regression test for #15920 + cm = np.array([[19, 34], [32, 58]]) + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.Blues) + min_color = pyplot.cm.Blues(0) + max_color = pyplot.cm.Blues(255) + assert_allclose(disp.text_[0, 0].get_color(), max_color) + assert_allclose(disp.text_[0, 1].get_color(), max_color) + assert_allclose(disp.text_[1, 0].get_color(), max_color) + assert_allclose(disp.text_[1, 1].get_color(), min_color) + + + @pytest.mark.parametrize( "clf", [LogisticRegression(), From dfad061da209869716d4113987db6a655f5c5c3b Mon Sep 17 00:00:00 2001 From: Bibhash Chandra Mitra Date: Sat, 21 Dec 2019 22:49:50 +0530 Subject: [PATCH 142/448] BUG use zero_division argument in classification_report (#15879) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/metrics/_classification.py | 3 ++- sklearn/metrics/tests/test_classification.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f7b5fda459a09..e5cf5a29a8d52 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -46,6 +46,10 @@ Changelog correctly to maximize contrast with its background. :pr:`15936` by `Thomas Fan`_ and :user:`DizietAsahi`. +- |Fix| :func:`metrics.classification_report` does no longer ignore the + value of the ``zero_division`` keyword argument. :pr:`15879` + by :user:`Bibhash Chandra Mitra `. + :mod:`sklearn.utils` .................... 
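For illustration, a small usage sketch of the ``zero_division`` fix described above, assuming a release that includes this patch: the argument is now also forwarded to the averaged rows of the report::

    from sklearn.metrics import classification_report

    y_true = ["a", "b", "c"]
    y_pred = ["a", "b", "d"]

    # "c" is never predicted and "d" never occurs in y_true, so some
    # per-class precision/recall values are undefined. zero_division=0
    # replaces them with 0 and suppresses the UndefinedMetricWarning,
    # and with this fix the macro/weighted averages honour it as well.
    print(classification_report(y_true, y_pred, zero_division=0))
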
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 343e63b6c0ae9..cba7f2c2e8fc8 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1964,7 +1964,8 @@ class 2 1.00 0.67 0.80 3 # compute averages with specified averaging method avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( y_true, y_pred, labels=labels, - average=average, sample_weight=sample_weight) + average=average, sample_weight=sample_weight, + zero_division=zero_division) avg = [avg_p, avg_r, avg_f1, np.sum(s)] if output_dict: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 197749d0ff2dd..947ca047438d8 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message @@ -154,6 +153,22 @@ def test_classification_report_dictionary_output(): assert type(expected_report['macro avg']['support']) == int +@pytest.mark.parametrize('zero_division', ["warn", 0, 1]) +def test_classification_report_zero_division_warning(zero_division): + y_true, y_pred = ["a", "b", "c"], ["a", "b", "d"] + with warnings.catch_warnings(record=True) as record: + classification_report( + y_true, y_pred, zero_division=zero_division, output_dict=True) + if zero_division == "warn": + assert len(record) > 1 + for item in record: + msg = ("Use `zero_division` parameter to control this " + "behavior.") + assert msg in str(item.message) + else: + assert not record + + def test_multilabel_accuracy_score_subset_accuracy(): # Dense label indicator matrix format y1 = np.array([[0, 1, 1], [1, 0, 1]]) From 0abd48b9c32076d706649d38f81d115bd133948c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 23 Dec 2019 10:22:55 +0100 Subject: [PATCH 143/448] MAINT Unpin coverage package (#15957) --- build_tools/azure/install.cmd | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 387b555af62f5..2566ba4f4f3aa 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -25,11 +25,7 @@ IF "%PYTHON_ARCH%"=="64" ( pip install numpy scipy cython pytest wheel pillow joblib ) if "%COVERAGE%" == "true" ( - @rem Using coverage 5.0 will trigger relpath between 2 windows - @rem paths from different drives. Pinning can be removed when - @rem https://github.com/scikit-learn/scikit-learn/issues/15908 - @rem is resolved. 
- pip install coverage==4.5.3 codecov pytest-cov + pip install coverage codecov pytest-cov ) python --version pip --version From f65cdf2d5e5f2b753393190598f9c9f63f34dbe1 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Mon, 23 Dec 2019 11:29:47 +0100 Subject: [PATCH 144/448] DOC change logreg solver in plot_logistic_path (#15927) --- examples/linear_model/plot_logistic_path.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index 79b5522575eb0..7aead065f3445 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -14,7 +14,7 @@ coefficients are exactly 0. When regularization gets progressively looser, coefficients can get non-zero values one after the other. -Here we choose the SAGA solver because it can efficiently optimize for the +Here we choose the liblinear solver because it can efficiently optimize for the Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. Also note that we set a low value for the tolerance to make sure that the model @@ -55,9 +55,10 @@ print("Computing regularization path ...") start = time() -clf = linear_model.LogisticRegression(penalty='l1', solver='saga', +clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear', tol=1e-6, max_iter=int(1e6), - warm_start=True) + warm_start=True, + intercept_scaling=10000.) coefs_ = [] for c in cs: clf.set_params(C=c) From 7917187279fc972e14e9f764819dc02e104c3069 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 17:20:41 +0100 Subject: [PATCH 145/448] DOC fix whats new ordering (#15961) --- doc/whats_new/v0.22.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e5cf5a29a8d52..370f02fda49f9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -35,6 +35,10 @@ Changelog Follow-up of :pr:`15898` by :user:`Shivam Gargsya `. :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_. +- |Fix| :func:`inspection.plot_partial_dependence` and + :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks + the number of axes passed in. :pr:`15760` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... @@ -56,13 +60,6 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. -:mod:`sklearn.inspection` -......................... - -- |Fix| :func:`inspection.plot_partial_dependence` and - :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks - the number of axes passed in. :pr:`15760` by `Thomas Fan`_. - .. _changes_0_22: Version 0.22.0 From 88963f86fb63a13c0f6fa91061279a43a7f4b485 Mon Sep 17 00:00:00 2001 From: lkubin Date: Mon, 23 Dec 2019 23:34:49 +0100 Subject: [PATCH 146/448] ENH Make cross_val_predict support method="predict_proba" and y=None (#15918) --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/model_selection/_validation.py | 8 ++++++-- sklearn/model_selection/tests/test_validation.py | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index cc7d334b9519a..babc557d9aaa0 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -94,6 +94,10 @@ Changelog type and details. :pr:`15622` by :user:`Gregory Morse `. 
+- |Fix| :func: `cross_val_predict` supports `method="predict_proba"` + when `y=None`. + :pr: `15918` by :user: `Luca Kubin `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 67a30c6416031..2fe4fcd7ff392 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -734,7 +734,7 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, # If classification methods produce multiple columns of output, # we need to manually encode classes to ensure consistent column ordering. encode = method in ['decision_function', 'predict_proba', - 'predict_log_proba'] + 'predict_log_proba'] and y is not None if encode: y = np.asarray(y) if y.ndim == 1: @@ -842,7 +842,11 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - if method in ['decision_function', 'predict_proba', 'predict_log_proba']: + + encode = method in ['decision_function', 'predict_proba', + 'predict_log_proba'] and y is not None + + if encode: if isinstance(predictions, list): predictions = [_enforce_prediction_order( estimator.classes_[i_label], predictions[i_label], diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index d1c67930fac77..67b66b6a91431 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -214,6 +214,9 @@ def predict(self, T): T = T.reshape(len(T), -1) return T[:, 0] + def predict_proba(self, T): + return T + def score(self, X=None, Y=None): return 1. / (1 + np.abs(self.a)) @@ -972,6 +975,19 @@ def test_cross_val_predict_unbalanced(): decimal=12) +def test_cross_val_predict_y_none(): + # ensure that cross_val_predict works when y is None + mock_classifier = MockClassifier() + rng = np.random.RandomState(42) + X = rng.rand(100, 10) + y_hat = cross_val_predict(mock_classifier, X, y=None, cv=5, + method='predict') + assert_allclose(X[:, 0], y_hat) + y_hat_proba = cross_val_predict(mock_classifier, X, y=None, cv=5, + method='predict_proba') + assert_allclose(X, y_hat_proba) + + def test_cross_val_score_sparse_fit_params(): iris = load_iris() X, y = iris.data, iris.target From 9408203ac43f69f51ec068d4ae7a721761e94c9d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 09:41:21 +0100 Subject: [PATCH 147/448] COSMIT use np.iinfo to define the max int32 (#15960) --- sklearn/__init__.py | 2 +- sklearn/ensemble/_base.py | 4 +--- sklearn/feature_extraction/_hash.py | 2 +- sklearn/feature_extraction/_hashing_fast.pyx | 2 +- sklearn/feature_extraction/text.py | 2 +- sklearn/tree/_classes.py | 2 +- 6 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 7186df8e948ba..59aa672533524 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -106,7 +106,7 @@ def setup_module(module): # Check if a random seed exists in the environment, if not create one. 
_random_seed = os.environ.get('SKLEARN_SEED', None) if _random_seed is None: - _random_seed = np.random.uniform() * (2 ** 31 - 1) + _random_seed = np.random.uniform() * np.iinfo(np.int32).max _random_seed = int(_random_seed) print("I: Seeding RNGs with %r" % _random_seed) np.random.seed(_random_seed) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 5db30b9bbc600..9c6d8cbce0206 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -19,8 +19,6 @@ from ..utils import check_random_state from ..utils.metaestimators import _BaseComposition -MAX_RAND_SEED = np.iinfo(np.int32).max - def _parallel_fit_estimator(estimator, X, y, sample_weight=None): """Private function used to fit an estimator within a job.""" @@ -71,7 +69,7 @@ def _set_random_states(estimator, random_state=None): to_set = {} for key in sorted(estimator.get_params(deep=True)): if key == 'random_state' or key.endswith('__random_state'): - to_set[key] = random_state.randint(MAX_RAND_SEED) + to_set[key] = random_state.randint(np.iinfo(np.int32).max) if to_set: estimator.set_params(**to_set) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 83c3cf4857dbd..f5a0ba540ccf9 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -101,7 +101,7 @@ def _validate_params(n_features, input_type): if not isinstance(n_features, numbers.Integral): raise TypeError("n_features must be integral, got %r (%s)." % (n_features, type(n_features))) - elif n_features < 1 or n_features >= 2 ** 31: + elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1: raise ValueError("Invalid number of features (%d)." % n_features) if input_type not in ("dict", "pair", "string"): diff --git a/sklearn/feature_extraction/_hashing_fast.pyx b/sklearn/feature_extraction/_hashing_fast.pyx index 87980db0f435d..d5f8de592b5c6 100644 --- a/sklearn/feature_extraction/_hashing_fast.pyx +++ b/sklearn/feature_extraction/_hashing_fast.pyx @@ -88,7 +88,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype, indices_a = np.frombuffer(indices, dtype=np.int32) indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype) - if indptr[-1] > 2147483648: # = 2**31 + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 if sp_version < (0, 14): raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2d8f7d840c55b..9771c62204444 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1150,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab): raise ValueError("empty vocabulary; perhaps the documents only" " contain stop words") - if indptr[-1] > 2147483648: # = 2**31 - 1 + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 if _IS_32BIT: raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e5b6b97922054..e56b2e9a269c0 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters - max_depth = ((2 ** 31) - 1 if self.max_depth is None + max_depth = (np.iinfo(np.int32).max if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) From e1f7b631ced4e8fdf535ca7d4e1d03256292ef3f 
Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:04:54 -0500 Subject: [PATCH 148/448] ENH Allow usage of nu=inf in Matern kernel (#15972) Co-authored-by: Sam Dixon --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/gaussian_process/kernels.py | 4 ++++ sklearn/gaussian_process/tests/test_kernels.py | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index babc557d9aaa0..90198d922c8e3 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -66,6 +66,12 @@ Changelog for datasets with large vocabularies combined with ``min_df`` or ``max_df``. :pr:`15834` by :user:`Santiago M. Mola `. +:mod:`sklearn.gaussian_process` +............................... + +- |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. + :pr: `15503` by :user:`Sam Dixon` . + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index 51abe3ad24235..d1252a12a9257 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -1413,6 +1413,8 @@ def __call__(self, X, Y=None, eval_gradient=False): elif self.nu == 2.5: K = dists * math.sqrt(5) K = (1. + K + K ** 2 / 3.0) * np.exp(-K) + elif self.nu == np.inf: + K = np.exp(-dists ** 2 / 2.0) else: # general case; expensive to evaluate K = dists K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan @@ -1449,6 +1451,8 @@ def __call__(self, X, Y=None, eval_gradient=False): elif self.nu == 2.5: tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis] K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp) + elif self.nu == np.inf: + K_gradient = D * K[..., np.newaxis] else: # approximate gradient numerically def f(theta): # helper function diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 6aaadd48ef317..e282786caf5ce 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -19,6 +19,7 @@ from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, assert_array_almost_equal, + assert_allclose, assert_raise_message) @@ -270,6 +271,11 @@ def test_matern_kernel(): K_absexp = np.exp(-euclidean_distances(X, X, squared=False)) K = Matern(nu=0.5, length_scale=1.0)(X) assert_array_almost_equal(K, K_absexp) + # matern kernel with coef0==inf is equal to RBF kernel + K_rbf = RBF(length_scale=1.0)(X) + K = Matern(nu=np.inf, length_scale=1.0)(X) + assert_array_almost_equal(K, K_rbf) + assert_allclose(K, K_rbf) # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5]) # result in nearly identical results as the general case for coef0 in # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny] @@ -278,6 +284,11 @@ def test_matern_kernel(): K1 = Matern(nu=nu, length_scale=1.0)(X) K2 = Matern(nu=nu + tiny, length_scale=1.0)(X) assert_array_almost_equal(K1, K2) + # test that coef0==large is close to RBF + large = 100 + K1 = Matern(nu=large, length_scale=1.0)(X) + K2 = RBF(length_scale=1.0)(X) + assert_array_almost_equal(K1, K2, decimal=2) @pytest.mark.parametrize("kernel", kernels) From aa75e64c88212787c19a329f162d214db9a369f1 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:10:17 -0500 Subject: [PATCH 149/448] DOC Apply numpydoc validation to VotingRegressor methods (#15969) Co-authored-by: Tiffany R. 
Williams --- sklearn/ensemble/_voting.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 3d6d8016cf6ed..23f381ca75750 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -326,7 +326,7 @@ class VotingRegressor(RegressorMixin, _BaseVoting): Parameters ---------- - estimators : list of (string, estimator) tuples + estimators : list of (str, estimator) tuples Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones of those original estimators that will be stored in the class attribute ``self.estimators_``. An estimator can be set to ``'drop'`` using @@ -357,6 +357,10 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.20 + See Also + -------- + VotingClassifier: Soft Voting/Majority Rule classifier. + Examples -------- >>> import numpy as np @@ -370,10 +374,6 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> er = VotingRegressor([('lr', r1), ('rf', r2)]) >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] - - See also - -------- - VotingClassifier: Soft Voting/Majority Rule classifier. """ def __init__(self, estimators, weights=None, n_jobs=None): @@ -382,7 +382,7 @@ def __init__(self, estimators, weights=None, n_jobs=None): self.n_jobs = n_jobs def fit(self, X, y, sample_weight=None): - """ Fit the estimators. + """Fit the estimators. Parameters ---------- @@ -401,6 +401,7 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + Fitted estimator. """ y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) @@ -435,9 +436,8 @@ def transform(self, X): Returns ------- - predictions - array-like of shape (n_samples, n_classifiers), being - values predicted by each regressor. + predictions: array of shape (n_samples, n_classifiers) + Values predicted by each regressor. """ check_is_fitted(self) return self._predict(X) From 1cda8ed987347e4a0b21e969be221f9480ce3d6b Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:30:41 -0500 Subject: [PATCH 150/448] ENH adding as_frame functionality for CA housing dataset loader (#15950) Co-authored-by: Stephanie Andrews --- doc/whats_new/v0.23.rst | 5 ++ sklearn/datasets/_base.py | 12 +++++ sklearn/datasets/_california_housing.py | 38 ++++++++++++-- .../datasets/tests/test_california_housing.py | 49 +++++++++++++++++-- 4 files changed, 96 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 90198d922c8e3..1941aacb7a7b0 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -58,6 +58,11 @@ Changelog :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski `. +- |Feature| :func:`datasets.fetch_california_housing` now supports + heterogeneous data using pandas by setting `as_frame=True`. :pr:`15950` + by :user:`Stephanie Andrews ` and + :user:`Reshama Shaikh `. + :mod:`sklearn.feature_extraction` ................................. 
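A short sketch of the ``as_frame`` option introduced above, assuming this patch is applied, that pandas is installed, and that the dataset can be downloaded (or is already cached under ``~/scikit_learn_data``)::

    from sklearn.datasets import fetch_california_housing

    housing = fetch_california_housing(as_frame=True)

    print(housing.frame.shape)    # (20640, 9): 8 features plus the target
    print(housing.data.head())    # features as a pandas DataFrame
    print(housing.target.head())  # "MedHouseVal" target as a pandas object
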
diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 9f33bc1f5fbf7..334e0a72b47c6 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,6 +17,7 @@ from ..utils import Bunch from ..utils import check_random_state +from ..utils import check_pandas_support import numpy as np @@ -67,6 +68,17 @@ def clear_data_home(data_home=None): shutil.rmtree(data_home) +def _convert_data_dataframe(caller_name, data, target, + feature_names, target_names): + pd = check_pandas_support('{} with as_frame=True'.format(caller_name)) + data_df = pd.DataFrame(data, columns=feature_names) + target_df = pd.DataFrame(target, columns=target_names) + combined_df = pd.concat([data_df, target_df], axis=1) + X = combined_df[feature_names] + y = combined_df[target_names] + return combined_df, X, y + + def load_files(container_path, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index bd02ff52ee19c..c71ebf3871b75 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -31,6 +31,7 @@ import joblib from . import get_data_home +from ._base import _convert_data_dataframe from ._base import _fetch_remote from ._base import _pkl_filepath from ._base import RemoteFileMetadata @@ -49,7 +50,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, - return_X_y=False): + return_X_y=False, as_frame=False): """Load the California housing dataset (regression). ============== ============== @@ -78,15 +79,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 + as_frame : boolean, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + + .. versionadded:: 0.23 + Returns ------- dataset : dict-like object with the following attributes: dataset.data : ndarray, shape [20640, 8] Each row corresponding to the 8 feature values in order. + If ``as_frame`` is True, ``data`` is a pandas object. dataset.target : numpy array of shape (20640,) Each value corresponds to the average house value in units of 100,000. + If ``as_frame`` is True, ``target`` is a pandas object. dataset.feature_names : array of length 8 Array of ordered feature names used in the dataset. @@ -98,6 +108,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + .. 
versionadded:: 0.23 + Notes ----- @@ -155,10 +171,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, with open(join(module_path, 'descr', 'california_housing.rst')) as dfile: descr = dfile.read() + X = data + y = target + + frame = None + target_names = ["MedHouseVal", ] + if as_frame: + frame, X, y = _convert_data_dataframe("fetch_california_housing", + data, + target, + feature_names, + target_names) + if return_X_y: - return data, target + return X, y - return Bunch(data=data, - target=target, + return Bunch(data=X, + target=y, + frame=frame, + target_names=target_names, feature_names=feature_names, DESCR=descr) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index ef45226c01f02..56cd62ef8bc35 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -3,8 +3,9 @@ Skipped if california_housing is not already downloaded to data_home. """ +import pytest + from sklearn.datasets import fetch_california_housing -from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial @@ -13,14 +14,54 @@ def fetch(*args, **kwargs): return fetch_california_housing(*args, download_if_missing=False, **kwargs) -def test_fetch(): +def _is_california_housing_dataset_not_available(): try: - data = fetch() + fetch_california_housing(download_if_missing=False) + return False except IOError: - raise SkipTest("California housing dataset can not be loaded.") + return True + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_fetch(): + data = fetch() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option fetch_func = partial(fetch) check_return_X_y(data, fetch_func) + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_fetch_asframe(): + pd = pytest.importorskip('pandas') + bunch = fetch(as_frame=True) + frame = bunch.frame + assert hasattr(bunch, 'frame') is True + assert frame.shape == (20640, 9) + assert isinstance(bunch.data, pd.DataFrame) + assert isinstance(bunch.target, pd.DataFrame) + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_pandas_dependency_message(): + try: + import pandas # noqa + pytest.skip("This test requires pandas to be not installed") + except ImportError: + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing(as_frame=True) From bbb21e2ac5394e506ca18cb702ce97c350164e88 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:37:34 -0500 Subject: [PATCH 151/448] DOC improve naive_bayes.py documentation (#15943) Co-authored-by: Jigna Panchal <40188288+jigna-panchal@users.noreply.github.com> --- sklearn/naive_bayes.py | 127 ++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 585ba69fbb1ce..cebc428e17b12 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -139,23 
+139,23 @@ class GaussianNB(_BaseNB): Attributes ---------- - class_prior_ : array, shape (n_classes,) - probability of each class. - class_count_ : array, shape (n_classes,) number of training samples observed in each class. + class_prior_ : array, shape (n_classes,) + probability of each class. + classes_ : array, shape (n_classes,) class labels known to the classifier - theta_ : array, shape (n_classes, n_features) - mean of each feature per class + epsilon_ : float + absolute additive value to variances sigma_ : array, shape (n_classes, n_features) variance of each feature per class - epsilon_ : float - absolute additive value to variances + theta_ : array, shape (n_classes, n_features) + mean of each feature per class Examples -------- @@ -685,33 +685,33 @@ class MultinomialNB(_BaseDiscreteNB): Attributes ---------- + class_count_ : array, shape (n_classes,) + Number of samples encountered for each class during fitting. This + value is weighted by the sample weight when provided. + class_log_prior_ : array, shape (n_classes, ) Smoothed empirical log probability for each class. - intercept_ : array, shape (n_classes, ) - Mirrors ``class_log_prior_`` for interpreting MultinomialNB - as a linear model. - - feature_log_prob_ : array, shape (n_classes, n_features) - Empirical log probability of features - given a class, ``P(x_i|y)``. + classes_ : array, shape (n_classes,) + Class labels known to the classifier coef_ : array, shape (n_classes, n_features) Mirrors ``feature_log_prob_`` for interpreting MultinomialNB as a linear model. - class_count_ : array, shape (n_classes,) - Number of samples encountered for each class during fitting. This - value is weighted by the sample weight when provided. - - classes_ : array, shape (n_classes,) - Class labels known to the classifier - feature_count_ : array, shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape (n_classes, n_features) + Empirical log probability of features + given a class, ``P(x_i|y)``. + + intercept_ : array, shape (n_classes, ) + Mirrors ``class_log_prior_`` for interpreting MultinomialNB + as a linear model. + n_features_ : int Number of features of each sample. @@ -797,31 +797,31 @@ class ComplementNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape (n_classes, ) - Smoothed empirical log probability for each class. Only used in edge - case with a single class in the training set. - - feature_log_prob_ : array, shape (n_classes, n_features) - Empirical weights for class complements. - class_count_ : array, shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. + class_log_prior_ : array, shape (n_classes, ) + Smoothed empirical log probability for each class. Only used in edge + case with a single class in the training set. + classes_ : array, shape (n_classes,) Class labels known to the classifier + feature_all_ : array, shape (n_features,) + Number of samples encountered for each feature during fitting. This + value is weighted by the sample weight when provided. + feature_count_ : array, shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape (n_classes, n_features) + Empirical weights for class complements. 
+ n_features_ : int Number of features of each sample. - feature_all_ : array, shape (n_features,) - Number of samples encountered for each feature during fitting. This - value is weighted by the sample weight when provided. - Examples -------- >>> import numpy as np @@ -909,16 +909,13 @@ class BernoulliNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape = [n_classes] - Log probability of each class (smoothed). - - feature_log_prob_ : array, shape = [n_classes, n_features] - Empirical log probability of features given a class, P(x_i|y). - class_count_ : array, shape = [n_classes] Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. + class_log_prior_ : array, shape = [n_classes] + Log probability of each class (smoothed). + classes_ : array, shape (n_classes,) Class labels known to the classifier @@ -927,26 +924,12 @@ class BernoulliNB(_BaseDiscreteNB): during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape = [n_classes, n_features] + Empirical log probability of features given a class, P(x_i|y). + n_features_ : int Number of features of each sample. - See Also - ---------- - MultinomialNB: The multinomial Naive Bayes classifier is \ - suitable for classification with discrete features. - - References - ---------- - C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to - Information Retrieval. Cambridge University Press, pp. 234-265. - https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html - - A. McCallum and K. Nigam (1998). A comparison of event models for naive - Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for - Text Categorization, pp. 41-48. - - V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with - naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). Examples -------- @@ -960,6 +943,19 @@ class BernoulliNB(_BaseDiscreteNB): BernoulliNB() >>> print(clf.predict(X[2:3])) [3] + + References + ---------- + C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to + Information Retrieval. Cambridge University Press, pp. 234-265. + https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html + + A. McCallum and K. Nigam (1998). A comparison of event models for naive + Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for + Text Categorization, pp. 41-48. + + V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with + naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, @@ -1036,22 +1032,25 @@ class CategoricalNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape (n_classes, ) - Smoothed empirical log probability for each class. - - feature_log_prob_ : list of arrays, len n_features + category_count_ : list of arrays, len n_features Holds arrays of shape (n_classes, n_categories of respective feature) - for each feature. Each array provides the empirical log probability - of categories given the respective feature and class, ``P(x_i|y)``. + for each feature. Each array provides the number of samples + encountered for each class and category of the specific feature. class_count_ : array, shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. 
- category_count_ : list of arrays, len n_features + class_log_prior_ : array, shape (n_classes, ) + Smoothed empirical log probability for each class. + + classes_ : array, shape (n_classes,) + Class labels known to the classifier + + feature_log_prob_ : list of arrays, len n_features Holds arrays of shape (n_classes, n_categories of respective feature) - for each feature. Each array provides the number of samples - encountered for each class and category of the specific feature. + for each feature. Each array provides the empirical log probability + of categories given the respective feature and class, ``P(x_i|y)``. n_features_ : int Number of features of each sample. From 86b4636935c075fcb8ca254126180eab281eb1bb Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 11:43:10 +0000 Subject: [PATCH 152/448] DOC Fix default values in Perceptron documentation (#15965) --- sklearn/linear_model/_perceptron.py | 36 ++++----- sklearn/linear_model/_ridge.py | 109 ++++++++++++++-------------- 2 files changed, 74 insertions(+), 71 deletions(-) diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 10e4f27f5490e..157083c010390 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -12,25 +12,25 @@ class Perceptron(BaseSGDClassifier): Parameters ---------- - penalty : None, 'l2' or 'l1' or 'elasticnet' - The penalty (aka regularization term) to be used. Defaults to None. + penalty : {'l2','l1','elasticnet'}, default=None + The penalty (aka regularization term) to be used. - alpha : float + alpha : float, default=0.0001 Constant that multiplies the regularization term if regularization is - used. Defaults to 0.0001 + used. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > previous_loss - tol). @@ -39,20 +39,20 @@ class Perceptron(BaseSGDClassifier): shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - verbose : integer, default=0 + verbose : int, default=0 The verbosity level - eta0 : double - Constant by which the updates are multiplied. Defaults to 1. + eta0 : double, default=1 + Constant by which the updates are multiplied. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -80,7 +80,7 @@ class Perceptron(BaseSGDClassifier): .. 
versionadded:: 0.20 - class_weight : dict, {class_label: weight} or "balanced" or None, optional + class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes @@ -97,18 +97,18 @@ class Perceptron(BaseSGDClassifier): Attributes ---------- - coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ - n_features] + coef_ : ndarray of shape = [1, n_features] if n_classes == 2 else \ + [n_classes, n_features] Weights assigned to the features. - intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + intercept_ : ndarray of shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The unique classes labels. t_ : int diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2a24fba4675a5..ec2f29dbb2317 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -245,11 +245,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Parameters ---------- - X : {array-like, sparse matrix, LinearOperator} of shape \ + X : {ndarray, sparse matrix, LinearOperator} of shape \ (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values alpha : float or array-like of shape (n_targets,) @@ -268,7 +268,8 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.17 - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -308,7 +309,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.19 SAGA solver. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For the 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' and saga solver, the default value is @@ -321,7 +322,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -349,14 +350,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Returns ------- - coef : array of shape (n_features,) or (n_targets, n_features) + coef : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. - intercept : float or array of shape (n_targets,) + intercept : float or ndarray of shape (n_targets,) The intercept of the model. 
Only returned if `return_intercept` is True and if X is a scipy sparse array. @@ -618,7 +619,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Parameters ---------- - alpha : {float, array-like of shape (n_targets,)}, default=1.0 + alpha : {float, ndarray of shape (n_targets,)}, default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. @@ -643,7 +644,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. @@ -651,7 +652,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): tol : float, default=1e-3 Precision of the solution. - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -688,7 +690,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -700,14 +702,14 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (n_features,) or (n_targets, n_features) + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. @@ -747,13 +749,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -798,14 +800,14 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. tol : float, default=1e-3 Precision of the solution. 
- class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -813,7 +815,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -847,7 +850,7 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, default=None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -856,20 +859,20 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. See Also @@ -907,13 +910,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1135,7 +1138,7 @@ def _compute_gram(self, X, sqrt_sw): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) The preprocessed design matrix. sqrt_sw : ndarray of shape (n_samples,) @@ -1425,13 +1428,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. Will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to float64 if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. 
If given a float, every sample will have the same weight. @@ -1543,14 +1546,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. If using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1634,14 +1637,14 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If None, the negative mean squared error if cv is 'auto' or None (i.e. when using generalized cross-validation), and r2 score otherwise. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1658,7 +1661,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - gcv_mode : {None, 'auto', 'svd', eigen'}, optional + gcv_mode : {'auto', 'svd', eigen'}, default='auto' Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: @@ -1670,7 +1673,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): The 'auto' mode is the default and is intended to pick the cheaper option of the two depending on the shape of the training data. - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1678,17 +1681,17 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_alphas) or \ + cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True``\ and ``cv=None``). After ``fit()`` has been called, this attribute \ will contain the mean squared errors (by default) or the values \ of the ``{loss,score}_func`` function (if provided in the constructor). - coef_ : array of shape (n_features) or (n_targets, n_features) + coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1750,12 +1753,12 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. 
- scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1767,7 +1770,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1775,7 +1778,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1783,19 +1786,19 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_targets, n_alphas), optional + cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). This attribute exists only when ``store_cv_values`` is True. - coef_ : array of shape (1, n_features) or (n_targets, n_features) + coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1805,7 +1808,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): best_score_ : float Score of base estimator with best alpha. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. Examples @@ -1843,15 +1846,15 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. When using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. 
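The parameters whose defaults are documented in the patch above are easiest to check on a tiny fit. The sketch below is not part of the commit; the dataset, alpha grid and sizes are illustrative only. It simply exercises the documented ``RidgeCV`` behaviour, namely that ``store_cv_values`` works with the default generalized cross-validation (``cv=None``) and that ``cv_values_`` has shape (n_samples, n_alphas)::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge, RidgeCV

    X, y = make_regression(n_samples=50, n_features=5, noise=1.0, random_state=0)

    # Plain Ridge: a single regularization strength, solver picked automatically.
    reg = Ridge(alpha=1.0, solver='auto').fit(X, y)
    print(reg.coef_.shape)          # (n_features,) -> (5,)

    # store_cv_values=True is only compatible with cv=None, i.e. the efficient
    # leave-one-out generalized cross-validation documented above.
    reg_cv = RidgeCV(alphas=[0.1, 1.0, 10.0], store_cv_values=True).fit(X, y)
    print(reg_cv.alpha_)            # alpha selected from the leave-one-out errors
    print(reg_cv.cv_values_.shape)  # (n_samples, n_alphas) -> (50, 3)
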
From 8fc16206e8b2f2abe8e0a345ea3ea3d524ff1c1a Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 11:46:03 +0000 Subject: [PATCH 153/448] DOC Improve default values in logistic documentation (#15966) Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/_logistic.py | 268 +++++++++++++++--------------- 1 file changed, 135 insertions(+), 133 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index fd2eba7c7df82..7ea3f1b6566d7 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -52,18 +52,18 @@ def _intercept_dot(w, X, y): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. Returns ------- - w : ndarray, shape (n_features,) + w : ndarray of shape (n_features,) Coefficient vector without the intercept weight (w[-1]) if the intercept should be fit. Unchanged otherwise. @@ -88,19 +88,19 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -109,7 +109,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): out : float Logistic loss. - grad : ndarray, shape (n_features,) or (n_features + 1,) + grad : ndarray of shape (n_features,) or (n_features + 1,) Logistic gradient. """ n_samples, n_features = X.shape @@ -139,19 +139,19 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -175,25 +175,25 @@ def _logistic_grad_hess(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. 
- y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- - grad : ndarray, shape (n_features,) or (n_features + 1,) + grad : ndarray of shape (n_features,) or (n_features + 1,) Logistic gradient. Hs : callable @@ -252,20 +252,20 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns @@ -273,10 +273,10 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): loss : float Multinomial loss. - p : ndarray, shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) Estimated class probabilities. - w : ndarray, shape (n_classes, n_features) + w : ndarray of shape (n_classes, n_features) Reshaped param vector excluding intercept terms. Reference @@ -308,20 +308,20 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns @@ -329,11 +329,11 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): loss : float Multinomial loss. - grad : ndarray, shape (n_classes * n_features,) or - (n_classes * (n_features + 1),) + grad : ndarray of shape (n_classes * n_features,) or \ + (n_classes * (n_features + 1),) Ravelled gradient of the multinomial loss. - p : ndarray, shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) Estimated class probabilities Reference @@ -362,26 +362,26 @@ def _multinomial_grad_hess(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. 
alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns ------- - grad : array, shape (n_classes * n_features,) or - (n_classes * (n_features + 1),) + grad : ndarray of shape (n_classes * n_features,) or \ + (n_classes * (n_features + 1),) Ravelled gradient of the multinomial loss. hessp : callable @@ -497,46 +497,47 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Input data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Input data, target values. - pos_class : int, None + pos_class : int, default=None The class with respect to which we perform a one-vs-all fit. If None, then it is assumed that the given problem is binary. - Cs : int | array-like, shape (n_cs,) + Cs : int or array-like of shape (n_cs,), default=10 List of values for the regularization parameter or integer specifying the number of regularization parameters that should be used. In this case, the parameters will be chosen in a logarithmic scale between 1e-4 and 1e4. - fit_intercept : bool + fit_intercept : bool, default=True Whether to fit an intercept for the model. In this case the shape of the returned array is (n_cs, n_features + 1). - max_iter : int + max_iter : int, default=100 Maximum number of iterations for the solver. - tol : float + tol : float, default=1e-4 Stopping criterion. For the newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient. - verbose : int + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'} + solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Numerical solver to use. - coef : array-like, shape (n_features,), default None + coef : array-like of shape (n_features,), default=None Initialization value for coefficients of logistic regression. Useless for liblinear solver. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -547,17 +548,17 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - dual : bool + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - penalty : str, 'l1', 'l2', or 'elasticnet' + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - intercept_scaling : float, default 1. + intercept_scaling : float, default=1. Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. 
In this case, x becomes [x, self.intercept_scaling], @@ -583,7 +584,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -591,19 +592,19 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, instance used by `np.random`. Used when ``solver`` == 'sag' or 'liblinear'. - check_input : bool, default True + check_input : bool, default=True If False, the input arrays X and y will not be checked. - max_squared_sum : float, default None + max_squared_sum : float, default=None Maximum squared sum of X over samples. Used only in SAG solver. If None, it will be computed, going through all the samples. The value should be precomputed to speed up cross validation. - sample_weight : array-like, shape(n_samples,) optional + sample_weight : array-like of shape(n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -612,7 +613,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Returns ------- - coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) List of coefficients for the Logistic Regression model. If fit_intercept is set to True then the second dimension will be n_features + 1, where the last item represents the intercept. For @@ -622,7 +623,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Cs : ndarray Grid of Cs used for cross-validation. - n_iter : array, shape (n_cs,) + n_iter : array of shape (n_cs,) Actual number of iteration for each Cs. Notes @@ -834,10 +835,10 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target labels. train : list of indices @@ -846,34 +847,34 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, test : list of indices The indices of the test set. - pos_class : int, None + pos_class : int, default=None The class with respect to which we perform a one-vs-all fit. If None, then it is assumed that the given problem is binary. - Cs : list of floats | int + Cs : int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. If not provided, then a fixed set of values for Cs are used. 
- scoring : callable or None, optional, default: None + scoring : callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. For a list of scoring functions that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is accuracy_score. - fit_intercept : bool + fit_intercept : bool, default=False If False, then the bias term is set to zero. Else the last term of each coef_ gives us the intercept. - max_iter : int + max_iter : int, default=100 Maximum number of iterations for the solver. - tol : float + tol : float, default=1e-4 Tolerance for stopping criteria. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -884,24 +885,25 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - verbose : int + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'} + solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Decides which solver to use. - penalty : str, 'l1', 'l2', or 'elasticnet' + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - dual : bool + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - intercept_scaling : float, default 1. + intercept_scaling : float, default=1. Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -913,13 +915,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - multi_class : {'ovr', 'multinomial'} + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is binary*. 'multinomial' is unavailable when solver='liblinear'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -927,16 +929,16 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, instance used by `np.random`. Used when ``solver`` == 'sag' and 'liblinear'. - max_squared_sum : float, default None + max_squared_sum : float, default=None Maximum squared sum of X over samples. Used only in SAG solver. If None, it will be computed, going through all the samples. The value should be precomputed to speed up cross validation. 
- sample_weight : array-like, shape(n_samples,) optional + sample_weight : array-like of shape(n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -945,7 +947,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Returns ------- - coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) List of coefficients for the Logistic Regression model. If fit_intercept is set to True then the second dimension will be n_features + 1, where the last item represents the intercept. @@ -953,10 +955,10 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Cs : ndarray Grid of Cs used for cross-validation. - scores : ndarray, shape (n_cs,) + scores : ndarray of shape (n_cs,) Scores obtained for each Cs. - n_iter : array, shape(n_cs,) + n_iter : ndarray of shape(n_cs,) Actual number of iteration for each Cs. """ X_train = X[train] @@ -1042,7 +1044,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Parameters ---------- - penalty : str, 'l1', 'l2', 'elasticnet' or 'none', optional (default='l2') + penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. If 'none' (not supported by the @@ -1051,24 +1053,24 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) - dual : bool, optional (default=False) + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. - C : float, optional (default=1.0) + C : float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : bool, optional (default=True) + fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1 Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1081,7 +1083,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : dict or 'balanced', optional (default=None) + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1095,7 +1097,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. 
versionadded:: 0.17 *class_weight='balanced'* - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -1103,8 +1105,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, instance used by `np.random`. Used when ``solver`` == 'sag' or 'liblinear'. - solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - optional (default='lbfgs') + solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Algorithm to use in the optimization problem. @@ -1129,10 +1131,10 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. - max_iter : int, optional (default=100) + max_iter : int, default=100 Maximum number of iterations taken for the solvers to converge. - multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1145,11 +1147,11 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - verbose : int, optional (default=0) + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary `. @@ -1157,7 +1159,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of CPU cores used when parallelizing over classes if multi_class='ovr'". This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or @@ -1165,7 +1167,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'`. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1175,17 +1177,17 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Attributes ---------- - classes_ : array, shape (n_classes, ) + classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. - coef_ : array, shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. In particular, when `multi_class='multinomial'`, `coef_` corresponds to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). 
- intercept_ : array, shape (1,) or (n_classes,) + intercept_ : ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. @@ -1194,7 +1196,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). - n_iter_ : array, shape (n_classes,) or (1, ) + n_iter_ : ndarray of shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum number of iteration across all classes is given. @@ -1288,14 +1290,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target vector relative to X. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -1529,18 +1531,18 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Parameters ---------- - Cs : list of floats or int, optional (default=10) + Cs : int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : bool, optional (default=True) + fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - cv : int or cross-validation generator, optional (default=None) + cv : int or cross-validation generator, default=None The default cross-validation generator used is Stratified K-Folds. If an integer is provided, then it is the number of folds used. See the module :mod:`sklearn.model_selection` module for the @@ -1549,25 +1551,25 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - dual : bool, optional (default=False) + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - penalty : str, 'l1', 'l2', or 'elasticnet', optional (default='l2') + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - scoring : string, callable, or None, optional (default=None) + scoring : str or callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. For a list of scoring functions that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is 'accuracy'. 
- solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - optional (default='lbfgs') + solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Algorithm to use in the optimization problem. @@ -1590,13 +1592,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionadded:: 0.19 SAGA solver. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. - max_iter : int, optional (default=100) + max_iter : int, default=100 Maximum number of iterations of the optimization algorithm. - class_weight : dict or 'balanced', optional (default=None) + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1610,24 +1612,24 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionadded:: 0.17 class_weight == 'balanced' - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of CPU cores used during the cross-validation loop. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : int, optional (default=0) + verbose : int, default=0 For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any positive number for verbosity. - refit : bool, optional (default=True) + refit : bool, default=True If set to True, the scores are averaged across all folds, and the coefs and the C that corresponds to the best score is taken, and a final refit is done using these parameters. Otherwise the coefs, intercepts and C that correspond to the best scores across folds are averaged. - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1 Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1640,7 +1642,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + multi_class : {'auto, 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1653,7 +1655,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -1661,7 +1663,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Note that this only applies to the solver and not the cross-validation generator. - l1_ratios : list of float or None, optional (default=None) + l1_ratios : list of float, default=None The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. 
A value of 0 is equivalent to using ``penalty='l2'``, while 1 is equivalent to using @@ -1670,30 +1672,30 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Attributes ---------- - classes_ : array, shape (n_classes, ) + classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. - coef_ : array, shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. - intercept_ : array, shape (1,) or (n_classes,) + intercept_ : ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. `intercept_` is of shape(1,) when the problem is binary. - Cs_ : array, shape (n_cs) + Cs_ : ndarray of shape (n_cs) Array of C i.e. inverse of regularization parameter values used for cross-validation. - l1_ratios_ : array, shape (n_l1_ratios) + l1_ratios_ : ndarray of shape (n_l1_ratios) Array of l1_ratios used for cross-validation. If no l1_ratio is used (i.e. penalty is not 'elasticnet'), this is set to ``[None]`` - coefs_paths_ : array, shape (n_folds, n_cs, n_features) or \ + coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or \ (n_folds, n_cs, n_features + 1) dict with classes as the keys, and the path of coefficients obtained during cross-validating across each fold and then across each Cs @@ -1715,19 +1717,19 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, has shape ``(n_folds, n_cs`` or ``(n_folds, n_cs, n_l1_ratios)`` if ``penalty='elasticnet'``. - C_ : array, shape (n_classes,) or (n_classes - 1,) + C_ : ndarray of shape (n_classes,) or (n_classes - 1,) Array of C that maps to the best scores across every class. If refit is set to False, then for each class, the best C is the average of the C's that correspond to the best scores for each fold. `C_` is of shape(n_classes,) when the problem is binary. - l1_ratio_ : array, shape (n_classes,) or (n_classes - 1,) + l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,) Array of l1_ratio that maps to the best scores across every class. If refit is set to False, then for each class, the best l1_ratio is the average of the l1_ratio's that correspond to the best scores for each fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary. - n_iter_ : array, shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) + n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) Actual number of iterations for all classes, folds and Cs. In the binary or multinomial cases, the first dimension is equal to 1. If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, @@ -1780,14 +1782,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target vector relative to X. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. 
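As with the ridge patch, the defaults documented in this commit can be verified with a short, self-contained sketch. It is not part of the commit; the dataset, solver settings and grid sizes are chosen only for illustration. It exercises the documented ``penalty``/``solver``/``l1_ratio`` combination and the ``Cs_``/``C_`` attributes of ``LogisticRegressionCV``::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler

    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)  # helps the saga/lbfgs solvers converge

    # 'elasticnet' is only supported by the 'saga' solver and requires l1_ratio.
    clf = LogisticRegression(penalty='elasticnet', solver='saga',
                             l1_ratio=0.5, max_iter=5000).fit(X, y)
    print(clf.coef_.shape)   # (n_classes, n_features) -> (3, 4)

    # With an integer Cs, the candidate C values are spaced logarithmically
    # between 1e-4 and 1e4, as documented above.
    clf_cv = LogisticRegressionCV(Cs=5, cv=3, max_iter=5000).fit(X, y)
    print(clf_cv.Cs_)        # the 5 candidate inverse regularization strengths
    print(clf_cv.C_)         # the C value(s) selected by cross-validation
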
From 2a185f91b38324d7148b182dce0aa994af0650d2 Mon Sep 17 00:00:00 2001 From: Pulkit Mehta Date: Thu, 26 Dec 2019 17:19:45 +0530 Subject: [PATCH 154/448] DOC Improve documentation of default values for imputers (#15964) --- sklearn/impute/_base.py | 16 ++++++++-------- sklearn/impute/_iterative.py | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 2ad49833641dc..c952831d85e1f 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -126,7 +126,7 @@ class SimpleImputer(_BaseImputer): The placeholder for the missing values. All occurrences of `missing_values` will be imputed. - strategy : string, optional (default="mean") + strategy : string, default='mean' The imputation strategy. - If "mean", then replace missing values using the mean along @@ -141,16 +141,16 @@ class SimpleImputer(_BaseImputer): .. versionadded:: 0.20 strategy="constant" for fixed value imputation. - fill_value : string or numerical value, optional (default=None) + fill_value : string or numerical value, default=None When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types. - verbose : integer, optional (default=0) + verbose : integer, default=0 Controls the verbosity of the imputer. - copy : boolean, optional (default=True) + copy : boolean, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: @@ -159,7 +159,7 @@ class SimpleImputer(_BaseImputer): - If X is encoded as a CSR matrix; - If add_indicator=True. - add_indicator : boolean, optional (default=False) + add_indicator : boolean, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no @@ -470,7 +470,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): `missing_values` will be indicated (True in the output array), the other values will be marked as False. - features : str, optional + features : str, default=None Whether the imputer mask should represent all or a subset of features. @@ -478,7 +478,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): features containing missing values during fit time. - If "all", the imputer mask will represent all features. - sparse : boolean or "auto", optional + sparse : boolean or "auto", default=None Whether the imputer mask format should be sparse or dense. - If "auto" (default), the imputer mask will be of same type as @@ -486,7 +486,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): - If True, the imputer mask will be a sparse matrix. - If False, the imputer mask will be a numpy array. - error_on_new : boolean, optional + error_on_new : boolean, default=None If True (default), transform will raise an error when there are features with missing values in transform that have no missing values in fit. This is applicable only when ``features="missing-only"``. 
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index fa9d576f04008..7983b8dbe4062 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -52,7 +52,7 @@ class IterativeImputer(_BaseImputer): If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. - missing_values : int, np.nan, optional (default=np.nan) + missing_values : int, np.nan, default=np.nan The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. @@ -62,7 +62,7 @@ class IterativeImputer(_BaseImputer): ``return_std`` in its ``predict`` method if set to ``True``. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. - max_iter : int, optional (default=10) + max_iter : int, default=10 Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion @@ -70,10 +70,10 @@ class IterativeImputer(_BaseImputer): where `X_t` is `X` at iteration `t. Note that early stopping is only applied if ``sample_posterior=False``. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance of the stopping condition. - n_nearest_features : int, optional (default=None) + n_nearest_features : int, default=None Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after @@ -83,12 +83,12 @@ class IterativeImputer(_BaseImputer): imputed target feature. Can provide significant speed-up when the number of features is huge. If ``None``, all features will be used. - initial_strategy : str, optional (default="mean") + initial_strategy : str, default='mean' Which strategy to use to initialize the missing values. Same as the ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` Valid values: {"mean", "median", "most_frequent", or "constant"}. - imputation_order : str, optional (default="ascending") + imputation_order : str, default='ascending' The order in which the features will be imputed. Possible values: "ascending" @@ -102,34 +102,34 @@ class IterativeImputer(_BaseImputer): "random" A random order for each round. - skip_complete : boolean, optional (default=False) + skip_complete : boolean, default=False If ``True`` then features with missing values during ``transform`` which did not have any missing values during ``fit`` will be imputed with the initial imputation method only. Set to ``True`` if you have many features with no missing values at both ``fit`` and ``transform`` time to save compute. - min_value : float, optional (default=None) + min_value : float, default=None Minimum possible imputed value. Default of ``None`` will set minimum to negative infinity. - max_value : float, optional (default=None) + max_value : float, default=None Maximum possible imputed value. Default of ``None`` will set maximum to positive infinity. - verbose : int, optional (default=0) + verbose : int, default=0 Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose. Can be 0, 1, or 2. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance or None, default=None The seed of the pseudo random number generator to use. 
Randomizes selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. See :term:`the Glossary `. - add_indicator : boolean, optional (default=False) + add_indicator : boolean, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no @@ -443,7 +443,7 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): X_filled : ndarray, shape (n_samples, n_features) Input data with the most recent imputations. - tolerance : float, optional (default=1e-6) + tolerance : float, default=1e-6 ``abs_corr_mat`` can have nans, which will be replaced with ``tolerance``. From c393bdc6dde1e2826002a6d2529f068a8376a7e9 Mon Sep 17 00:00:00 2001 From: David Breuer Date: Thu, 26 Dec 2019 13:02:31 +0100 Subject: [PATCH 155/448] EXA/MAINT Simplify code in manifold learning example (#15949) --- examples/manifold/plot_compare_methods.py | 94 +++++++---------------- 1 file changed, 29 insertions(+), 65 deletions(-) diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index 7c5f3b6200635..ed01e8ac19b89 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -23,6 +23,8 @@ print(__doc__) +from collections import OrderedDict +from functools import partial from time import time import matplotlib.pyplot as plt @@ -39,81 +41,43 @@ n_neighbors = 10 n_components = 2 +# Create figure fig = plt.figure(figsize=(15, 8)) -plt.suptitle("Manifold Learning with %i points, %i neighbors" +fig.suptitle("Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14) - +# Add 3d scatter plot ax = fig.add_subplot(251, projection='3d') ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) ax.view_init(4, -72) -methods = ['standard', 'ltsa', 'hessian', 'modified'] -labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE'] - -for i, method in enumerate(methods): +# Set-up manifold methods +LLE = partial(manifold.LocallyLinearEmbedding, + n_neighbors, n_components, eigen_solver='auto') + +methods = OrderedDict() +methods['LLE'] = LLE(method='standard') +methods['LTSA'] = LLE(method='ltsa') +methods['Hessian LLE'] = LLE(method='hessian') +methods['Modified LLE'] = LLE(method='modified') +methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) +methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) +methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, + n_neighbors=n_neighbors) +methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', + random_state=0) + +# Plot results +for i, (label, method) in enumerate(methods.items()): t0 = time() - Y = manifold.LocallyLinearEmbedding(n_neighbors, n_components, - eigen_solver='auto', - method=method).fit_transform(X) + Y = method.fit_transform(X) t1 = time() - print("%s: %.2g sec" % (methods[i], t1 - t0)) - - ax = fig.add_subplot(252 + i) - plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) - plt.title("%s (%.2g sec)" % (labels[i], t1 - t0)) + print("%s: %.2g sec" % (label, t1 - t0)) + ax = fig.add_subplot(2, 5, 2 + i + (i > 3)) + ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) + ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) 
ax.yaxis.set_major_formatter(NullFormatter()) - plt.axis('tight') - -t0 = time() -Y = manifold.Isomap(n_neighbors, n_components).fit_transform(X) -t1 = time() -print("Isomap: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(257) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("Isomap (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - - -t0 = time() -mds = manifold.MDS(n_components, max_iter=100, n_init=1) -Y = mds.fit_transform(X) -t1 = time() -print("MDS: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(258) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("MDS (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - - -t0 = time() -se = manifold.SpectralEmbedding(n_components=n_components, - n_neighbors=n_neighbors) -Y = se.fit_transform(X) -t1 = time() -print("SpectralEmbedding: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(259) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("SpectralEmbedding (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - -t0 = time() -tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0) -Y = tsne.fit_transform(X) -t1 = time() -print("t-SNE: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(2, 5, 10) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("t-SNE (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') + ax.axis('tight') plt.show() From 725ca8f32e1877881064886a422d8dc35449d7fe Mon Sep 17 00:00:00 2001 From: Fan Date: Thu, 26 Dec 2019 05:03:24 -0800 Subject: [PATCH 156/448] MAINT Remove redundant sample_weights check in DummyClassifier (#15510) Co-authored-by: Sallie Walecka --- sklearn/dummy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 3c7d3286dd85b..c1d22d60f39ab 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -156,7 +156,7 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - check_consistent_length(X, y, sample_weight) + check_consistent_length(X, y) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -245,7 +245,7 @@ def predict(self, X): classes_ = [np.array([c]) for c in constant] y = _random_choice_csc(n_samples, classes_, class_prob, - self.random_state) + self.random_state) else: if self._strategy in ("most_frequent", "prior"): y = np.tile([classes_[k][class_prior_[k].argmax()] for From 3339d802402fd2f2ed5e954434c637bf7a68124d Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 13:35:52 +0000 Subject: [PATCH 157/448] DOC Improve default values in SGD documentation (#15967) --- sklearn/linear_model/_stochastic_gradient.py | 115 +++++++++---------- 1 file changed, 56 insertions(+), 59 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index eb1e9e7b545e7..a9775a4ae850e 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -244,12 +244,12 @@ def _make_validation_split(self, y): Parameters ---------- - y : array, shape (n_samples, ) + y : ndarray of shape (n_samples, ) Target values. 
Returns ------- - validation_mask : array, shape (n_samples, ) + validation_mask : ndarray of shape (n_samples, ) Equal to 1 on the validation set, 0 on the training set. """ n_samples = y.shape[0] @@ -362,11 +362,11 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, sample_weight : numpy array of shape [n_samples, ] The weight of each sample - validation_mask : numpy array of shape [n_samples, ] or None + validation_mask : numpy array of shape [n_samples, ], default=None Precomputed validation mask in case _fit_binary is called in the context of a one-vs-rest reduction. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -641,10 +641,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Subset of the training data. - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Subset of the target values. - classes : array, shape (n_classes,) + classes : ndarray of shape (n_classes,), default=None Classes across all calls to partial_fit. Can be obtained by via `np.unique(y_all)`, where y_all is the target vector of the entire dataset. @@ -652,7 +652,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): and can be omitted in the subsequent calls. Note that y doesn't need to contain all labels in `classes`. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. @@ -685,16 +685,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None, X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. - coef_init : array, shape (n_classes, n_features) + coef_init : ndarray of shape (n_classes, n_features), default=None The initial coefficients to warm-start the optimization. - intercept_init : array, shape (n_classes,) + intercept_init : ndarray of shape (n_classes,), default=None The initial intercept to warm-start the optimization. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. These weights will be multiplied with class_weight (passed through the @@ -738,7 +738,7 @@ class SGDClassifier(BaseSGDClassifier): Parameters ---------- - loss : str, default: 'hinge' + loss : str, default='hinge' The loss function to be used. Defaults to 'hinge', which gives a linear SVM. @@ -754,42 +754,41 @@ class SGDClassifier(BaseSGDClassifier): The other losses are designed for regression but can be useful in classification as well; see SGDRegressor for a description. - penalty : str, 'none', 'l2', 'l1', or 'elasticnet' + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. - alpha : float + alpha : float, default=0.0001 Constant that multiplies the regularization term. 
Defaults to 0.0001. Also used to compute learning_rate when set to 'optimal'. - l1_ratio : float + l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. .. versionadded:: 0.19 - shuffle : bool, optional + shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - Defaults to True. verbose : int, default=0 The verbosity level. @@ -802,21 +801,21 @@ class SGDClassifier(BaseSGDClassifier): For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - learning_rate : str, optional + learning_rate : str, default='optimal' The learning rate schedule: 'constant': @@ -832,12 +831,12 @@ class SGDClassifier(BaseSGDClassifier): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. - eta0 : double + eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by the default schedule 'optimal'. - power_t : double + power_t : double, default=0.5 The exponent for inverse scaling learning rate [default 0.5]. early_stopping : bool, default=False @@ -861,7 +860,7 @@ class SGDClassifier(BaseSGDClassifier): .. versionadded:: 0.20 - class_weight : dict, {class_label: weight} or "balanced" or None, optional + class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes @@ -893,11 +892,11 @@ class SGDClassifier(BaseSGDClassifier): Attributes ---------- - coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\ - n_features) + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) Weights assigned to the features. 
- intercept_ : array, shape (1,) if n_classes == 2 else (n_classes,) + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) Constants in decision function. n_iter_ : int @@ -979,7 +978,7 @@ def predict_proba(self): Returns ------- - array, shape (n_samples, n_classes) + ndarray of shape (n_samples, n_classes) Returns the probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. @@ -1140,7 +1139,7 @@ def partial_fit(self, X, y, sample_weight=None): y : numpy array of shape (n_samples,) Subset of target values - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. @@ -1198,16 +1197,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None, X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Target values - coef_init : array, shape (n_features,) + coef_init : ndarray of shape (n_features,), default=None The initial coefficients to warm-start the optimization. - intercept_init : array, shape (1,) + intercept_init : ndarray of shape (1,), default=None The initial intercept to warm-start the optimization. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). Returns @@ -1229,7 +1228,7 @@ def _decision_function(self, X): Returns ------- - array, shape (n_samples,) + ndarray of shape (n_samples,) Predicted target values per element in X. """ check_is_fitted(self) @@ -1249,7 +1248,7 @@ def predict(self, X): Returns ------- - array, shape (n_samples,) + ndarray of shape (n_samples,) Predicted target values per element in X. """ return self._decision_function(X) @@ -1359,7 +1358,7 @@ class SGDRegressor(BaseSGDRegressor): Parameters ---------- - loss : str, default: 'squared_loss' + loss : str, default='squared_loss' The loss function to be used. The possible values are 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' @@ -1371,44 +1370,42 @@ class SGDRegressor(BaseSGDRegressor): 'squared_epsilon_insensitive' is the same but becomes squared loss past a tolerance of epsilon. - penalty : str, 'none', 'l2', 'l1', or 'elasticnet' + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. - alpha : float - Constant that multiplies the regularization term. Defaults to 0.0001 + alpha : float, default=0.0001 + Constant that multiplies the regularization term. Also used to compute learning_rate when set to 'optimal'. - l1_ratio : float + l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. - Defaults to 0.15. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). 
It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. .. versionadded:: 0.19 - shuffle : bool, optional + shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - Defaults to True. - verbose : integer, default=0 + verbose : int, default=0 The verbosity level. epsilon : float, default=0.1 @@ -1419,14 +1416,14 @@ class SGDRegressor(BaseSGDRegressor): For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - learning_rate : string, optional + learning_rate : string, default='invscaling' The learning rate schedule: 'constant': @@ -1442,12 +1439,12 @@ class SGDRegressor(BaseSGDRegressor): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. - eta0 : double + eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. - power_t : double - The exponent for inverse scaling learning rate [default 0.25]. + power_t : double, default=0.25 + The exponent for inverse scaling learning rate. early_stopping : bool, default=False Whether to use early stopping to terminate training when validation @@ -1492,16 +1489,16 @@ class SGDRegressor(BaseSGDRegressor): Attributes ---------- - coef_ : array, shape (n_features,) + coef_ : ndarray of shape (n_features,) Weights assigned to the features. - intercept_ : array, shape (1,) + intercept_ : ndarray of shape (1,) The intercept term. - average_coef_ : array, shape (n_features,) + average_coef_ : ndarray of shape (n_features,) Averaged weights assigned to the features. - average_intercept_ : array, shape (1,) + average_intercept_ : ndarray of shape (1,) The averaged intercept term. 
n_iter_ : int From 68c08703851dbf1f8c0d4161c98fb444055a68df Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Dec 2019 14:41:16 +0100 Subject: [PATCH 158/448] TST Add test for permutation importance with high cardinality (#15939) --- .../tests/test_permutation_importance.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index 2a31a031f2938..c13638b2fc0c7 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -15,6 +15,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.impute import SimpleImputer from sklearn.inspection import permutation_importance +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import OneHotEncoder @@ -24,6 +25,7 @@ from sklearn.utils._testing import _convert_container + @pytest.mark.parametrize("n_jobs", [1, 2]) def test_permutation_importance_correlated_feature_regression(n_jobs): # Make sure that feature highly correlated to the target have a higher @@ -82,6 +84,79 @@ def test_permutation_importance_correlated_feature_regression_pandas(n_jobs): assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_robustness_to_high_cardinality_noisy_feature(n_jobs, seed=42): + # Permutation variable importance should not be affected by the high + # cardinality bias of traditional feature importances, especially when + # computed on a held-out test set: + rng = np.random.RandomState(seed) + n_repeats = 5 + n_samples = 1000 + n_classes = 5 + n_informative_features = 2 + n_noise_features = 1 + n_features = n_informative_features + n_noise_features + + # Generate a multiclass classification dataset and a set of informative + # binary features that can be used to predict some classes of y exactly + # while leaving some classes unexplained to make the problem harder. + classes = np.arange(n_classes) + y = rng.choice(classes, size=n_samples) + X = np.hstack([(y == c).reshape(-1, 1) + for c in classes[:n_informative_features]]) + X = X.astype(np.float32) + + # Not all target classes are explained by the binary class indicator + # features: + assert n_informative_features < n_classes + + # Add 10 other noisy features with high cardinality (numerical) values + # that can be used to overfit the training data. + X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1) + assert X.shape == (n_samples, n_features) + + # Split the dataset to be able to evaluate on a held-out test set. The + # Test size should be large enough for importance measurements to be + # stable: + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=rng) + clf = RandomForestClassifier(n_estimators=5, random_state=rng) + clf.fit(X_train, y_train) + + # Variable importances computed by impurity decrease on the tree node + # splits often use the noisy features in splits. 
This can give misleading + # impression that high cardinality noisy variables are the most important: + tree_importances = clf.feature_importances_ + informative_tree_importances = tree_importances[:n_informative_features] + noisy_tree_importances = tree_importances[n_informative_features:] + assert informative_tree_importances.max() < noisy_tree_importances.min() + + # Let's check that permutation-based feature importances do not have this + # problem. + r = permutation_importance(clf, X_test, y_test, n_repeats=n_repeats, + random_state=rng, n_jobs=n_jobs) + + assert r.importances.shape == (X.shape[1], n_repeats) + + # Split the importances between informative and noisy features + informative_importances = r.importances_mean[:n_informative_features] + noisy_importances = r.importances_mean[n_informative_features:] + + # Because we do not have a binary variable explaining each target classes, + # the RF model will have to use the random variable to make some + # (overfitting) splits (as max_depth is not set). Therefore the noisy + # variables will be non-zero but with small values oscillating around + # zero: + assert max(np.abs(noisy_importances)) > 1e-7 + assert noisy_importances.max() < 0.05 + + # The binary features correlated with y should have a higher importance + # than the high cardinality noisy features. + # The maximum test accuracy is 2 / 5 == 0.4, each informative feature + # contributing approximately a bit more than 0.2 of accuracy. + assert informative_importances.min() > 0.15 + + def test_permutation_importance_mixed_types(): rng = np.random.RandomState(42) n_repeats = 4 From 74a08746d471b08463ff79c417246f58d54fec5d Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 15:18:06 +0000 Subject: [PATCH 159/448] DOC Improve defaults in neural network documentation (#15968) --- .../neural_network/_multilayer_perceptron.py | 136 +++++++++--------- sklearn/neural_network/_rbm.py | 46 +++--- .../neural_network/_stochastic_optimizers.py | 18 +-- 3 files changed, 100 insertions(+), 100 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index b6367d32e57a9..51af0e33139dd 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -140,13 +140,13 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, Parameters ---------- - packed_coef_inter : array-like + packed_coef_inter : ndarray A vector comprising the flattened coefficients and intercepts. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. activations : list, length = n_layers - 1 @@ -185,10 +185,10 @@ def _backprop(self, X, y, activations, deltas, coef_grads, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. activations : list, length = n_layers - 1 @@ -613,10 +613,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. 
- y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -632,10 +632,10 @@ def partial_fit(self): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. Returns @@ -656,12 +656,12 @@ def _predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) @@ -698,11 +698,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -717,7 +717,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. @@ -733,15 +733,15 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter. - batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -759,55 +759,55 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Only used when ``solver='sgd'``. - learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. 
For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : boolean, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -816,29 +816,29 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): except in a multilabel setting. Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'. Maximum number of loss function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of loss function calls. 
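For context, the MLPClassifier defaults documented in the hunk above correspond to usage along the lines of the following minimal sketch. It is illustrative only: the toy dataset and the decision to switch on early stopping are assumptions for demonstration rather than part of this diff::

    from sklearn.datasets import make_classification
    from sklearn.neural_network import MLPClassifier

    # Small synthetic problem; any (n_samples, n_features) input works here.
    X, y = make_classification(n_samples=200, random_state=0)

    # Every value below matches the documented default except early_stopping,
    # which is enabled to exercise validation_fraction and n_iter_no_change.
    clf = MLPClassifier(hidden_layer_sizes=(100,),   # default
                        activation='relu',           # default
                        solver='adam',               # default
                        learning_rate_init=0.001,    # default
                        early_stopping=True,         # default is False
                        validation_fraction=0.1,     # default
                        n_iter_no_change=10,         # default
                        random_state=0)
    clf.fit(X, y)
    print(clf.n_iter_, clf.loss_)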
@@ -849,7 +849,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Attributes ---------- - classes_ : array or list of array of shape (n_classes,) + classes_ : ndarray or list of ndarray of shape (n_classes,) Class labels for each output. loss_ : float @@ -959,12 +959,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples,) or (n_samples, n_classes) + y : ndarray, shape (n_samples,) or (n_samples, n_classes) The predicted classes. """ check_is_fitted(self) @@ -980,10 +980,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray, shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -1041,12 +1041,12 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The input data. Returns ------- - log_y_prob : array-like, shape (n_samples, n_classes) + log_y_prob : ndarray of shape (n_samples, n_classes) The predicted log-probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. Equivalent to log(predict_proba(X)) @@ -1059,12 +1059,12 @@ def predict_proba(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_prob : array-like, shape (n_samples, n_classes) + y_prob : ndarray of shape (n_samples, n_classes) The predicted probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ @@ -1090,11 +1090,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -1109,7 +1109,7 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. @@ -1125,15 +1125,15 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter. - batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. 
When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -1151,55 +1151,55 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Only used when solver='sgd'. - learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : boolean, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -1207,29 +1207,29 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): ``n_iter_no_change`` consecutive epochs. Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. 
Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'. Maximum number of function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of function calls. @@ -1321,12 +1321,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples, n_outputs) + y : ndarray of shape (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index efe3aeda951af..14960a8b2bb22 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -38,25 +38,25 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Parameters ---------- - n_components : int, optional + n_components : int, default=256 Number of binary hidden units. - learning_rate : float, optional + learning_rate : float, default=0.1 The learning rate for weight updates. It is *highly* recommended to tune this hyper-parameter. Reasonable values are in the 10**[0., -3.] range. - batch_size : int, optional + batch_size : int, default=10 Number of examples per minibatch. - n_iter : int, optional + n_iter : int, default=10 Number of iterations/sweeps over the training dataset to perform during training. - verbose : int, optional + verbose : int, default=0 The verbosity level. The default, zero, means silent mode. - random_state : integer or RandomState, optional + random_state : integer or RandomState, default=None A random number generator instance to define the state of the random permutations generator. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. @@ -113,12 +113,12 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The data to be transformed. Returns ------- - h : array, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Latent representations of the data. """ check_is_fitted(self) @@ -131,12 +131,12 @@ def _mean_hiddens(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Corresponding mean field values for the hidden layer. 
""" p = safe_sparse_dot(v, self.components_.T) @@ -148,7 +148,7 @@ def _sample_hiddens(self, v, rng): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to sample from. rng : RandomState @@ -156,7 +156,7 @@ def _sample_hiddens(self, v, rng): Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer. """ p = self._mean_hiddens(v) @@ -167,7 +167,7 @@ def _sample_visibles(self, h, rng): Parameters ---------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer to sample from. rng : RandomState @@ -175,7 +175,7 @@ def _sample_visibles(self, h, rng): Returns ------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. """ p = np.dot(h, self.components_) @@ -188,12 +188,12 @@ def _free_energy(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. Returns ------- - free_energy : array-like, shape (n_samples,) + free_energy : ndarray of shape (n_samples,) The value of the free energy. """ return (- safe_sparse_dot(v, self.intercept_visible_) @@ -205,12 +205,12 @@ def gibbs(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to start from. Returns ------- - v_new : array-like, shape (n_samples, n_features) + v_new : ndarray of shape (n_samples, n_features) Values of the visible layer after one Gibbs step. """ check_is_fitted(self) @@ -227,7 +227,7 @@ def partial_fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. Returns @@ -263,7 +263,7 @@ def _fit(self, v_pos, rng): Parameters ---------- - v_pos : array-like, shape (n_samples, n_features) + v_pos : ndarray of shape (n_samples, n_features) The data to use for training. rng : RandomState @@ -290,12 +290,12 @@ def score_samples(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Values of the visible layer. Must be all-boolean (not checked). Returns ------- - pseudo_likelihood : array-like, shape (n_samples,) + pseudo_likelihood : ndarray of shape (n_samples,) Value of the pseudo-likelihood (proxy for likelihood). Notes @@ -328,7 +328,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Returns diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 3e49e94de8bd1..02fc53a7aecc2 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -16,7 +16,7 @@ class BaseOptimizer: The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. 
It controls the step-size in updating the weights @@ -80,11 +80,11 @@ class SGDOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - lr_schedule : {'constant', 'adaptive', 'invscaling'}, default 'constant' + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' Learning rate schedule for weight updates. -'constant', is a constant learning rate given by @@ -100,10 +100,10 @@ class SGDOptimizer(BaseOptimizer): tol, or fail to increase validation score by tol if 'early_stopping' is on, the current learning rate is divided by 5. - momentum : float, optional, default 0.9 + momentum : float, default=0.9 Value of momentum used, must be larger than or equal to 0 - nesterov : bool, optional, default True + nesterov : bool, default=True Whether to use nesterov's momentum or not. Use nesterov's if True Attributes @@ -192,19 +192,19 @@ class AdamOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector, should be in [0, 1) - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector, should be in [0, 1) - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability Attributes From d163d5ad9433d11b36fb3ce580d97c9462087a40 Mon Sep 17 00:00:00 2001 From: Niklas Date: Fri, 27 Dec 2019 04:31:09 -0500 Subject: [PATCH 160/448] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15868) --- doc/whats_new/v0.22.rst | 8 ++++ sklearn/semi_supervised/_label_propagation.py | 3 +- .../tests/test_label_propagation.py | 39 +++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 370f02fda49f9..487b3254f13a9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -53,6 +53,14 @@ Changelog - |Fix| :func:`metrics.classification_report` does no longer ignore the value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. + +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.LabelPropagation` and + :class:`semi_supervised.LabelSpreading` now allow callable kernel function to + return sparse weight matrix. + :pr:`15868` by :user:`Niklas Smedemark-Margulies `. :mod:`sklearn.utils` .................... 
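The code change for this entry follows in the next diff. The gist is that ``safe_sparse_dot`` handles a scipy sparse weight matrix returned by a callable kernel, which plain ``np.dot`` did not. A minimal sketch of the helper being relied on (illustrative only; the shapes and density are arbitrary assumptions)::

    import numpy as np
    from scipy import sparse
    from sklearn.utils.extmath import safe_sparse_dot

    # A sparse weight matrix such as a callable kernel might return.
    W = sparse.random(5, 3, density=0.5, format='csr', random_state=0)
    label_distributions = np.full((3, 4), 0.25)

    # Dispatches to sparse @ dense and yields a dense (5, 4) result.
    out = safe_sparse_dot(W, label_distributions)
    print(out.shape)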
diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 0ec687aae7d20..665b50dcfa507 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -195,7 +195,8 @@ class labels for weight_matrix in weight_matrices]) else: weight_matrices = weight_matrices.T - probabilities = np.dot(weight_matrices, self.label_distributions_) + probabilities = safe_sparse_dot( + weight_matrices, self.label_distributions_) normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T probabilities /= normalizer return probabilities diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 7e20350b20b2f..d983ab854948b 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -3,10 +3,13 @@ import numpy as np import pytest +from scipy.sparse import issparse from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification from sklearn.exceptions import ConvergenceWarning from numpy.testing import assert_array_almost_equal @@ -152,3 +155,39 @@ def test_convergence_warning(): mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric='euclidean', n_jobs=-1) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma + np.exp(W.data, out=W.data) + assert issparse(W) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=n_test, + random_state=0) + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 + + model = label_propagation.LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 From 9accce5519bdc7733c28f52ec25310ab78ec7dfe Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 27 Dec 2019 05:46:27 -0500 Subject: [PATCH 161/448] BUG Adds attributes back to check_is_fitted (#15947) Co-authored-by: Olivier Grisel --- doc/whats_new/v0.22.rst | 8 +++ sklearn/feature_extraction/tests/test_text.py | 1 + sklearn/feature_extraction/text.py | 8 ++- sklearn/utils/tests/test_validation.py | 49 ++++++++++++++++++- sklearn/utils/validation.py | 26 ++++++++-- 5 files changed, 84 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 487b3254f13a9..a362f839bcca7 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -68,6 +68,14 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. 
+- |Fix| :func:`utils.check_is_fitted` accepts back an explicit ``attributes`` + argument to check for specific attributes as explicit markers of a fitted + estimator. When no explicit ``attributes`` are provided, only the attributes + ending with a single "_" are used as "fitted" markers. The ``all_or_any`` + argument is also no longer deprecated. This change is made to + restore some backward compatibility with the behavior of this utility in + version 0.21. :pr:`15947` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 25c353aae5276..f8f741a862594 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1099,6 +1099,7 @@ def test_vectorizer_string_object_as_input(Vectorizer): assert_raise_message( ValueError, message, vec.fit_transform, "hello world!") assert_raise_message(ValueError, message, vec.fit, "hello world!") + vec.fit(["some text", "some other text"]) assert_raise_message(ValueError, message, vec.transform, "hello world!") diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 9771c62204444..82ba60a18da28 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1498,7 +1498,11 @@ def transform(self, X, copy=True): X.data += 1 if self.use_idf: - check_is_fitted(self, msg='idf vector is not fitted') + # idf_ being a property, the automatic attributes detection + # does not work as usual and we need to specify the attribute + # name: + check_is_fitted(self, attributes=["idf_"], + msg='idf vector is not fitted') expected_n_features = self._idf_diag.shape[0] if n_features != expected_n_features: @@ -1883,7 +1887,7 @@ def transform(self, raw_documents, copy="deprecated"): X : sparse matrix, [n_samples, n_features] Tf-idf-weighted document-term matrix. 
""" - check_is_fitted(self, msg='The tfidf vector is not fitted') + check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted') # FIXME Remove copy parameter support in 0.24 if copy != "deprecated": diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f121f11658051..b298424267067 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -5,6 +5,7 @@ from tempfile import NamedTemporaryFile from itertools import product +from operator import itemgetter import pytest from pytest import importorskip @@ -14,7 +15,6 @@ from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_no_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import SkipTest @@ -50,7 +50,6 @@ import sklearn from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning -from sklearn.exceptions import DataConversionWarning from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import TempMemmap @@ -678,6 +677,52 @@ def test_check_is_fitted(): assert check_is_fitted(svr) is None +def test_check_is_fitted_attributes(): + class MyEstimator(): + def fit(self, X, y): + return self + + msg = "not fitted" + est = MyEstimator() + + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"]) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + est.a_ = "a" + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"]) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + est.b_ = "b" + check_is_fitted(est, attributes=["a_", "b_"]) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + +@pytest.mark.parametrize("wrap", + [itemgetter(0), list, tuple], + ids=["single", "list", "tuple"]) +def test_check_is_fitted_with_attributes(wrap): + ard = ARDRegression() + with pytest.raises(NotFittedError, match="is not fitted yet"): + check_is_fitted(ard, wrap(["coef_"])) + + ard.fit(*make_blobs()) + + # Does not raise + check_is_fitted(ard, wrap(["coef_"])) + + # Raises when using attribute that is not defined + with pytest.raises(NotFittedError, match="is not fitted yet"): + check_is_fitted(ard, wrap(["coef_bad_"])) + + def test_check_consistent_length(): check_consistent_length([1], [2], [3], [4], [5]) check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b']) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e08495de30af5..2248389d0b3b1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -852,18 +852,29 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, msg=None): +def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. 
Checks if the estimator is fitted by verifying the presence of fitted attributes (ending with a trailing underscore) and otherwise raises a NotFittedError with the given message. + This utility is meant to be used internally by estimators themselves, + typically in their own predict / transform methods. + Parameters ---------- estimator : estimator instance. estimator instance for which the check is performed. + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + msg : string The default error message is, "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this @@ -874,6 +885,9 @@ def check_is_fitted(estimator, msg=None): Eg. : "Estimator, %(name)s, must be fitted before sparsifying". + all_or_any : callable, {all, any}, default all + Specify whether all or any of the given attributes must exist. + Returns ------- None @@ -892,9 +906,13 @@ def check_is_fitted(estimator, msg=None): if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) - attrs = [v for v in vars(estimator) - if (v.endswith("_") or v.startswith("_")) - and not v.startswith("__")] + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + attrs = all_or_any([hasattr(estimator, attr) for attr in attributes]) + else: + attrs = [v for v in vars(estimator) + if v.endswith("_") and not v.startswith("__")] if not attrs: raise NotFittedError(msg % {'name': type(estimator).__name__}) From 2287d2d229b763c7592127106c8947872e7326c2 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Fri, 27 Dec 2019 18:57:16 +0800 Subject: [PATCH 162/448] DOC update check_is_fitted what's new --- doc/whats_new/v0.22.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index a362f839bcca7..7ebe82a39b884 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -71,10 +71,10 @@ Changelog - |Fix| :func:`utils.check_is_fitted` accepts back an explicit ``attributes`` argument to check for specific attributes as explicit markers of a fitted estimator. When no explicit ``attributes`` are provided, only the attributes - ending with a single "_" are used as "fitted" markers. The ``all_or_any`` - argument is also no longer deprecated. This change is made to - restore some backward compatibility with the behavior of this utility in - version 0.21. :pr:`15947` by `Thomas Fan`_. + that end with a underscore and do not start with double underscore are used + as "fitted" markers. The ``all_or_any`` argument is also no longer + deprecated. This change is made to restore some backward compatibility with + the behavior of this utility in version 0.21. :pr:`15947` by `Thomas Fan`_. .. _changes_0_22: From d5dcf48c00dd5cdaa01296ebbd7ea06bda19ef47 Mon Sep 17 00:00:00 2001 From: Windber Date: Mon, 30 Dec 2019 02:16:48 +0800 Subject: [PATCH 163/448] DOC change python-devel to python3-devel for yum. 
(#15986) --- doc/developers/advanced_installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index d52dfba48c0e1..8fd0f9ecf0273 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -339,7 +339,7 @@ architecture (e.g. ARM), you can install the system versions:: On Red Hat and clones (e.g. CentOS), install the dependencies using:: - sudo yum -y install gcc gcc-c++ python-devel numpy scipy + sudo yum -y install gcc gcc-c++ python3-devel numpy scipy Linux compilers from conda-forge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 14bf5bc259c53ee4ebc5a8a048b9317785eb2a4d Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Sun, 29 Dec 2019 15:11:35 -0600 Subject: [PATCH 164/448] DOC Correct the default value of values_format in plot_confusion_matrix (#15981) --- sklearn/metrics/_plot/confusion_matrix.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index f759c4d5c1c3d..11a456aa635b1 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -61,8 +61,7 @@ def plot(self, include_values=True, cmap='viridis', values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2f' for a normalized matrix, and - 'd' for a unnormalized matrix. + the format specification is '.2g'. ax : matplotlib axes, default=None Axes object to plot on. If `None`, a new figure and axes is @@ -165,8 +164,7 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2f' for a normalized matrix, and - 'd' for a unnormalized matrix. + the format specification is '.2g'. cmap : str or matplotlib Colormap, default='viridis' Colormap recognized by matplotlib. From 1e166f77f83d8d973283e205e1e1b882b5a45bee Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 29 Dec 2019 18:18:40 -0500 Subject: [PATCH 165/448] [MRG] MNT Updates pypy to use 7.2.0 (#15954) --- .circleci/config.yml | 2 +- build_tools/circle/build_test_pypy.sh | 13 ++++++++----- conftest.py | 2 +- sklearn/exceptions.py | 5 ++--- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3933d5404202f..9fecc150ba297 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -101,7 +101,7 @@ jobs: pypy3: docker: - - image: pypy:3.6-7.1.1 + - image: pypy:3.6-7.2.0 steps: - restore_cache: keys: diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh index 60b81e60709f0..22e4790e7e4ab 100755 --- a/build_tools/circle/build_test_pypy.sh +++ b/build_tools/circle/build_test_pypy.sh @@ -18,11 +18,14 @@ source pypy-env/bin/activate python --version which python -# XXX: numpy version pinning can be reverted once PyPy -# compatibility is resolved for numpy v1.6.x. 
For instance, -# when PyPy3 >6.0 is released (see numpy/numpy#12740) -pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy Cython pytest -pip install scipy sphinx numpydoc docutils joblib pillow +pip install -U pip + +# pins versions to install wheel from https://antocuni.github.io/pypy-wheels/manylinux2010 +pip install --extra-index-url https://antocuni.github.io/pypy-wheels/manylinux2010 numpy==1.18.0 scipy==1.3.2 + +# Install Cython directly +pip install https://antocuni.github.io/pypy-wheels/ubuntu/Cython/Cython-0.29.14-py3-none-any.whl +pip install sphinx numpydoc docutils joblib pillow pytest ccache -M 512M export CCACHE_COMPRESS=1 diff --git a/conftest.py b/conftest.py index f640c0e3d001f..b98bb4b271aca 100644 --- a/conftest.py +++ b/conftest.py @@ -37,7 +37,7 @@ def pytest_collection_modifyitems(config, items): skip_marker = pytest.mark.skip( reason='FeatureHasher is not compatible with PyPy') for item in items: - if item.name.endswith(('hashing.FeatureHasher', + if item.name.endswith(('_hash.FeatureHasher', 'text.HashingVectorizer')): item.add_marker(skip_marker) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 4acf7863dd682..ea34365afa703 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -139,9 +139,8 @@ class FitFailedWarning(RuntimeWarning): ... print(repr(w[-1].message)) FitFailedWarning('Estimator fit failed. The score on this train-test partition for these parameters will be set to 0.000000. - Details: - \\nTraceback (most recent call last):...\\nValueError: - Penalty term must be positive; got (C=-2)\\n') + Details:...Traceback (most recent call last):...ValueError: + Penalty term must be positive; got (C=-2)... .. versionchanged:: 0.18 From dae52f9cb37142acd65fe8eb5146c7703af1340d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 30 Dec 2019 12:08:15 -0500 Subject: [PATCH 166/448] MNT Only tweets on main repo (#15993) --- .github/workflows/twitter.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/twitter.yml b/.github/workflows/twitter.yml index d0b41e1c684a0..ac2f037246257 100644 --- a/.github/workflows/twitter.yml +++ b/.github/workflows/twitter.yml @@ -15,6 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Tweet URL of last commit as @sklearn_commits + if: github.repository == 'scikit-learn/scikit-learn' uses: xorilog/twitter-action@0.1 with: args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" From eb3ad2dd5d5244fa384998596a2d6820aa8b16d7 Mon Sep 17 00:00:00 2001 From: Stephen Blystone <29995339+blynotes@users.noreply.github.com> Date: Mon, 30 Dec 2019 21:40:32 -0600 Subject: [PATCH 167/448] FIX Add missing 'values_format' param to disp.plot() in plot_confusion_matrix (#15937) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/metrics/_plot/confusion_matrix.py | 3 ++- .../_plot/tests/test_plot_confusion_matrix.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7ebe82a39b884..74b20810de6db 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -54,6 +54,10 @@ Changelog value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. +- |Fix| Fixed a bug in :func:`metrics.plot_confusion_matrix` to correctly + pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` + plot() call. :pr:`15937` by :user:`Stephen Blystone `. + :mod:`sklearn.semi_supervised` .............................. 
diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 11a456aa635b1..537d2b9f0d838 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -195,4 +195,5 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels) return disp.plot(include_values=include_values, - cmap=cmap, ax=ax, xticks_rotation=xticks_rotation) + cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, + values_format=values_format) diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index 2d53e6bf24dc0..9f708b151b81b 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -245,3 +245,22 @@ def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes): assert_allclose(disp.confusion_matrix, cm) assert disp.text_.shape == (n_classes, n_classes) + + +@pytest.mark.parametrize("values_format", ['e', 'n']) +def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, + fitted_clf, values_format): + # Make sure plot text is formatted with 'values_format'. + X, y = data + cm = confusion_matrix(y, y_pred) + disp = plot_confusion_matrix(fitted_clf, X, y, + include_values=True, + values_format=values_format) + + assert disp.text_.shape == (n_classes, n_classes) + + expected_text = np.array([format(v, values_format) + for v in cm.ravel()]) + text_text = np.array([ + t.get_text() for t in disp.text_.ravel()]) + assert_array_equal(expected_text, text_text) From 46001e7cfb163b73a83d5adf73fb454e8a3c64b6 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 31 Dec 2019 07:02:45 -0500 Subject: [PATCH 168/448] FIX support scalar values in fit_params in SearchCV (#15863) * support a scalar fit param * pep8 * TST add test for desired behavior * FIX introduce _check_fit_params to validate parameters * DOC update whats new * TST tests both grid-search and randomize-search * PEP8 * DOC revert unecessary change * TST add test for _check_fit_params * olivier comments * TST fixes * DOC whats new * DOC whats new * TST revert type of error * add olivier suggestions * address olivier comments * address thomas comments * PEP8 * comments olivier * TST fix test by passing X * avoid to call twice tocsr * add case column/row sparse in check_fit_param * provide optional indices * TST check content when indexing params * PEP8 * TST update tests to check identity * stupid fix * use a distribution in RandomizedSearchCV * MNT add lightgbm to one of the CI build * move to another build * do not install dependencies lightgbm * MNT comments on the CI setup * address some comments * Test fit_params compat without dependency on lightgbm Co-authored-by: Guillaume Lemaitre Co-authored-by: Olivier Grisel --- azure-pipelines.yml | 3 +- build_tools/azure/install.sh | 2 + doc/whats_new/v0.22.rst | 9 ++ sklearn/model_selection/_search.py | 6 +- sklearn/model_selection/_validation.py | 17 +-- sklearn/model_selection/tests/test_search.py | 110 +++++++++++++++---- sklearn/utils/tests/test_validation.py | 30 +++++ sklearn/utils/validation.py | 69 ++++++++++-- 8 files changed, 195 insertions(+), 51 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e2ff71802ce72..b029a2fd18574 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -71,10 +71,9 @@ jobs: JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # 
Linux environment to test the latest available dependencies and MKL. - # It runs tests requiring pandas and PyAMG. + # It runs tests requiring lightgbm, pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # FIXME: pinned until SciPy wheels are available for Python 3.8 PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index aa2707bb03837..1ef981b5dd6e8 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -92,6 +92,8 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist python -m pip install pandas matplotlib pyamg + # do not install dependencies for lightgbm since it requires scikit-learn + python -m pip install lightgbm --no-deps fi if [[ "$COVERAGE" == "true" ]]; then diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 74b20810de6db..40f4ef92143d9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -66,6 +66,15 @@ Changelog return sparse weight matrix. :pr:`15868` by :user:`Niklas Smedemark-Margulies `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` accept scalar values provided in + `fit_params`. Change in 0.22 was breaking backward compatibility. + :pr:`15863` by :user:`Adrin Jalali ` and + :user:`Guillaume Lemaitre `. + :mod:`sklearn.utils` .................... diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e6a8493ef6250..a70bdd7a2f9dc 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -33,7 +33,7 @@ from ..utils import check_random_state from ..utils.fixes import MaskedArray from ..utils.random import sample_without_replacement -from ..utils.validation import indexable, check_is_fitted +from ..utils.validation import indexable, check_is_fitted, _check_fit_params from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -648,9 +648,7 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = 'score' X, y, groups = indexable(X, y, groups) - # make sure fit_params are sliceable - fit_params_values = indexable(*fit_params.values()) - fit_params = dict(zip(fit_params.keys(), fit_params_values)) + fit_params = _check_fit_params(X, fit_params) n_splits = cv.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 2fe4fcd7ff392..f841484ce8eb0 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -23,6 +23,7 @@ from ..base import is_classifier, clone from ..utils import (indexable, check_random_state, _safe_indexing, _message_with_time) +from ..utils.validation import _check_fit_params from ..utils.validation import _is_arraylike, _num_samples from ..utils.metaestimators import _safe_split from ..metrics import check_scoring @@ -489,8 +490,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, fit_params, train) train_scores = {} if parameters is not None: @@ -830,8 
+830,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, fit_params, train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -941,16 +940,6 @@ def _check_is_permutation(indices, n_samples): return True -def _index_param_value(X, v, indices): - """Private helper function for parameter value indexing.""" - if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): - # pass through: skip indexing - return v - if sp.issparse(v): - v = v.tocsr() - return _safe_indexing(v, indices) - - def permutation_test_score(estimator, X, y, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 056927bee75d0..bacfc20eb1fc1 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -27,7 +27,7 @@ from scipy.stats import bernoulli, expon, uniform -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification @@ -36,6 +36,7 @@ from sklearn.model_selection import fit_grid_point from sklearn.model_selection import cross_val_score +from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedShuffleSplit @@ -218,33 +219,25 @@ def test_grid_search_pipeline_steps(): assert not hasattr(param_grid['regressor'][1], 'coef_') -def check_hyperparameter_searcher_with_fit_params(klass, **klass_kwargs): +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): X = np.arange(100).reshape(10, 10) y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(expected_fit_params=['spam', 'eggs']) - searcher = klass(clf, {'foo_param': [1, 2, 3]}, cv=2, **klass_kwargs) + searcher = SearchCV( + clf, {'foo_param': [1, 2, 3]}, cv=2, error_score="raise" + ) # The CheckingClassifier generates an assertion error if # a parameter is missing or has length != len(X). - assert_raise_message(AssertionError, - "Expected fit parameter(s) ['eggs'] not seen.", - searcher.fit, X, y, spam=np.ones(10)) - assert_raise_message( - ValueError, - "Found input variables with inconsistent numbers of samples: [", - searcher.fit, X, y, spam=np.ones(1), - eggs=np.zeros(10)) - searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) - -def test_grid_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(GridSearchCV, - error_score='raise') - - -def test_random_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(RandomizedSearchCV, n_iter=1, - error_score='raise') + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) @ignore_warnings @@ -1846,3 +1839,78 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): attr_message = "GridSearchCV not identical with precomputed metric" assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'a': [0.1, 0.01]}), + (RandomizedSearchCV, {'a': uniform(1, 3)})] +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(BaseEstimator, ClassifierMixin): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'alpha': [0.1, 0.01]}), + (RandomizedSearchCV, {'alpha': uniform(0.01, 0.1)})] +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + + def fit(self, X, y, sample_weight=None, tuple_of_arrays=None, + scalar_param=None, callable_param=None): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV( + _FitParamClassifier(), param_search + ) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. 
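From a user's point of view, the behaviour exercised by the tests above and below amounts to the following rough sketch; the estimator, grid and weights are arbitrary, and the point is that per-sample arrays in ``fit_params`` are sliced per CV split while scalars and other non-indexable values are simply passed through to ``fit``::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(random_state=0)
    search = GridSearchCV(SGDClassifier(), {'alpha': [1e-4, 1e-3]}, cv=3)
    # sample_weight has one entry per sample, so it is indexed along with
    # each train split; non per-sample values would be forwarded unchanged.
    search.fit(X, y, sample_weight=np.ones(len(y)))
    print(search.best_params_)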
+ fit_params = { + 'tuple_of_arrays': (X_valid, y_valid), + 'callable_param': _fit_param_callable, + 'scalar_param': 42, + } + model.fit(X_train, y_train, **fit_params) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b298424267067..cf4374f29e212 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -32,6 +32,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.datasets import make_blobs +from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, check_is_fitted, @@ -46,6 +47,7 @@ _check_sample_weight, _allclose_dense_sparse, FLOAT_DTYPES) +from sklearn.utils.validation import _check_fit_params import sklearn @@ -1098,3 +1100,31 @@ def __init__(self, a=1, b=1, *, c=1, d=1): with pytest.warns(FutureWarning, match=r"Pass c=3, d=4 as keyword args"): A2(1, 2, 3, 4) + + +@pytest.mark.parametrize("indices", [None, [1, 3]]) +def test_check_fit_params(indices): + X = np.random.randn(4, 2) + fit_params = { + 'list': [1, 2, 3, 4], + 'array': np.array([1, 2, 3, 4]), + 'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T, + 'sparse-row': sp.csc_matrix([1, 2, 3, 4]), + 'scalar-int': 1, + 'scalar-str': 'xxx', + 'None': None, + } + result = _check_fit_params(X, fit_params, indices) + indices_ = indices if indices is not None else list(range(X.shape[0])) + + for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']: + assert result[key] is fit_params[key] + + assert result['list'] == _safe_indexing(fit_params['list'], indices_) + assert_array_equal( + result['array'], _safe_indexing(fit_params['array'], indices_) + ) + assert_allclose_dense_sparse( + result['sparse-col'], + _safe_indexing(fit_params['sparse-col'], indices_) + ) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2248389d0b3b1..9929ff2f35502 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -212,6 +212,26 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) +def _make_indexable(iterable): + """Ensure iterable supports indexing or convert to an indexable variant. + + Convert sparse matrices to csr and other non-indexable iterable to arrays. + Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged. + + Parameters + ---------- + iterable : {list, dataframe, array, sparse} or None + Object to be converted to an indexable iterable. + """ + if sp.issparse(iterable): + return iterable.tocsr() + elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"): + return iterable + elif iterable is None: + return iterable + return np.array(iterable) + + def indexable(*iterables): """Make arrays indexable for cross-validation. @@ -224,16 +244,7 @@ def indexable(*iterables): *iterables : lists, dataframes, arrays, sparse matrices List of objects to ensure sliceability. """ - result = [] - for X in iterables: - if sp.issparse(X): - result.append(X.tocsr()) - elif hasattr(X, "__getitem__") or hasattr(X, "iloc"): - result.append(X) - elif X is None: - result.append(X) - else: - result.append(np.array(X)) + result = [_make_indexable(X) for X in iterables] check_consistent_length(*result) return result @@ -1277,3 +1288,41 @@ def inner_f(*args, **kwargs): kwargs.update({k: arg for k, arg in zip(all_args, args)}) return f(**kwargs) return inner_f + + +def _check_fit_params(X, fit_params, indices=None): + """Check and validate the parameters passed during `fit`. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data array. + + fit_params : dict + Dictionary containing the parameters passed at fit. + + indices : array-like of shape (n_samples,), default=None + Indices to be selected if the parameter has the same size as `X`. + + Returns + ------- + fit_params_validated : dict + Validated parameters. We ensure that the values support indexing. + """ + from . import _safe_indexing + fit_params_validated = {} + for param_key, param_value in fit_params.items(): + if (not _is_arraylike(param_value) or + _num_samples(param_value) != _num_samples(X)): + # Non-indexable pass-through (for now for backward-compatibility). + # https://github.com/scikit-learn/scikit-learn/issues/15805 + fit_params_validated[param_key] = param_value + else: + # Any other fit_params should support indexing + # (e.g. for cross-validation). + fit_params_validated[param_key] = _make_indexable(param_value) + fit_params_validated[param_key] = _safe_indexing( + fit_params_validated[param_key], indices + ) + + return fit_params_validated From 70d8b70d7a3fbd2ead8c27ed9190024ff10372ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Tue, 31 Dec 2019 06:19:15 -0800 Subject: [PATCH 169/448] Remove abstractmethod that silently brake downstream packages (#15996) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/naive_bayes.py | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 40f4ef92143d9..e36d7e925529d 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -58,6 +58,13 @@ Changelog pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` plot() call. :pr:`15937` by :user:`Stephen Blystone `. +:mod:`sklearn.naive_bayes` +.......................... + +- |Fix| removed abstract method `_check_X` from :class:`naive_bayes.BaseNB` + that could break downstream projects inheriting from this deprecated + public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index cebc428e17b12..d958645b178f6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -51,12 +51,6 @@ def _joint_log_likelihood(self, X): predict_proba and predict_log_proba. """ - @abstractmethod - def _check_X(self, X): - """Validate input X - """ - pass - def predict(self, X): """ Perform classification on an array of test vectors X. From f28a90c9aa552740a6fe4d9eccc409f82392019b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 1 Jan 2020 04:11:17 +0100 Subject: [PATCH 170/448] FIX restore BaseNB._check_X without abstractmethod decoration (#15997) --- doc/whats_new/v0.22.rst | 7 ++++--- sklearn/naive_bayes.py | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e36d7e925529d..394fd6ee8203c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -61,9 +61,10 @@ Changelog :mod:`sklearn.naive_bayes` .......................... -- |Fix| removed abstract method `_check_X` from :class:`naive_bayes.BaseNB` - that could break downstream projects inheriting from this deprecated - public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. +- |Fix| Removed `abstractmethod` decorator for the method `_check_X` in + :class:`naive_bayes.BaseNB` that could break downstream projects inheriting + from this deprecated public base class. 
:pr:`15996` by + :user:`Brigitta Sipőcz `. :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d958645b178f6..22bd339cbd6b0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -51,6 +51,14 @@ def _joint_log_likelihood(self, X): predict_proba and predict_log_proba. """ + def _check_X(self, X): + """To be overridden in subclasses with the actual checks.""" + # Note that this is not marked @abstractmethod as long as the + # deprecated public alias sklearn.naive_bayes.BayesNB exists + # (until 0.24) to preserve backward compat for 3rd party projects + # with existing derived classes. + return X + def predict(self, X): """ Perform classification on an array of test vectors X. From 072bfc95c4732f540003bd0cb76854588f6af986 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 10:25:39 +0100 Subject: [PATCH 171/448] Update v0.22 changelog for 0.22.1 (#16002) - set the date - move entry for quantile transformer to the 0.22.1 section - fix alphabetical ordering of modules --- doc/whats_new/v0.22.rst | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 394fd6ee8203c..d5d7fddf7417d 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -7,7 +7,7 @@ Version 0.22.1 ============== -**In Development** +**January 2 2020** This is a bug-fix release to primarily resolve some packaging issues in version 0.22.0. It also includes minor documentation improvements and some bug fixes. @@ -53,11 +53,20 @@ Changelog - |Fix| :func:`metrics.classification_report` does no longer ignore the value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. - + - |Fix| Fixed a bug in :func:`metrics.plot_confusion_matrix` to correctly pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` plot() call. :pr:`15937` by :user:`Stephen Blystone `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` accept scalar values provided in + `fit_params`. Change in 0.22 was breaking backward compatibility. + :pr:`15863` by :user:`Adrin Jalali ` and + :user:`Guillaume Lemaitre `. + :mod:`sklearn.naive_bayes` .......................... @@ -66,6 +75,13 @@ Changelog from this deprecated public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.semi_supervised` .............................. @@ -74,15 +90,6 @@ Changelog return sparse weight matrix. :pr:`15868` by :user:`Niklas Smedemark-Margulies `. -:mod:`sklearn.model_selection` -.............................. - -- |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` accept scalar values provided in - `fit_params`. Change in 0.22 was breaking backward compatibility. - :pr:`15863` by :user:`Adrin Jalali ` and - :user:`Guillaume Lemaitre `. - :mod:`sklearn.utils` .................... @@ -859,10 +866,6 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. 
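The practical consequence of the two ``BaseNB`` patches above is that a third-party subclass which only implements ``_joint_log_likelihood`` can be instantiated again. An illustrative sketch, not part of the patches; the subclass below is made up and instantiating it will emit a deprecation warning for the public alias::

    import numpy as np
    from sklearn.naive_bayes import BaseNB  # deprecated alias, kept until 0.24

    class ThirdPartyNB(BaseNB):
        """Toy downstream estimator that predates the _check_X requirement."""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def _joint_log_likelihood(self, X):
            # uniform scores over the known classes, just to have something
            return np.zeros((len(X), len(self.classes_)))

    # With _check_X abstract (0.22.0) this raised TypeError at instantiation;
    # with a default implementation restored it works again.
    clf = ThirdPartyNB().fit([[0.0], [1.0]], [0, 1])
    print(clf.predict([[0.5]]))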
-- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the - `quantiles_` attribute to be completely sorted in non-decreasing manner. - :pr:`15751` by :user:`Tirth Patel `. - :mod:`sklearn.model_selection` .............................. From c8c21ae18a7289d8ae9f837946e1e7f85b0337b8 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 2 Jan 2020 04:39:41 -0500 Subject: [PATCH 172/448] STY Removes hidden scroll bar (#15999) --- doc/themes/scikit-learn-modern/static/css/theme.css | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 782800eb31915..a77fb03e36f65 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -518,6 +518,16 @@ div.sk-sidebar-toc-wrapper { overflow-y: scroll; height: 100vh; padding-right: 1.75rem; + + /* Hide scrollbar for IE and Edge */ + -ms-overflow-style: none; + + /* Hide scrollbar for Firefox */ + scrollbar-width: none; +} + +div.sk-sidebar-toc-wrapper::-webkit-scrollbar { + display: none; } div.sk-sidebar-toc-wrapper::after { From 4d5407c89fb2b25ef5cba90470cbdaecf10064cc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 14:13:27 +0100 Subject: [PATCH 173/448] Make test_sag_regressor_computed_correctly deterministic (#16003) Fix #15818. --- sklearn/linear_model/tests/test_sag.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 6a591288b55d8..6bb156c64715b 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -120,7 +120,7 @@ def sag(X, y, step_size, alpha, n_iter=1, dloss=None, sparse=False, def sag_sparse(X, y, step_size, alpha, n_iter=1, dloss=None, sample_weight=None, sparse=False, - fit_intercept=True, saga=False): + fit_intercept=True, saga=False, random_state=0): if step_size * alpha == 1.: raise ZeroDivisionError("Sparse sag does not handle the case " "step_size * alpha == 1") @@ -130,7 +130,7 @@ def sag_sparse(X, y, step_size, alpha, n_iter=1, sum_gradient = np.zeros(n_features) last_updated = np.zeros(n_features, dtype=np.int) gradient_memory = np.zeros(n_samples) - rng = np.random.RandomState(77) + rng = check_random_state(random_state) intercept = 0.0 intercept_sum_gradient = 0.0 wscale = 1.0 @@ -368,7 +368,7 @@ def test_sag_regressor_computed_correctly(): alpha = .1 n_features = 10 n_samples = 40 - max_iter = 50 + max_iter = 100 tol = .000001 fit_intercept = True rng = np.random.RandomState(0) @@ -378,7 +378,8 @@ def test_sag_regressor_computed_correctly(): step_size = get_step_size(X, alpha, fit_intercept, classification=False) clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag', - alpha=alpha * n_samples, max_iter=max_iter) + alpha=alpha * n_samples, max_iter=max_iter, + random_state=rng) clf2 = clone(clf1) clf1.fit(X, y) @@ -387,12 +388,14 @@ def test_sag_regressor_computed_correctly(): spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha, n_iter=max_iter, dloss=squared_dloss, - fit_intercept=fit_intercept) + fit_intercept=fit_intercept, + random_state=rng) spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha, n_iter=max_iter, dloss=squared_dloss, sparse=True, - fit_intercept=fit_intercept) + fit_intercept=fit_intercept, + random_state=rng) assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(), From 
df1364ca11a49befa369dc0b4f6a63cb390c2853 Mon Sep 17 00:00:00 2001 From: Manish Aradwad Date: Fri, 3 Jan 2020 15:11:06 +0530 Subject: [PATCH 174/448] DOC Update docstring of random_state in _kmeans.py (#16007) --- sklearn/cluster/_kmeans.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 71be86a087629..3bfc6328b9ac1 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -53,9 +53,8 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): x_squared_norms : array, shape (n_samples,) Squared Euclidean norm of each data point. - random_state : int, RandomState instance - The generator used to initialize the centers. Use an int to make the - randomness deterministic. + random_state : RandomState instance + The generator used to initialize the centers. See :term:`Glossary `. n_local_trials : integer, optional From 54c6739f8e35ebb2805c2b508e012d2fd6fc893b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20S=C5=82apek?= <28485371+mslapek@users.noreply.github.com> Date: Fri, 3 Jan 2020 11:47:05 +0100 Subject: [PATCH 175/448] DOC replace best_parameters_ with best_params_ in SearchCV docstring (#16005) --- sklearn/model_selection/_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index a70bdd7a2f9dc..934ec0df6b116 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -944,7 +944,7 @@ class GridSearchCV(BaseSearchCV): Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given ``cv_results_``. In that - case, the ``best_estimator_`` and ``best_parameters_`` will be set + case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. @@ -1274,7 +1274,7 @@ class RandomizedSearchCV(BaseSearchCV): Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given the ``cv_results``. In that - case, the ``best_estimator_`` and ``best_parameters_`` will be set + case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. From e5c54ca854f8056f0f219eb691ddbe3c93cbc6a4 Mon Sep 17 00:00:00 2001 From: mo <31044045+mghah@users.noreply.github.com> Date: Fri, 3 Jan 2020 03:00:38 -0800 Subject: [PATCH 176/448] DOC Follow doc guideline for LinearRegression docstring (#15988) --- sklearn/linear_model/_base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index e6270dce6d906..d3b74b7404209 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -382,12 +382,12 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): Parameters ---------- - fit_intercept : bool, optional, default True + fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). 
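Both changes above lean on scikit-learn's ``random_state`` convention, implemented by :func:`sklearn.utils.check_random_state`: an int seed, ``None`` or an existing ``RandomState`` all resolve to a ``RandomState`` instance, so a test and the estimator it checks can share one generator. A tiny sketch::

    from sklearn.utils import check_random_state

    rng = check_random_state(0)             # int seed -> np.random.RandomState
    assert check_random_state(rng) is rng   # an instance passes through as-is
    print(rng.randint(10, size=3))          # deterministic for a given seed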
- normalize : bool, optional, default False + normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. @@ -395,10 +395,10 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : bool, optional, default True + copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This will only provide speedup for n_targets > 1 and sufficient large problems. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -419,7 +419,7 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): singular_ : array of shape (min(X, y),) Singular values of `X`. Only available when `X` is dense. - intercept_ : float or array of shape of (n_targets,) + intercept_ : float or array of shape (n_targets,) Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. From 07ceb6e2552a8af5e35dd652d24989b6a69dd7c7 Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Fri, 3 Jan 2020 14:32:54 +0000 Subject: [PATCH 177/448] DOC follow doc guideline in ensemble an feature_extraction modules (#15975) --- sklearn/ensemble/_bagging.py | 58 ++--- sklearn/ensemble/_base.py | 10 +- sklearn/ensemble/_forest.py | 219 +++++++++--------- sklearn/ensemble/_gb_losses.py | 156 ++++++------- sklearn/ensemble/_iforest.py | 45 ++-- sklearn/ensemble/_stacking.py | 6 +- sklearn/ensemble/_voting.py | 46 ++-- sklearn/ensemble/_weight_boosting.py | 34 +-- .../feature_extraction/_dict_vectorizer.py | 19 +- sklearn/feature_extraction/_hash.py | 10 +- sklearn/feature_extraction/image.py | 78 ++++--- sklearn/feature_extraction/text.py | 154 ++++++------ 12 files changed, 420 insertions(+), 415 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 2a9ed512113d8..ea4e5eedb6079 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -259,10 +259,10 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): The target values (class labels in classification, real numbers in regression). - max_samples : int or float, optional (default=None) + max_samples : int or float, default=None Argument to use instead of self.max_samples. - max_depth : int, optional (default=None) + max_depth : int, default=None Override value used when constructing base estimator. Only supported if the base estimator has a max_depth parameter. @@ -456,37 +456,37 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): Parameters ---------- - base_estimator : object or None, optional (default=None) + base_estimator : object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. - n_estimators : int, optional (default=10) + n_estimators : int, default=10 The number of base estimators in the ensemble. - max_samples : int or float, optional (default=1.0) + max_samples : int or float, default=1.0 The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. 
- max_features : int or float, optional (default=1.0) + max_features : int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. - bootstrap : boolean, optional (default=True) + bootstrap : bool, default=True Whether samples are drawn with replacement. If False, sampling without replacement is performed. - bootstrap_features : boolean, optional (default=False) + bootstrap_features : bool, default=False Whether features are drawn with replacement. - oob_score : bool, optional (default=False) + oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization error. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `. @@ -494,19 +494,19 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): .. versionadded:: 0.17 *warm_start* constructor parameter. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. Attributes @@ -527,7 +527,7 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): estimators_features_ : list of arrays The subset of drawn features for each base estimator. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. n_classes_ : int or list @@ -537,7 +537,7 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when ``oob_score`` is True. - oob_decision_function_ : array of shape (n_samples, n_classes) + oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, @@ -689,7 +689,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -739,7 +739,7 @@ def predict_log_proba(self, X): Returns ------- - p : array of shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -794,7 +794,7 @@ def decision_function(self, X): Returns ------- - score : array, shape = [n_samples, k] + score : ndarray of shape (n_samples, k) The decision function of the input samples. 
The columns correspond to the classes in sorted order, as they appear in the attribute ``classes_``. Regression and binary classification are special @@ -858,54 +858,54 @@ class BaggingRegressor(RegressorMixin, BaseBagging): Parameters ---------- - base_estimator : object or None, optional (default=None) + base_estimator : object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. - n_estimators : int, optional (default=10) + n_estimators : int, default=10 The number of base estimators in the ensemble. - max_samples : int or float, optional (default=1.0) + max_samples : int or float, default=1.0 The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - max_features : int or float, optional (default=1.0) + max_features : int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. - bootstrap : boolean, optional (default=True) + bootstrap : bool, default=True Whether samples are drawn with replacement. If False, sampling without replacement is performed. - bootstrap_features : boolean, optional (default=False) + bootstrap_features : bool, default=False Whether features are drawn with replacement. - oob_score : bool + oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization error. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. Attributes diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 9c6d8cbce0206..a7f018b94c54d 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -49,7 +49,7 @@ def _set_random_states(estimator, random_state=None): Estimator with potential randomness managed by random_state parameters. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -83,13 +83,13 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): Parameters ---------- - base_estimator : object, optional (default=None) + base_estimator : object, default=None The base estimator from which the ensemble is built. 
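As a usage-level companion to the bagging parameters documented in the hunks above, a small sketch; the base estimator and the sampling fractions are arbitrary::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    from sklearn.neighbors import KNeighborsClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    clf = BaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=10,
        max_samples=0.8,    # float: draw 80% of the samples per estimator
        max_features=0.5,   # float: draw 50% of the features per estimator
        oob_score=True,     # needs bootstrap=True, which is the default
        random_state=0,
    ).fit(X, y)
    print(clf.oob_score_)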
- n_estimators : integer + n_estimators : int The number of estimators in the ensemble. - estimator_params : list of strings + estimator_params : list of str The list of attributes to use as parameters when instantiating a new base estimator. If none are given, default parameters are used. @@ -276,7 +276,7 @@ def get_params(self, deep=True): Parameters ---------- - deep : bool + deep : bool, default=True Setting it to True gets the various classifiers and the parameters of the classifiers as well. """ diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7e88f0c2f189a..639845187b67d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -210,14 +210,14 @@ def apply(self, X): Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - X_leaves : array_like, shape = [n_samples, n_estimators] + X_leaves : ndarray of shape (n_samples, n_estimators) For each datapoint x in X and for each tree in the forest, return the index of the leaf x ends up in. """ @@ -237,18 +237,19 @@ def decision_path(self, X): Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - indicator : sparse csr array, shape = [n_samples, n_nodes] - Return a node indicator matrix where non zero elements - indicates that the samples goes through the nodes. + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator matrix where non zero elements indicates + that the samples goes through the nodes. The matrix is of CSR + format. - n_nodes_ptr : array of size (n_estimators + 1, ) + n_nodes_ptr : ndarray of size (n_estimators + 1,) The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] gives the indicator value for the i-th estimator. @@ -271,7 +272,7 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. @@ -419,7 +420,7 @@ def feature_importances_(self): Returns ------- - feature_importances_ : array, shape = [n_features] + feature_importances_ : ndarray of shape (n_features,) The values of this array sum to 1, unless all trees are single node trees consisting of only the root node, in which case it will be an array of zeros. @@ -599,14 +600,14 @@ def predict(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - y : array-like of shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. 
""" proba = self.predict_proba(X) @@ -639,14 +640,14 @@ def predict_proba(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - p : array of shape (n_samples, n_classes), or a list of n_outputs + p : ndarray of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. @@ -686,14 +687,14 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - p : array of shape (n_samples, n_classes), or a list of n_outputs + p : ndarray of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. @@ -751,14 +752,14 @@ def predict(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- - y : array-like of shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) @@ -848,24 +849,24 @@ class RandomForestClassifier(ForestClassifier): Parameters ---------- - n_estimators : integer, optional (default=100) + n_estimators : int, default=100 The number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : string, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Note: this parameter is tree-specific. - max_depth : integer or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -876,7 +877,7 @@ class RandomForestClassifier(ForestClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -891,12 +892,12 @@ class RandomForestClassifier(ForestClassifier): .. 
versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0. The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, string or None, optional (default="auto") + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -912,12 +913,12 @@ class RandomForestClassifier(ForestClassifier): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0. A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -935,7 +936,7 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.19 - min_impurity_split : float, (default=0) + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -946,38 +947,38 @@ class RandomForestClassifier(ForestClassifier): will be removed in 0.25. Use ``min_impurity_decrease`` instead. - bootstrap : boolean, optional (default=True) + bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the whole datset is used to build each tree. - oob_score : bool (default=False) + oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization accuracy. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Controls both the randomness of the bootstrapping of the samples used when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). See :term:`Glossary ` for details. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. - class_weight : dict, list of dicts, "balanced", "balanced_subsample" or \ - None, optional (default=None) + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. 
For multi-output problems, a list of dicts can be provided in the same @@ -1002,7 +1003,7 @@ class RandomForestClassifier(ForestClassifier): Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1051,7 +1052,7 @@ class labels (multi-output problem). Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when ``oob_score`` is True. - oob_decision_function_ : array of shape (n_samples, n_classes) + oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, @@ -1162,14 +1163,14 @@ class RandomForestRegressor(ForestRegressor): Parameters ---------- - n_estimators : integer, optional (default=10) + n_estimators : int, default=10 The number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : string, optional (default="mse") + criterion : {"mse", "mae"}, default="mse" The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean @@ -1178,12 +1179,12 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - max_depth : integer or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1194,7 +1195,7 @@ class RandomForestRegressor(ForestRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1209,12 +1210,12 @@ class RandomForestRegressor(ForestRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0. The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, string or None, optional (default="auto") + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. 
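The options documented in the random forest hunks above (``class_weight``, ``oob_score``, ``max_features``) combine as in the following sketch; the dataset and settings are arbitrary::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1],
                               random_state=0)
    clf = RandomForestClassifier(
        n_estimators=100,
        max_features="sqrt",
        class_weight="balanced_subsample",  # reweight within each bootstrap
        oob_score=True,
        random_state=0,
    ).fit(X, y)
    print(clf.oob_score_)
    print(clf.feature_importances_[:5])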
@@ -1230,12 +1231,12 @@ class RandomForestRegressor(ForestRegressor): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0. A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1253,7 +1254,7 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.19 - min_impurity_split : float, (default=0) + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -1263,37 +1264,37 @@ class RandomForestRegressor(ForestRegressor): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - bootstrap : boolean, optional (default=True) + bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the whole datset is used to build each tree. - oob_score : bool, optional (default=False) + oob_score : bool, default=False whether to use out-of-bag samples to estimate the R^2 on unseen data. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Controls both the randomness of the bootstrapping of the samples used when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). See :term:`Glossary ` for details. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1444,23 +1445,23 @@ class ExtraTreesClassifier(ForestClassifier): Parameters ---------- - n_estimators : integer, optional (default=10) + n_estimators : int, default=10 The number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : string, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. 
- max_depth : integer or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1471,7 +1472,7 @@ class ExtraTreesClassifier(ForestClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1486,12 +1487,12 @@ class ExtraTreesClassifier(ForestClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0. The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, string or None, optional (default="auto") + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1507,12 +1508,12 @@ class ExtraTreesClassifier(ForestClassifier): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0. A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1530,7 +1531,7 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.19 - min_impurity_split : float, (default=0) + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -1540,22 +1541,22 @@ class ExtraTreesClassifier(ForestClassifier): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - bootstrap : boolean, optional (default=False) + bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. - oob_score : bool, optional (default=False) + oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization accuracy. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. 
- random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Controls 3 sources of randomness: - the bootstrapping of the samples used when building trees @@ -1566,16 +1567,16 @@ class ExtraTreesClassifier(ForestClassifier): See :term:`Glossary ` for details. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. - class_weight : dict, list of dicts, "balanced", "balanced_subsample" or \ - None, optional (default=None) + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same @@ -1600,7 +1601,7 @@ class ExtraTreesClassifier(ForestClassifier): Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1649,7 +1650,7 @@ class labels (multi-output problem). Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when ``oob_score`` is True. - oob_decision_function_ : array of shape (n_samples, n_classes) + oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, @@ -1749,14 +1750,14 @@ class ExtraTreesRegressor(ForestRegressor): Parameters ---------- - n_estimators : integer, optional (default=10) + n_estimators : int, default=10 The number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : string, optional (default="mse") + criterion : {"mse", "mae"}, default="mse" The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean @@ -1765,12 +1766,12 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - max_depth : integer or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1781,7 +1782,7 @@ class ExtraTreesRegressor(ForestRegressor): .. versionchanged:: 0.18 Added float values for fractions. 
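For reference, a minimal sketch (the iris data is chosen only for brevity) of the ``criterion`` and ``max_features`` settings documented above for :class:`ExtraTreesClassifier`::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.model_selection import cross_val_score

    X, y = load_iris(return_X_y=True)
    clf = ExtraTreesClassifier(n_estimators=100, criterion="entropy",
                               max_features="sqrt", random_state=0)
    # Extra-trees draw random split thresholds, so individual trees are cheap to fit.
    print(cross_val_score(clf, X, y, cv=5).mean())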
- min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1796,12 +1797,12 @@ class ExtraTreesRegressor(ForestRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0. The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, string or None, optional (default="auto") + max_features : {"auto", "sqrt", "log2"} int or float, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1817,12 +1818,12 @@ class ExtraTreesRegressor(ForestRegressor): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0. A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1840,7 +1841,7 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.19 - min_impurity_split : float, (default=0) + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -1850,21 +1851,21 @@ class ExtraTreesRegressor(ForestRegressor): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - bootstrap : boolean, optional (default=False) + bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. - oob_score : bool, optional (default=False) + oob_score : bool, default=False Whether to use out-of-bag samples to estimate the R^2 on unseen data. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Controls 3 sources of randomness: - the bootstrapping of the samples used when building trees @@ -1875,15 +1876,15 @@ class ExtraTreesRegressor(ForestRegressor): See :term:`Glossary ` for details. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. 
- ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -2012,19 +2013,19 @@ class RandomTreesEmbedding(BaseForest): Parameters ---------- - n_estimators : integer, optional (default=10) + n_estimators : int, default=10 Number of trees in the forest. .. versionchanged:: 0.22 The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - max_depth : integer, optional (default=5) + max_depth : int, default=5 The maximum depth of each tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -2035,7 +2036,7 @@ class RandomTreesEmbedding(BaseForest): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -2050,17 +2051,17 @@ class RandomTreesEmbedding(BaseForest): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0. The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0. A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -2078,7 +2079,7 @@ class RandomTreesEmbedding(BaseForest): .. versionadded:: 0.19 - min_impurity_split : float, (default=0) + min_impurity_split : float, default=0 Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -2088,26 +2089,26 @@ class RandomTreesEmbedding(BaseForest): ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - sparse_output : bool, optional (default=True) + sparse_output : bool, default=True Whether or not to return a sparse CSR matrix, as default behavior, or to return a dense array compatible with dense pipeline operators. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`, :meth:`decision_path` and :meth:`apply` are all parallelized over the trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. 
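As an illustrative sketch (shapes and values are arbitrary), the transformer documented above maps each sample to an indicator of the leaves it falls in, returned as a sparse matrix when ``sparse_output=True``::

    import numpy as np
    from sklearn.ensemble import RandomTreesEmbedding

    rng = np.random.RandomState(0)
    X = rng.randn(100, 4)
    embedder = RandomTreesEmbedding(n_estimators=10, max_depth=3,
                                    sparse_output=True, random_state=0)
    X_embedded = embedder.fit_transform(X)   # scipy.sparse CSR matrix
    print(X_embedded.shape)                  # (100, n_out), one column per leaf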
- random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Controls the generation of the random `y` used to fit the trees and the draw of the splits for each feature at the trees' nodes. See :term:`Glossary ` for details. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity when fitting and predicting. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. @@ -2178,7 +2179,7 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. @@ -2204,7 +2205,7 @@ def fit_transform(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Input data used to build forests. Use ``dtype=np.float32`` for maximum efficiency. @@ -2217,7 +2218,7 @@ def fit_transform(self, X, y=None, sample_weight=None): Returns ------- - X_transformed : sparse matrix, shape=(n_samples, n_out) + X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ X = check_array(X, accept_sparse=['csc']) @@ -2239,14 +2240,14 @@ def transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Input data to be transformed. Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices are also supported, use sparse ``csr_matrix`` for maximum efficiency. Returns ------- - X_transformed : sparse matrix, shape=(n_samples, n_out) + X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ check_is_fitted(self) diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index 19c66710bf0ad..1323f0f71adf9 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -46,13 +46,13 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves). - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ @@ -62,10 +62,10 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ @@ -81,18 +81,18 @@ def update_terminal_regions(self, tree, X, y, residual, raw_predictions, ---------- tree : tree.Tree The tree object. - X : 2d array, shape (n, m) + X : ndarray of shape (n_samples, n_features) The data array. 
- y : 1d array, shape (n,) + y : ndarray of shape (n_samples,) The target labels. - residual : 1d array, shape (n,) + residual : ndarray of shape (n_samples,) The residuals (usually the negative gradient). - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. - sample_weight : 1d array, shape (n,) + sample_weight : ndarray of shape (n_samples,) The weight of each sample. - sample_mask : 1d array, shape (n,) + sample_mask : ndarray of shape (n_samples,) The sample mask to be used. learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by @@ -129,14 +129,14 @@ def get_init_raw_predictions(self, X, estimator): Parameters ---------- - X : 2d array, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data array. - estimator : estimator instance + estimator : object The estimator to use to compute the predictions. Returns ------- - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The initial raw predictions. K is equal to 1 for binary classification and regression, and equal to the number of classes for multiclass classification. ``raw_predictions`` is casted @@ -164,7 +164,7 @@ def check_init_estimator(self, estimator): Parameters ---------- - estimator : estimator instance + estimator : object The init estimator to check. """ if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict')): @@ -196,13 +196,13 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) - The raw_predictions (i.e. values from the tree leaves). + raw_predictions : ndarray of shape (n_samples, K) + The raw predictions (i.e. values from the tree leaves). - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ if sample_weight is None: @@ -216,10 +216,10 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. - raw_predictions : 1d array, shape (n_samples,) + raw_predictions : ndarray of shape (n_samples,) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ @@ -236,18 +236,18 @@ def update_terminal_regions(self, tree, X, y, residual, raw_predictions, ---------- tree : tree.Tree The tree object. - X : 2d array, shape (n, m) + X : ndarray of shape (n_samples, n_features) The data array. - y : 1d array, shape (n,) + y : ndarray of shape (n_samples,) The target labels. - residual : 1d array, shape (n,) + residual : ndarray of shape (n_samples,) The residuals (usually the negative gradient). - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. - sample_weight : 1d array, shape (n,) + sample_weight : ndarray of shape (n,) The weight of each sample. - sample_mask : 1d array, shape (n,) + sample_mask : ndarray of shape (n,) The sample mask to be used. 
learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by @@ -279,13 +279,13 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : array, shape (n_samples, K) - The raw_predictions (i.e. values from the tree leaves). + raw_predictions : ndarray of shape (n_samples, K) + The raw predictions (i.e. values from the tree leaves). - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ if sample_weight is None: @@ -301,10 +301,10 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. - raw_predictions : array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ @@ -327,11 +327,6 @@ class HuberLossFunction(RegressionLossFunction): M-Regression proposed in Friedman 2001. - References - ---------- - J. Friedman, Greedy Function Approximation: A Gradient Boosting - Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. - Parameters ---------- n_classes : int @@ -339,6 +334,11 @@ class HuberLossFunction(RegressionLossFunction): alpha : float, default=0.9 Percentile at which to extract score. + + References + ---------- + J. Friedman, Greedy Function Approximation: A Gradient Boosting + Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. """ def __init__(self, n_classes, alpha=0.9): @@ -354,14 +354,14 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ raw_predictions = raw_predictions.ravel() @@ -394,14 +394,14 @@ def negative_gradient(self, y, raw_predictions, sample_weight=None, Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ raw_predictions = raw_predictions.ravel() @@ -443,7 +443,7 @@ class QuantileLossFunction(RegressionLossFunction): n_classes : int Number of classes. - alpha : float, optional (default = 0.9) + alpha : float, default = 0.9 The percentile. """ def __init__(self, n_classes, alpha=0.9): @@ -459,14 +459,14 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. 
- sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ raw_predictions = raw_predictions.ravel() @@ -488,11 +488,11 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. - raw_predictions : 2d array, shape (n_samples, K) - The raw_predictions (i.e. values from the tree leaves) of the + raw_predictions : ndarray of shape (n_samples, K) + The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ alpha = self.alpha @@ -519,13 +519,13 @@ def _raw_prediction_to_proba(self, raw_predictions): Parameters ---------- - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. Returns ------- - probas : 2d array, shape (n_samples, K) + probas : ndarray of shape (n_samples, K) The predicted probabilities. """ @@ -535,13 +535,13 @@ def _raw_prediction_to_decision(self, raw_predictions): Parameters ---------- - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. Returns ------- - encoded_predictions : 2d array, shape (n_samples, K) + encoded_predictions : ndarray of shape (n_samples, K) The predicted encoded labels. """ @@ -550,7 +550,7 @@ def check_init_estimator(self, estimator): Parameters ---------- - estimator : estimator instance + estimator : object The init estimator to check. """ if not (hasattr(estimator, 'fit') and @@ -589,14 +589,14 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. - sample_weight : 1d array , shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ # logaddexp(0, v) == log(1.0 + exp(v)) @@ -614,11 +614,11 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) - The raw_predictions (i.e. values from the tree leaves) of the + raw_predictions : ndarray of shape (n_samples, K) + The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ return y - expit(raw_predictions.ravel()) @@ -696,14 +696,14 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ # create one-hot label encoding @@ -724,14 +724,14 @@ def negative_gradient(self, y, raw_predictions, k=0, **kwargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) The target labels. 
- raw_predictions : 2d array, shape (n_samples, K) - The raw_predictions (i.e. values from the tree leaves) of the + raw_predictions : ndarray of shape (n_samples, K) + The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. - k : int, optional default=0 + k : int, default=0 The index of the class. """ return y - np.nan_to_num(np.exp(raw_predictions[:, k] - @@ -779,14 +779,14 @@ class ExponentialLoss(ClassificationLossFunction): Same loss as AdaBoost. - References - ---------- - Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007 - Parameters ---------- n_classes : int Number of classes. + + References + ---------- + Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007 """ def __init__(self, n_classes): if n_classes != 2: @@ -803,14 +803,14 @@ def __call__(self, y, raw_predictions, sample_weight=None): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble. - sample_weight : 1d array, shape (n_samples,), optional + sample_weight : ndarray of shape (n_samples,), default=None Sample weights. """ raw_predictions = raw_predictions.ravel() @@ -825,10 +825,10 @@ def negative_gradient(self, y, raw_predictions, **kargs): Parameters ---------- - y : 1d array, shape (n_samples,) + y : ndarray of shape (n_samples,) True labels. - raw_predictions : 2d array, shape (n_samples, K) + raw_predictions : ndarray of shape (n_samples, K) The raw predictions (i.e. values from the tree leaves) of the tree ensemble at iteration ``i - 1``. """ diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index fcd9fe63ec755..df393b628bb02 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -50,10 +50,10 @@ class IsolationForest(OutlierMixin, BaseBagging): Parameters ---------- - n_estimators : int, optional (default=100) + n_estimators : int, default=100 The number of base estimators in the ensemble. - max_samples : int or float, optional (default="auto") + max_samples : "auto", int or float, default="auto" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. @@ -62,7 +62,7 @@ class IsolationForest(OutlierMixin, BaseBagging): If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling). - contamination : 'auto' or float, optional (default='auto') + contamination : 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. @@ -75,18 +75,18 @@ class IsolationForest(OutlierMixin, BaseBagging): The default value of ``contamination`` changed from 0.1 to ``'auto'``. - max_features : int or float, optional (default=1.0) + max_features : int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. - bootstrap : bool, optional (default=False) + bootstrap : bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed. 
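A small, hedged sketch of the estimator documented above (the synthetic data is arbitrary); with ``contamination='auto'`` the decision threshold is set as in the original isolation-forest paper::

    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(0)
    X_train = 0.3 * rng.randn(100, 2)                        # inliers
    X_outliers = rng.uniform(low=-4, high=4, size=(10, 2))   # scattered points
    clf = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
    clf.fit(X_train)
    print(clf.predict(X_outliers))            # mostly -1 (flagged as outliers)
    print(clf.decision_function(X_outliers))  # negative scores for abnormal samples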
- n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all @@ -106,16 +106,16 @@ class IsolationForest(OutlierMixin, BaseBagging): ``behaviour`` parameter is deprecated in 0.22 and removed in 0.24. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - verbose : int, optional (default=0) + verbose : int, default=0 Controls the verbosity of the tree building process. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. @@ -131,8 +131,8 @@ class IsolationForest(OutlierMixin, BaseBagging): The subset of drawn samples (i.e., the in-bag samples) for each base estimator. - max_samples_ : integer - The actual number of samples + max_samples_ : int + The actual number of samples. offset_ : float Offset used to define the decision function from the raw scores. We @@ -224,7 +224,7 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. @@ -312,14 +312,14 @@ def predict(self, X): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) For each observation, tells whether or not (+1 or -1) it should be considered as an inlier according to the fitted model. """ @@ -344,14 +344,14 @@ def decision_function(self, X): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- - scores : array, shape (n_samples,) + scores : ndarray of shape (n_samples,) The anomaly score of the input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers. @@ -376,12 +376,12 @@ def score_samples(self, X): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Returns ------- - scores : array, shape (n_samples,) + scores : ndarray of shape (n_samples,) The anomaly score of the input samples. The lower, the more abnormal. """ @@ -439,9 +439,10 @@ def _compute_score_samples(self, X, subsample_features): Parameters ---------- X : array-like or sparse matrix + Data matrix. 
- subsample_features : bool, - whether features should be subsampled + subsample_features : bool + Whether features should be subsampled. """ n_samples = X.shape[0] @@ -475,13 +476,13 @@ def _average_path_length(n_samples_leaf): latter has the same structure as an isolation tree. Parameters ---------- - n_samples_leaf : array-like, shape (n_samples,). + n_samples_leaf : array-like of shape (n_samples,) The number of training samples in each test sample leaf, for each estimators. Returns ------- - average_path_length : array, same shape as n_samples_leaf + average_path_length : array of same shape as n_samples_leaf """ n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 2fe284253ccc9..db3d3508a46ba 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -117,7 +117,7 @@ def fit(self, X, y, sample_weight=None): y : array-like of shape (n_samples,) Target values. - sample_weight : array-like of shape (n_samples,) or None + sample_weight : array-like of shape (n_samples,) or default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. @@ -398,7 +398,7 @@ def fit(self, X, y, sample_weight=None): y : array-like of shape (n_samples,) Target values. - sample_weight : array-like of shape (n_samples,) or None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. @@ -630,7 +630,7 @@ def fit(self, X, y, sample_weight=None): y : array-like of shape (n_samples,) Target values. - sample_weight : array-like of shape (n_samples,) or None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 23f381ca75750..838d2440a9e4d 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -98,24 +98,24 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): Using ``None`` to drop an estimator is deprecated in 0.22 and support will be dropped in 0.24. Use the string ``'drop'`` instead. - voting : str, {'hard', 'soft'} (default='hard') + voting : {'hard', 'soft'}, default='hard' If 'hard', uses predicted class labels for majority rule voting. Else if 'soft', predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers. - weights : array-like, shape (n_classifiers,), optional (default=`None`) + weights : array-like of shape (n_classifiers,), default=`None` Sequence of weights (`float` or `int`) to weight the occurrences of predicted class labels (`hard` voting) or class probabilities before averaging (`soft` voting). Uses uniform weights if `None`. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel for ``fit``. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. 
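For illustration only (estimators and weights chosen arbitrarily), the ``voting``, ``weights`` and ``flatten_transform`` options documented above behave as follows::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    clf = VotingClassifier(
        estimators=[('lr', LogisticRegression(max_iter=1000)),
                    ('dt', DecisionTreeClassifier(random_state=0)),
                    ('nb', GaussianNB())],
        voting='soft', weights=[2, 1, 1]).fit(X, y)
    print(clf.predict_proba(X[:2]).shape)   # (2, 3): weighted average per class
    print(clf.transform(X[:2]).shape)       # (2, 9) since flatten_transform=True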
- flatten_transform : bool, optional (default=True) + flatten_transform : bool, default=True Affects shape of transform output only when voting='soft' If voting='soft' and flatten_transform=True, transform method returns matrix with shape (n_samples, n_classifiers * n_classes). If @@ -133,7 +133,7 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): .. versionadded:: 0.20 - classes_ : array-like, shape (n_predictions,) + classes_ : array-like of shape (n_predictions,) The classes labels. See Also @@ -189,14 +189,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - sample_weight : array-like, shape (n_samples,) or None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. @@ -226,12 +226,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Returns ------- - maj : array-like, shape (n_samples,) + maj : array-like of shape (n_samples,) Predicted class labels. """ check_is_fitted(self) @@ -266,12 +266,12 @@ def predict_proba(self): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Returns ------- - avg : array-like, shape (n_samples, n_classes) + avg : array-like of shape (n_samples, n_classes) Weighted average probability for each class per sample. """ if self.voting == 'hard': @@ -284,7 +284,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. @@ -292,13 +292,13 @@ def transform(self, X): ------- probabilities_or_labels If `voting='soft'` and `flatten_transform=True`: - returns array-like of shape (n_classifiers, n_samples * + returns ndarray of shape (n_classifiers, n_samples * n_classes), being class probabilities calculated by each classifier. If `voting='soft' and `flatten_transform=False`: - array-like of shape (n_classifiers, n_samples, n_classes) + ndarray of shape (n_classifiers, n_samples, n_classes) If `voting='hard'`: - array-like of shape (n_samples, n_classifiers), being + ndarray of shape (n_samples, n_classifiers), being class labels predicted by each classifier. """ check_is_fitted(self) @@ -336,11 +336,11 @@ class VotingRegressor(RegressorMixin, _BaseVoting): Using ``None`` to drop an estimator is deprecated in 0.22 and support will be dropped in 0.24. Use the string ``'drop'`` instead. - weights : array-like, shape (n_regressors,), optional (default=`None`) + weights : array-like of shape (n_regressors,), default=`None` Sequence of weights (`float` or `int`) to weight the occurrences of predicted values before averaging. Uses uniform weights if `None`. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to run in parallel for ``fit``. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -386,14 +386,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - sample_weight : array-like, shape (n_samples,) or None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. @@ -419,7 +419,7 @@ def predict(self, X): Returns ------- - y : array of shape (n_samples,) + y : ndarray of shape (n_samples,) The predicted values. """ check_is_fitted(self) @@ -436,7 +436,7 @@ def transform(self, X): Returns ------- - predictions: array of shape (n_samples, n_classifiers) + predictions: ndarray of shape (n_samples, n_classifiers) Values predicted by each regressor. """ check_is_fitted(self) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index ebd9635cd52b3..77337c1b662a1 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -299,29 +299,29 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): Parameters ---------- - base_estimator : object, optional (default=None) + base_estimator : object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is ``DecisionTreeClassifier(max_depth=1)``. - n_estimators : int, optional (default=50) + n_estimators : int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. - learning_rate : float, optional (default=1.) + learning_rate : float, default=1. Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``. - algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R') + algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R' If 'SAMME.R' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If 'SAMME' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -649,7 +649,7 @@ def staged_predict(self, X): Yields ------ - y : generator of array, shape = [n_samples] + y : generator of ndarray of shape (n_samples,) The predicted classes. """ X = self._validate_data(X) @@ -677,7 +677,7 @@ def decision_function(self, X): Returns ------- - score : array, shape = [n_samples, k] + score : ndarray of shape of (n_samples, k) The decision function of the input samples. 
The order of outputs is the same of that of the :term:`classes_` attribute. Binary classification is a special cases with ``k == 1``, @@ -720,7 +720,7 @@ def staged_decision_function(self, X): Yields ------ - score : generator of array, shape = [n_samples, k] + score : generator of ndarray of shape (n_samples, k) The decision function of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. Binary classification is a special cases with ``k == 1``, @@ -793,7 +793,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ @@ -828,7 +828,7 @@ def staged_predict_proba(self, X): Yields ------- - p : generator of array, shape = [n_samples] + p : generator of ndarray of shape (n_samples,) The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ @@ -854,7 +854,7 @@ def predict_log_proba(self, X): Returns ------- - p : array of shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ @@ -879,25 +879,25 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): Parameters ---------- - base_estimator : object, optional (default=None) + base_estimator : object, default=None The base estimator from which the boosted ensemble is built. If ``None``, then the base estimator is ``DecisionTreeRegressor(max_depth=3)``. - n_estimators : integer, optional (default=50) + n_estimators : int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. - learning_rate : float, optional (default=1.) + learning_rate : float, default=1. Learning rate shrinks the contribution of each regressor by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``. - loss : {'linear', 'square', 'exponential'}, optional (default='linear') + loss : {'linear', 'square', 'exponential'}, default='linear' The loss function to use when updating the weights after each boosting iteration. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -1147,7 +1147,7 @@ def staged_predict(self, X): Yields ------- - y : generator of array, shape = [n_samples] + y : generator of ndarray of shape (n_samples,) The predicted regression values. """ check_is_fitted(self) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index ca49263f57913..b527b0d72e6be 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -47,18 +47,17 @@ class DictVectorizer(TransformerMixin, BaseEstimator): Parameters ---------- - dtype : callable, optional + dtype : dtype, default=np.float64 The type of feature values. Passed to Numpy array/scipy.sparse matrix constructors as the dtype argument. 
- separator : string, optional + separator : str, default="=" Separator string used when constructing new features for one-hot coding. - sparse : boolean, optional. + sparse : bool, default=True Whether transform should produce scipy.sparse matrices. - True by default. - sort : boolean, optional. + sort : bool, default=True Whether ``feature_names_`` and ``vocabulary_`` should be - sorted when fitting. True by default. + sorted when fitting. Attributes ---------- @@ -241,13 +240,13 @@ def inverse_transform(self, X, dict_type=dict): ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Sample matrix. - dict_type : callable, optional + dict_type : type, default=dict Constructor for feature mappings. Must conform to the collections.Mapping API. Returns ------- - D : list of dict_type objects, length = n_samples + D : list of dict_type objects of shape (n_samples,) Feature mappings for the samples in X. """ # COO matrix is not subscriptable @@ -276,7 +275,7 @@ def transform(self, X): Parameters ---------- - X : Mapping or iterable over Mappings, length = n_samples + X : Mapping or iterable over Mappings of shape (n_samples,) Dict(s) or Mapping(s) from feature names (arbitrary Python objects) to feature values (strings or convertible to dtype). @@ -324,7 +323,7 @@ def restrict(self, support, indices=False): support : array-like Boolean mask or list of indices (as returned by the get_support member of feature selectors). - indices : boolean, optional + indices : bool, default=False Whether support is a list of indices. Returns diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index f5a0ba540ccf9..f52e6f296169b 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -47,11 +47,11 @@ class FeatureHasher(TransformerMixin, BaseEstimator): Parameters ---------- - n_features : integer, optional + n_features : int, default=2**20 The number of features (columns) in the output matrices. Small numbers of features are likely to cause hash collisions, but large numbers will cause larger coefficient dimensions in linear learners. - input_type : string, optional, default "dict" + input_type : {"dict", "pair"}, default="dict" Either "dict" (the default) to accept dictionaries over (feature_name, value); "pair" to accept pairs of (feature_name, value); or "string" to accept single strings. @@ -60,11 +60,11 @@ class FeatureHasher(TransformerMixin, BaseEstimator): The feature_name is hashed to find the appropriate column for the feature. The value's sign might be flipped in the output (but see non_negative, below). - dtype : numpy type, optional, default np.float64 + dtype : numpy dtype, default=np.float64 The type of feature values. Passed to scipy.sparse matrix constructors as the dtype argument. Do not set this to bool, np.boolean or any unsigned integer type. - alternate_sign : boolean, optional, default True + alternate_sign : bool, default=True When True, an alternating sign is added to the features as to approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. 
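As a brief illustrative sketch (feature names and values are made up), the two dict-based vectorizers documented above differ mainly in statefulness: :class:`DictVectorizer` learns a vocabulary, :class:`FeatureHasher` does not::

    from sklearn.feature_extraction import DictVectorizer, FeatureHasher

    D = [{"cat": 1, "dog": 2}, {"cat": 3, "fish": 1}]

    vec = DictVectorizer(sparse=False)
    X_dense = vec.fit_transform(D)
    print(vec.get_feature_names())      # ['cat', 'dog', 'fish']
    print(X_dense.shape)                # (2, 3)

    hasher = FeatureHasher(n_features=2 ** 10, input_type="dict")
    X_hashed = hasher.transform(D)      # stateless, no fit required
    print(X_hashed.shape)               # (2, 1024), scipy.sparse CSR matrix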
@@ -116,7 +116,7 @@ def fit(self, X=None, y=None): Parameters ---------- - X : array-like + X : ndarray Returns ------- diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index d0da784c526d7..588abf3fcf896 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -33,11 +33,11 @@ def _make_edges_3d(n_x, n_y, n_z=1): Parameters ---------- - n_x : integer + n_x : int The size of the grid in the x direction. - n_y : integer + n_y : int The size of the grid in the y direction. - n_z : integer, optional + n_z : integer, default=1 The size of the grid in the z direction, defaults to 1 """ vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z)) @@ -138,14 +138,16 @@ def img_to_graph(img, mask=None, return_as=sparse.coo_matrix, dtype=None): Parameters ---------- - img : ndarray, 2D or 3D - 2D or 3D image - mask : ndarray of booleans, optional + img : ndarray of shape (height, width) or (height, width, channel) + 2D or 3D image. + mask : ndarray of shape (height, width) or \ + (height, width, channel), dtype=bool, default=None An optional mask of the image, to consider only part of the pixels. - return_as : np.ndarray or a sparse matrix class, optional + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix The class to use to build the returned adjacency matrix. - dtype : None or dtype, optional + dtype : dtype, default=None The data of the returned sparse matrix. By default it is the dtype of img @@ -175,14 +177,15 @@ def grid_to_graph(n_x, n_y, n_z=1, mask=None, return_as=sparse.coo_matrix, Dimension in x axis n_y : int Dimension in y axis - n_z : int, optional, default 1 + n_z : int, default=1 Dimension in z axis - mask : ndarray of booleans, optional + mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None An optional mask of the image, to consider only part of the pixels. - return_as : np.ndarray or a sparse matrix class, optional + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix The class to use to build the returned adjacency matrix. - dtype : dtype, optional, default int + dtype : dtype, default=int The data of the returned sparse matrix. By default it is int Notes @@ -216,7 +219,7 @@ def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None): The height of a patch p_w : int The width of a patch - max_patches : integer or float, optional default is None + max_patches : int or float, default=None The maximum number of patches to extract. If max_patches is a float between 0 and 1, it is taken to be a proportion of the total number of patches. @@ -257,12 +260,12 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1): arr : ndarray n-dimensional array of which patches are to be extracted - patch_shape : integer or tuple of length arr.ndim + patch_shape : int or tuple of length arr.ndim.default=8 Indicates the shape of the patches to be extracted. If an integer is given, the shape will be a hypercube of sidelength given by its value. - extraction_step : integer or tuple of length arr.ndim + extraction_step : int or tuple of length arr.ndim, default=1 Indicates step size at which extraction shall be performed. If integer is given, then the step is uniform in all dimensions. 
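A minimal sketch (a tiny 4x4 grid, chosen only to keep the output small) of the two graph helpers documented above::

    import numpy as np
    from sklearn.feature_extraction.image import grid_to_graph, img_to_graph

    img = np.arange(16, dtype=np.float64).reshape(4, 4)
    graph = img_to_graph(img)                    # edges weighted by the image gradient
    print(graph.shape)                           # (16, 16) sparse adjacency matrix
    connectivity = grid_to_graph(n_x=4, n_y=4)   # pure pixel connectivity, no values
    print(connectivity.nnz)                      # number of stored (non-zero) entries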
@@ -317,12 +320,12 @@ def extract_patches(arr, patch_shape=8, extraction_step=1): arr : ndarray n-dimensional array of which patches are to be extracted - patch_shape : integer or tuple of length arr.ndim + patch_shape : int or tuple of length arr.ndim, default=8 Indicates the shape of the patches to be extracted. If an integer is given, the shape will be a hypercube of sidelength given by its value. - extraction_step : integer or tuple of length arr.ndim + extraction_step : int or tuple of length arr.ndim, default=1 Indicates step size at which extraction shall be performed. If integer is given, then the step is uniform in all dimensions. @@ -349,20 +352,20 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None): Parameters ---------- - image : array, shape = (image_height, image_width) or + image : ndarray of shape (image_height, image_width) or \ (image_height, image_width, n_channels) The original image data. For color images, the last dimension specifies the channel: a RGB image would have `n_channels=3`. - patch_size : tuple of ints (patch_height, patch_width) - the dimensions of one patch + patch_size : tuple of int (patch_height, patch_width) + The dimensions of one patch. - max_patches : integer or float, optional default is None - The maximum number of patches to extract. If max_patches is a float + max_patches : int or float, default=None + The maximum number of patches to extract. If `max_patches` is a float between 0 and 1, it is taken to be a proportion of the total number of patches. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Determines the random number generator used for random sampling when `max_patches` is not None. Use an int to make the randomness deterministic. @@ -370,7 +373,7 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None): Returns ------- - patches : array, shape = (n_patches, patch_height, patch_width) or + patches : array of shape (n_patches, patch_height, patch_width) or \ (n_patches, patch_height, patch_width, n_channels) The collection of patches extracted from the image, where `n_patches` is either `max_patches` or the total number of patches that can be @@ -446,20 +449,20 @@ def reconstruct_from_patches_2d(patches, image_size): Parameters ---------- - patches : array, shape = (n_patches, patch_height, patch_width) or + patches : ndarray of shape (n_patches, patch_height, patch_width) or \ (n_patches, patch_height, patch_width, n_channels) The complete set of patches. If the patches contain colour information, channels are indexed along the last dimension: RGB patches would have `n_channels=3`. - image_size : tuple of ints (image_height, image_width) or + image_size : tuple of int (image_height, image_width) or \ (image_height, image_width, n_channels) - the size of the image that will be reconstructed + The size of the image that will be reconstructed. Returns ------- - image : array, shape = image_size - the reconstructed image + image : ndarray of shape image_size + The reconstructed image. """ i_h, i_w = image_size[:2] p_h, p_w = patches.shape[1:3] @@ -488,21 +491,20 @@ class PatchExtractor(BaseEstimator): Parameters ---------- - patch_size : tuple of ints (patch_height, patch_width) - the dimensions of one patch + patch_size : tuple of int (patch_height, patch_width) + The dimensions of one patch. 
- max_patches : integer or float, optional default is None + max_patches : int or float, default=None The maximum number of patches per image to extract. If max_patches is a float in (0, 1), it is taken to mean a proportion of the total number of patches. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None Determines the random number generator used for random sampling when `max_patches` is not None. Use an int to make the randomness deterministic. See :term:`Glossary `. - Examples -------- >>> from sklearn.datasets import load_sample_images @@ -524,14 +526,14 @@ def __init__(self, patch_size=None, max_patches=None, random_state=None): self.random_state = random_state def fit(self, X, y=None): - """Do nothing and return the estimator unchanged + """Do nothing and return the estimator unchanged. This method is just there to implement the usual API and hence work in pipelines. Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) Training data. """ return self @@ -541,7 +543,7 @@ def transform(self, X): Parameters ---------- - X : array, shape = (n_samples, image_height, image_width) or + X : ndarray of shape (n_samples, image_height, image_width) or \ (n_samples, image_height, image_width, n_channels) Array of images from which to extract patches. For color images, the last dimension specifies the channel: a RGB image would have @@ -549,7 +551,7 @@ def transform(self, X): Returns ------- - patches : array, shape = (n_patches, patch_height, patch_width) or + patches : array of shape (n_patches, patch_height, patch_width) or \ (n_patches, patch_height, patch_width, n_channels) The collection of patches extracted from the images, where `n_patches` is either `n_samples * max_patches` or the total diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 82ba60a18da28..5b127a10962bc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -53,10 +53,10 @@ def _preprocess(doc, accent_function=None, lower=False): ---------- doc: str The string to preprocess - accent_function: callable + accent_function: callable, default=None Function for handling accented characters. Common strategies include normalizing and removing. - lower: bool + lower: bool, default=False Whether to use str.lower to lowercase all fo the text Returns @@ -81,12 +81,12 @@ def _analyze(doc, analyzer=None, tokenizer=None, ngrams=None, Parameters ---------- - analyzer: callable - tokenizer: callable - ngrams: callable - preprocessor: callable - decoder: callable - stop_words: list + analyzer: callable, default=None + tokenizer: callable, default=None + ngrams: callable, default=None + preprocessor: callable, default=None + decoder: callable, default=None + stop_words: list, default=None Returns ------- @@ -576,7 +576,7 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): Parameters ---------- - input : string {'filename', 'file', 'content'} + input : string {'filename', 'file', 'content'}, default='content' If 'filename', the sequence passed as an argument to fit is expected to be a list of filenames that need reading to fetch the raw content to analyze. @@ -591,13 +591,13 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): If bytes or files are given to analyze, this encoding is used to decode. 
- decode_error : {'strict', 'ignore', 'replace'} + decode_error : {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'. - strip_accents : {'ascii', 'unicode', None} + strip_accents : {'ascii', 'unicode'}, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have @@ -608,20 +608,20 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. - lowercase : boolean, default=True + lowercase : bool, default=True Convert all characters to lowercase before tokenizing. - preprocessor : callable or None (default) + preprocessor : callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer is not callable``. - tokenizer : callable or None (default) + tokenizer : callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``. - stop_words : string {'english'}, list, or None (default) + stop_words : string {'english'}, list, default=None If 'english', a built-in stop word list for English is used. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). @@ -644,7 +644,8 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): only bigrams. Only applies if ``analyzer is not callable``. - analyzer : string, {'word', 'char', 'char_wb'} or callable + analyzer : string, {'word', 'char', 'char_wb'} or callable, \ + default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. @@ -658,27 +659,27 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): first read from the file and then passed to the given callable analyzer. - n_features : integer, default=(2 ** 20) + n_features : int, default=(2 ** 20) The number of features (columns) in the output matrices. Small numbers of features are likely to cause hash collisions, but large numbers will cause larger coefficient dimensions in linear learners. - binary : boolean, default=False. + binary : bool, default=False. If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. - norm : 'l1', 'l2' or None, optional + norm : {'l1', 'l2'}, default='l2' Norm used to normalize term vectors. None for no normalization. - alternate_sign : boolean, optional, default True + alternate_sign : bool, default=True When True, an alternating sign is added to the features as to approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. .. versionadded:: 0.19 - dtype : type, optional + dtype : type, default=np.float64 Type of the matrix returned by fit_transform() or transform(). 
Examples @@ -733,7 +734,7 @@ def partial_fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : ndarray of shape [n_samples, n_features] Training data. """ return self @@ -743,7 +744,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : ndarray of shape [n_samples, n_features] Training data. """ # triggers a parameter validation @@ -839,7 +840,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): Parameters ---------- - input : string {'filename', 'file', 'content'} + input : string {'filename', 'file', 'content'}, default='content' If 'filename', the sequence passed as an argument to fit is expected to be a list of filenames that need reading to fetch the raw content to analyze. @@ -850,17 +851,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): Otherwise the input is expected to be a sequence of items that can be of type string or byte. - encoding : string, 'utf-8' by default. + encoding : string, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode. - decode_error : {'strict', 'ignore', 'replace'} + decode_error : {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'. - strip_accents : {'ascii', 'unicode', None} + strip_accents : {'ascii', 'unicode'}, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have @@ -871,20 +872,20 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. - lowercase : boolean, True by default + lowercase : bool, default=True Convert all characters to lowercase before tokenizing. - preprocessor : callable or None (default) + preprocessor : callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer is not callable``. - tokenizer : callable or None (default) + tokenizer : callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``. - stop_words : string {'english'}, list, or None (default) + stop_words : string {'english'}, list, default=None If 'english', a built-in stop word list for English is used. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). @@ -911,7 +912,8 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): unigrams and bigrams, and ``(2, 2)`` means only bigrams. Only applies if ``analyzer is not callable``. - analyzer : string, {'word', 'char', 'char_wb'} or callable + analyzer : string, {'word', 'char', 'char_wb'} or callable, \ + default='word' Whether the feature should be made of word n-gram or character n-grams. Option 'char_wb' creates character n-grams only from text inside @@ -942,25 +944,25 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): absolute counts. This parameter is ignored if vocabulary is not None. 
- max_features : int or None, default=None + max_features : int, default=None If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None. - vocabulary : Mapping or iterable, optional + vocabulary : Mapping or iterable, default=None Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an iterable over terms. If not given, a vocabulary is determined from the input documents. Indices in the mapping should not be repeated and should not have any gap between 0 and the largest index. - binary : boolean, default=False + binary : bool, default=False If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. - dtype : type, optional + dtype : type, default=np.int64 Type of the matrix returned by fit_transform() or transform(). Attributes @@ -1199,7 +1201,7 @@ def fit_transform(self, raw_documents, y=None): Returns ------- - X : array, [n_samples, n_features] + X : array of shape (n_samples, n_features) Document-term matrix. """ # We intentionally don't call the transform method to make @@ -1257,7 +1259,7 @@ def transform(self, raw_documents): Returns ------- - X : sparse matrix, [n_samples, n_features] + X : sparse matrix of shape (n_samples, n_features) Document-term matrix. """ if isinstance(raw_documents, str): @@ -1282,7 +1284,7 @@ def inverse_transform(self, X): Returns ------- - X_inv : list of arrays, len = n_samples + X_inv : list of arrays of shape (n_samples,) List of arrays of terms. """ self._check_vocabulary() @@ -1371,7 +1373,7 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Parameters ---------- - norm : 'l1', 'l2' or None, optional (default='l2') + norm : {'l1', 'l2'}, default='l2' Each output row will have unit norm, either: * 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has @@ -1379,20 +1381,20 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): * 'l1': Sum of absolute values of vector elements is 1. See :func:`preprocessing.normalize` - use_idf : boolean (default=True) + use_idf : bool, default=True Enable inverse-document-frequency reweighting. - smooth_idf : boolean (default=True) + smooth_idf : bool, default=True Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. - sublinear_tf : boolean (default=False) + sublinear_tf : bool, default=False Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). Attributes ---------- - idf_ : array, shape (n_features) + idf_ : array of shape (n_features) The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. @@ -1440,12 +1442,12 @@ def __init__(self, norm='l2', use_idf=True, smooth_idf=True, self.sublinear_tf = sublinear_tf def fit(self, X, y=None): - """Learn the idf vector (global term weights) + """Learn the idf vector (global term weights). Parameters ---------- - X : sparse matrix, [n_samples, n_features] - a matrix of term/token counts + X : sparse matrix of shape n_samples, n_features) + A matrix of term/token counts. 
""" X = check_array(X, accept_sparse=('csr', 'csc')) if not sp.issparse(X): @@ -1476,16 +1478,16 @@ def transform(self, X, copy=True): Parameters ---------- - X : sparse matrix, [n_samples, n_features] + X : sparse matrix of (n_samples, n_features) a matrix of term/token counts - copy : boolean, default True + copy : bool, default=True Whether to copy X and operate on the copy or perform in-place operations. Returns ------- - vectors : sparse matrix, [n_samples, n_features] + vectors : sparse matrix of shape (n_samples, n_features) """ X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy) if not sp.issparse(X): @@ -1544,7 +1546,7 @@ class TfidfVectorizer(CountVectorizer): Parameters ---------- - input : str {'filename', 'file', 'content'} + input : {'filename', 'file', 'content'}, default='content' If 'filename', the sequence passed as an argument to fit is expected to be a list of filenames that need reading to fetch the raw content to analyze. @@ -1559,13 +1561,13 @@ class TfidfVectorizer(CountVectorizer): If bytes or files are given to analyze, this encoding is used to decode. - decode_error : {'strict', 'ignore', 'replace'} (default='strict') + decode_error : {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'. - strip_accents : {'ascii', 'unicode', None} (default=None) + strip_accents : {'ascii', 'unicode'}, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have @@ -1576,20 +1578,20 @@ class TfidfVectorizer(CountVectorizer): Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. - lowercase : bool (default=True) + lowercase : bool, default=True Convert all characters to lowercase before tokenizing. - preprocessor : callable or None (default=None) + preprocessor : callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer is not callable``. - tokenizer : callable or None (default=None) + tokenizer : callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``. - analyzer : str, {'word', 'char', 'char_wb'} or callable + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. @@ -1603,7 +1605,7 @@ class TfidfVectorizer(CountVectorizer): first read from the file and then passed to the given callable analyzer. - stop_words : str {'english'}, list, or None (default=None) + stop_words : {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. @@ -1632,42 +1634,42 @@ class TfidfVectorizer(CountVectorizer): only bigrams. Only applies if ``analyzer is not callable``. 
- max_df : float in range [0.0, 1.0] or int (default=1.0) + max_df : float or int, default=1.0 When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). - If float, the parameter represents a proportion of documents, integer - absolute counts. + If float in range [0.0, 1.0], the parameter represents a proportion of + documents, integer absolute counts. This parameter is ignored if vocabulary is not None. - min_df : float in range [0.0, 1.0] or int (default=1) + min_df : float or int, default=1 When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. - If float, the parameter represents a proportion of documents, integer - absolute counts. + If float in range of [0.0, 1.0], the parameter represents a proportion + of documents, integer absolute counts. This parameter is ignored if vocabulary is not None. - max_features : int or None (default=None) + max_features : int, default=None If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None. - vocabulary : Mapping or iterable, optional (default=None) + vocabulary : Mapping or iterable, default=None Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an iterable over terms. If not given, a vocabulary is determined from the input documents. - binary : bool (default=False) + binary : bool, default=False If True, all non-zero term counts are set to 1. This does not mean outputs will have only 0/1 values, only that the tf term in tf-idf is binary. (Set idf and normalization to False to get 0/1 outputs). - dtype : type, optional (default=float64) + dtype : dtype, default=float64 Type of the matrix returned by fit_transform() or transform(). - norm : 'l1', 'l2' or None, optional (default='l2') + norm : {'l1', 'l2'}, default='l2' Each output row will have unit norm, either: * 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has @@ -1675,15 +1677,15 @@ class TfidfVectorizer(CountVectorizer): * 'l1': Sum of absolute values of vector elements is 1. See :func:`preprocessing.normalize`. - use_idf : bool (default=True) + use_idf : bool, default=True Enable inverse-document-frequency reweighting. - smooth_idf : bool (default=True) + smooth_idf : bool, default=True Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. - sublinear_tf : bool (default=False) + sublinear_tf : bool, default=False Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). Attributes @@ -1695,7 +1697,7 @@ class TfidfVectorizer(CountVectorizer): True if a fixed vocabulary of term to indices mapping is provided by the user - idf_ : array, shape (n_features) + idf_ : array of shape (n_features,) The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. @@ -1852,7 +1854,7 @@ def fit_transform(self, raw_documents, y=None): Returns ------- - X : sparse matrix, [n_samples, n_features] + X : sparse matrix of (n_samples, n_features) Tf-idf-weighted document-term matrix. 
""" self._check_params() @@ -1873,7 +1875,7 @@ def transform(self, raw_documents, copy="deprecated"): raw_documents : iterable An iterable which yields either str, unicode or file objects. - copy : bool, default True + copy : bool, default=True Whether to copy X and operate on the copy or perform in-place operations. @@ -1884,7 +1886,7 @@ def transform(self, raw_documents, copy="deprecated"): Returns ------- - X : sparse matrix, [n_samples, n_features] + X : sparse matrix of (n_samples, n_features) Tf-idf-weighted document-term matrix. """ check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted') From d7795a431e30d23f7e8499bdbe89dbdc6e9a068e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 4 Jan 2020 18:02:21 +0100 Subject: [PATCH 178/448] [MRG+1] Better non-regression test for spectral embedding AMG solver issue (#16014) --- .../manifold/tests/test_spectral_embedding.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index f99eae3783c05..49bf413516e74 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -215,32 +215,32 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 1e-5) -# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with +# np.random.rand: # https://github.com/scikit-learn/scikit-learn/issues/15913 @pytest.mark.filterwarnings( "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") -def test_spectral_embedding_amg_solver_failure(seed=36): - # Test spectral embedding with amg solver failure, see issue #13393 +def test_spectral_embedding_amg_solver_failure(): + # Non-regression test for amg solver failure (issue #13393 on github) pytest.importorskip('pyamg') - - # The generated graph below is NOT fully connected if n_neighbors=3 - n_samples = 200 - n_clusters = 3 - n_features = 3 - centers = np.eye(n_clusters, n_features) - S, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) - - se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, - random_state=np.random.RandomState(seed)) - embed_amg0 = se_amg0.fit_transform(S) - - for i in range(10): - se_amg0.set_params(random_state=np.random.RandomState(seed + 1)) - embed_amg1 = se_amg0.fit_transform(S) - - assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05) + seed = 36 + num_nodes = 100 + X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + upper = sparse.triu(X) - sparse.diags(X.diagonal()) + sym_matrix = upper + upper.T + embedding = spectral_embedding(sym_matrix, + n_components=10, + eigen_solver='amg', + random_state=0) + + # Check that the learned embedding is stable w.r.t. 
random solver init: + for i in range(3): + new_embedding = spectral_embedding(sym_matrix, + n_components=10, + eigen_solver='amg', + random_state=i + 1) + assert _check_with_col_sign_flipping( + embedding, new_embedding, tol=0.05) @pytest.mark.filterwarnings("ignore:the behavior of nmi will " From 11147a13ed45e513fa88d2351d43b14c093b8636 Mon Sep 17 00:00:00 2001 From: Rushabh Vasani Date: Sat, 4 Jan 2020 23:05:25 +0530 Subject: [PATCH 179/448] Argument validation in make_multilabel_classification() (#16006) --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/datasets/_samples_generator.py | 11 +++++++++++ sklearn/datasets/tests/test_samples_generator.py | 12 ++++++++++++ 3 files changed, 27 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 1941aacb7a7b0..b476c34b380cc 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -63,6 +63,10 @@ Changelog by :user:`Stephanie Andrews ` and :user:`Reshama Shaikh `. +- |Fix| :func:`datasets.make_multilabel_classification` now generates + `ValueError` for arguments `n_classes < 1` OR `length < 1`. + :pr:`16006` by :user:`Rushabh Vasani `. + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 8893aedbdfc5a..10c87d988c324 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -342,6 +342,17 @@ def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, Only returned if ``return_distributions=True``. """ + if n_classes < 1: + raise ValueError( + "'n_classes' should be an integer greater than 0. Got {} instead." + .format(n_classes) + ) + if length < 1: + raise ValueError( + "'length' should be an integer greater than 0. Got {} instead." + .format(length) + ) + generator = check_random_state(random_state) p_c = generator.rand(n_classes) p_c /= p_c.sum() diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index 433baca985b87..c683e277c705a 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -222,6 +222,18 @@ def test_make_multilabel_classification_return_indicator_sparse(): assert sp.issparse(Y) +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"n_classes": 0}, "'n_classes' should be an integer"), + ({"length": 0}, "'length' should be an integer") + ] +) +def test_make_multilabel_classification_valid_arguments(params, err_msg): + with pytest.raises(ValueError, match=err_msg): + make_multilabel_classification(**params) + + def test_make_hastie_10_2(): X, y = make_hastie_10_2(n_samples=100, random_state=0) assert X.shape == (100, 10), "X shape mismatch" From 00f3910678d2b64ddc5be81255a093b97396c580 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 4 Jan 2020 18:30:20 -0500 Subject: [PATCH 180/448] [MRG] MNT Fixes link to whats new in front page (#16009) --- doc/conf.py | 1 + doc/templates/index.html | 3 ++- doc/themes/scikit-learn-modern/layout.html | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index c4d7e578216fd..e3ccf62c2166e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -88,6 +88,7 @@ # The short X.Y version. import sklearn version = parse(sklearn.__version__).base_version +version = ".".join(version.split(".")[:2]) # The full version, including alpha/beta/rc tags. 
release = sklearn.__version__ diff --git a/doc/templates/index.html b/doc/templates/index.html index aa7139bd9b505..4f69829f413e1 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -8,7 +8,7 @@

    scikit-learn

    Machine Learning in Python

    Getting Started
-   What's New in {{ version }}
+   What's New in {{ release }}
    GitHub

    News

  • On-going development: What's new (Changelog)
  • January 2020. scikit-learn 0.22.1 is available for download (Changelog).
  • December 2019. scikit-learn 0.22 is available for download (Changelog).
  • Scikit-learn from 0.21 requires Python 3.5 or greater. diff --git a/doc/themes/scikit-learn-modern/layout.html b/doc/themes/scikit-learn-modern/layout.html index f32c6f94d47e5..07cf685518863 100644 --- a/doc/themes/scikit-learn-modern/layout.html +++ b/doc/themes/scikit-learn-modern/layout.html @@ -77,7 +77,7 @@ {%- if pagename != "install" %} From 1fa28a7ed3f058005551e6a0f196e03afaf3b0e0 Mon Sep 17 00:00:00 2001 From: mo <31044045+mghah@users.noreply.github.com> Date: Mon, 6 Jan 2020 06:02:22 -0800 Subject: [PATCH 181/448] DOC improve `neighbors` module docstring per dic guidline (#16020) --- sklearn/neighbors/_binary_tree.pxi | 80 +++++++++++++------------- sklearn/neighbors/_classification.py | 68 +++++++++++----------- sklearn/neighbors/_graph.py | 48 +++++++++------- sklearn/neighbors/_lof.py | 76 ++++++++++++------------ sklearn/neighbors/_nca.py | 55 +++++++++--------- sklearn/neighbors/_nearest_centroid.py | 6 +- sklearn/neighbors/_regression.py | 49 ++++++++-------- sklearn/neighbors/_unsupervised.py | 18 +++--- 8 files changed, 206 insertions(+), 194 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 43db83b0f8b62..e633cdb0d1ee6 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -272,7 +272,7 @@ X : array-like of shape (n_samples, n_features) Note: if X is a C-contiguous array of doubles then data will not be copied. Otherwise, an internal copy will be made. -leaf_size : positive integer (default = 40) +leaf_size : positive int, default=40 Number of points at which to switch to brute-force. Changing leaf_size will not affect the results of a query, but can significantly impact the speed of a query and the memory required @@ -282,7 +282,7 @@ leaf_size : positive integer (default = 40) satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in the case that ``n_samples < leaf_size``. -metric : string or DistanceMetric object +metric : str or DistanceMetric object the distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. @@ -509,15 +509,15 @@ def kernel_norm(h, d, kernel, return_log=False): Parameters ---------- h : float - the bandwidth of the kernel + The bandwidth of the kernel. d : int - the dimension of the space in which the kernel norm is computed - kernel : string + The dimension of the space in which the kernel norm is computed. + kernel : str The kernel identifier. Must be one of ['gaussian'|'tophat'|'epanechnikov'| 'exponential'|'linear'|'cosine'] - return_log : boolean - if True, return the log of the kernel norm. Otherwise, return the + return_log : bool, default=False + If True, return the log of the kernel norm. Otherwise, return the kernel norm. Returns ------- @@ -1281,20 +1281,20 @@ cdef class BinaryTree: ---------- X : array-like of shape (n_samples, n_features) An array of points to query - k : integer (default = 1) + k : int, default=1 The number of nearest neighbors to return - return_distance : boolean (default = True) + return_distance : bool, default=True if True, return a tuple (d, i) of distances and indices if False, return array i - dualtree : boolean (default = False) + dualtree : bool, default=False if True, use the dual tree formalism for the query: a tree is built for the query points, and the pair of trees is used to efficiently search this space. This can lead to better performance as the number of points grows large. 
- breadth_first : boolean (default = False) + breadth_first : bool, default=False if True, then query the nodes in a breadth-first manner. Otherwise, query the nodes in a depth-first manner. - sort_results : boolean (default = True) + sort_results : bool, default=True if True, then distances and indices of each point are sorted on return, so that the first column contains the closest points. Otherwise, neighbors are returned in an arbitrary order. @@ -1304,13 +1304,13 @@ cdef class BinaryTree: i : if return_distance == False (d,i) : if return_distance == True - d : array of doubles - shape: x.shape[:-1] + (k,) - each entry gives the list of distances to the - neighbors of the corresponding point + d : ndarray of shape X.shape[:-1] + k, dtype=double + Each entry gives the list of distances to the neighbors of the + corresponding point. - i : array of integers - shape: x.shape[:-1] + (k,) - each entry gives the list of indices of - neighbors of the corresponding point + i : ndarray of shape X.shape[:-1] + k, dtype=int + Each entry gives the list of indices of neighbors of the + corresponding point. """ # XXX: we should allow X to be a pre-built tree. X = check_array(X, dtype=DTYPE, order='C') @@ -1394,19 +1394,19 @@ cdef class BinaryTree: r : distance within which neighbors are returned r can be a single value, or an array of values of shape x.shape[:-1] if different radii are desired for each point. - return_distance : boolean (default = False) + return_distance : bool, default=False if True, return distances to neighbors of each point if False, return only neighbors Note that unlike the query() method, setting return_distance=True here adds to the computation time. Not all distances need to be calculated explicitly for return_distance=False. Results are not sorted by default: see ``sort_results`` keyword. - count_only : boolean (default = False) + count_only : bool, default=False if True, return only the count of points within distance r if False, return the indices of all points within distance r If return_distance==True, setting count_only=True will result in an error. - sort_results : boolean (default = False) + sort_results : bool, default=False if True, the distances and indices will be sorted before being returned. If False, the results will not be sorted. If return_distance == False, setting sort_results = True will @@ -1418,19 +1418,19 @@ cdef class BinaryTree: ind : if count_only == False and return_distance == False (ind, dist) : if count_only == False and return_distance == True - count : array of integers, shape = X.shape[:-1] - each entry gives the number of neighbors within - a distance r of the corresponding point. + count : ndarray of shape X.shape[:-1], dtype=int + Each entry gives the number of neighbors within a distance r of the + corresponding point. - ind : array of objects, shape = X.shape[:-1] - each element is a numpy integer array listing the indices of + ind : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy integer array listing the indices of neighbors of the corresponding point. Note that unlike the results of a k-neighbors query, the returned neighbors are not sorted by distance by default. - dist : array of objects, shape = X.shape[:-1] - each element is a numpy double array - listing the distances corresponding to indices in i. + dist : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy double array listing the distances + corresponding to indices in i. 
""" if count_only and return_distance: raise ValueError("count_only and return_distance " @@ -1591,7 +1591,7 @@ cdef class BinaryTree: of training data. h : float the bandwidth of the kernel - kernel : string + kernel : str, default="gaussian" specify the kernel to use. Options are - 'gaussian' - 'tophat' @@ -1600,23 +1600,23 @@ cdef class BinaryTree: - 'linear' - 'cosine' Default is kernel = 'gaussian' - atol, rtol : float (default = 0) + atol, rtol : float, default=0, 1e-8 Specify the desired relative and absolute tolerance of the result. If the true result is K_true, then the returned result K_ret satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret`` The default is zero (i.e. machine precision) for both. - breadth_first : boolean (default = False) - if True, use a breadth-first search. If False (default) use a + breadth_first : bool, default=False + If True, use a breadth-first search. If False (default) use a depth-first search. Breadth-first is generally faster for compact kernels and/or high tolerances. - return_log : boolean (default = False) - return the logarithm of the result. This can be more accurate + return_log : bool, default=False + Return the logarithm of the result. This can be more accurate than returning the result itself for narrow kernels. Returns ------- - density : ndarray - The array of (log)-density evaluations, shape = X.shape[:-1] + density : ndarray of shape X.shape[:-1] + The array of (log)-density evaluations """ cdef DTYPE_t h_c = h cdef DTYPE_t log_atol = log(atol) @@ -1722,10 +1722,10 @@ cdef class BinaryTree: X : array-like of shape (n_samples, n_features) An array of points to query. Last dimension should match dimension of training data. - r : array_like + r : array-like A one-dimensional array of distances - dualtree : boolean (default = False) - If true, use a dualtree algorithm. Otherwise, use a single-tree + dualtree : bool, default=False + If True, use a dualtree algorithm. Otherwise, use a single-tree algorithm. Dual tree algorithms can have better scaling for large N. diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index af3a9feb857e5..0580b710afd44 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -30,10 +30,10 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors : int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries. - weights : str or callable, optional (default = 'uniform') + weights : {'uniform', 'distance'} or callable, default='uniform' weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood @@ -45,7 +45,7 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, array of distances, and returns an array of the same shape containing the weights. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -57,18 +57,18 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. 
The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -77,10 +77,10 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, must be square during fit. X may be a :term:`Glossary `, in which case only "nonzero" elements may be considered neighbors. - metric_params : dict, optional (default = None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -92,7 +92,7 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, classes_ : array of shape (n_classes,) Class labels known to the classifier - effective_metric_ : string or callble + effective_metric_ : str or callble The distance metric used. It will be same as the `metric` parameter or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to 'minkowski' and `p` parameter set to 2. @@ -159,13 +159,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_queries] or [n_queries, n_outputs] + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ X = check_array(X, accept_sparse='csr') @@ -201,13 +201,13 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_queries, n_classes], or a list of n_outputs + p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -259,11 +259,11 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Parameters ---------- - radius : float, optional (default = 1.0) + radius : float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries. - weights : str or callable + weights : {'uniform', 'distance'} or callable, default='uniform' weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood @@ -277,7 +277,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Uniform weights are used by default. 
- algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -289,18 +289,18 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -309,7 +309,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, must be square during fit. X may be a :term:`Glossary `, in which case only "nonzero" elements may be considered neighbors. - outlier_label : {manual label, 'most_frequent'}, optional (default = None) + outlier_label : {manual label, 'most_frequent'}, default=None label for outlier samples (samples with no neighbors in given radius). - manual label: str or int label (should be the same type as y) @@ -317,10 +317,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, - 'most_frequent' : assign the most frequent label of y to outliers. - None : when any outlier is detected, ValueError will be raised. - metric_params : dict, optional (default = None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -328,10 +328,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Attributes ---------- - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) Class labels known to the classifier. - effective_metric_ : string or callble + effective_metric_ : str or callble The distance metric used. It will be same as the `metric` parameter or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to 'minkowski' and `p` parameter set to 2. @@ -392,12 +392,14 @@ def fit(self, X, y): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree} - Training data. If array or matrix, shape [n_samples, n_features], - or [n_samples, n_samples] if metric='precomputed'. + X : BallTree, KDTree or {array-like, sparse matrix} of shape \ + (n_samples, n_features) or (n_samples, n_samples) + Training data. If array or matrix, the shape is (n_samples, + n_features), or (n_samples, n_samples) if metric='precomputed'. - y : {array-like, sparse matrix} - Target values of shape = [n_samples] or [n_samples, n_outputs] + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_output) + Target values. 
""" @@ -453,13 +455,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_queries] or [n_queries, n_outputs] + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ @@ -495,13 +497,13 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_queries, n_classes], or a list of n_outputs + p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 81616fbf3651b..9fc4a6e830cde 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -52,23 +52,23 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', n_neighbors : int Number of neighbors for each sample. - mode : {'connectivity', 'distance'}, optional + mode : {'connectivity', 'distance'}, default='connectivity' Type of returned matrix: 'connectivity' will return the connectivity matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : string, default 'minkowski' + metric : str, default='minkowski' The distance metric used to calculate the k-Neighbors for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is 'euclidean' ('minkowski' metric with the p param equal to 2.) - p : int, default 2 + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional + metric_params : dict, default=None additional keyword arguments for the metric function. include_self : bool or 'auto', default=False @@ -76,7 +76,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', itself. If 'auto', then True is used for mode='connectivity' and False for mode='distance'. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -84,8 +84,9 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', Returns ------- - A : sparse graph in CSR format, shape = [n_samples, n_samples] - A[i, j] is assigned the weight of edge that connects i to j. + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that + connects i to j. The matrix is of CSR format. Examples -------- @@ -130,23 +131,23 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', radius : float Radius of neighborhoods. 
- mode : {'connectivity', 'distance'}, optional + mode : {'connectivity', 'distance'}, default='connectivity' Type of returned matrix: 'connectivity' will return the connectivity matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : string, default 'minkowski' + metric : str, default='minkowski' The distance metric used to calculate the neighbors within a given radius for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is 'euclidean' ('minkowski' metric with the param equal to 2.) - p : int, default 2 + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional + metric_params : dict, default=None additional keyword arguments for the metric function. include_self : bool or 'auto', default=False @@ -154,7 +155,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', itself. If 'auto', then True is used for mode='connectivity' and False for mode='distance'. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -162,8 +163,9 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', Returns ------- - A : sparse graph in CSR format, shape = [n_samples, n_samples] - A[i, j] is assigned the weight of edge that connects i to j. + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that connects + i to j. The matrix is of CSR format. Examples -------- @@ -231,7 +233,7 @@ class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default='minkowski' + metric : str or callable, default='minkowski' metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -294,14 +296,15 @@ def transform(self, X): Parameters ---------- X : array-like of shape (n_samples_transform, n_features) - Sample data + Sample data. Returns ------- - Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) Xt[i, j] is assigned the weight of edge that connects i to j. Only the neighbors have an explicit value. The diagonal is always explicit. + The matrix is of CSR format. """ check_is_fitted(self) add_one = self.mode == 'distance' @@ -323,10 +326,11 @@ def fit_transform(self, X, y=None): Returns ------- - Xt : CSR sparse graph of shape (n_samples, n_samples) + Xt : sparse matrix of shape (n_samples, n_samples) Xt[i, j] is assigned the weight of edge that connects i to j. Only the neighbors have an explicit value. The diagonal is always explicit. + The matrix is of CSR format. """ return self.fit(X).transform(X) @@ -370,7 +374,7 @@ class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default='minkowski' + metric : str or callable, default='minkowski' metric to use for distance computation. 
Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -437,10 +441,11 @@ def transform(self, X): Returns ------- - Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) Xt[i, j] is assigned the weight of edge that connects i to j. Only the neighbors have an explicit value. The diagonal is always explicit. + The matrix is of CSR format. """ check_is_fitted(self) return self.radius_neighbors_graph(X, mode=self.mode, @@ -461,9 +466,10 @@ def fit_transform(self, X, y=None): Returns ------- - Xt : CSR sparse graph, shape (n_samples, n_samples) + Xt : sparse matrix of shape (n_samples, n_samples) Xt[i, j] is assigned the weight of edge that connects i to j. Only the neighbors have an explicit value. The diagonal is always explicit. + The matrix is of CSR format. """ return self.fit(X).transform(X) diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index e77d65711cc43..fc27b7ed69420 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -35,12 +35,12 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, Parameters ---------- - n_neighbors : int, optional (default=20) + n_neighbors : int, default=20 Number of neighbors to use by default for :meth:`kneighbors` queries. If n_neighbors is larger than the number of samples provided, all samples will be used. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -52,13 +52,13 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default=30) + leaf_size : int, default=30 Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -87,16 +87,16 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, metrics: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html - p : integer, optional (default=2) + p : int, default=2 Parameter for the Minkowski metric from :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional (default=None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - contamination : 'auto' or float, optional (default='auto') + contamination : 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. When fitting this is used to define the threshold on the scores of the samples. @@ -109,14 +109,14 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, The default value of ``contamination`` changed from 0.1 to ``'auto'``. 
- novelty : boolean, default False + novelty : bool, default=False By default, LocalOutlierFactor is only meant to be used for outlier detection (novelty=False). Set novelty to True if you want to use LocalOutlierFactor for novelty detection. In this case be aware that that you should only use predict, decision_function and score_samples on new unseen data and not on the training set. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -124,17 +124,18 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, Attributes ---------- - negative_outlier_factor_ : numpy array, shape (n_samples,) + negative_outlier_factor_ : ndarray of shape (n_samples,) The opposite LOF of the training samples. The higher, the more normal. - Inliers tend to have a LOF score close to 1 (``negative_outlier_factor_`` - close to -1), while outliers tend to have a larger LOF score. + Inliers tend to have a LOF score close to 1 + (``negative_outlier_factor_`` close to -1), while outliers tend to have + a larger LOF score. The local outlier factor (LOF) of a sample captures its supposed 'degree of abnormality'. It is the average of the ratio of the local reachability density of a sample and those of its k-nearest neighbors. - n_neighbors_ : integer + n_neighbors_ : int The actual number of neighbors used for :meth:`kneighbors` queries. offset_ : float @@ -182,16 +183,16 @@ def fit_predict(self): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X : array-like of shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. y : Ignored - not used, present for API consistency by convention. + Not used, present for API consistency by convention. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) Returns -1 for anomalies/outliers and 1 for inliers. """ @@ -213,13 +214,13 @@ def _fit_predict(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X : array-like of shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) Returns -1 for anomalies/outliers and 1 for inliers. """ @@ -233,12 +234,13 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree} - Training data. If array or matrix, shape [n_samples, n_features], - or [n_samples, n_samples] if metric='precomputed'. + X : BallTree, KDTree or {array-like, sparse matrix} of shape \ + (n_samples, n_features) or (n_samples, n_samples) + Training data. If array or matrix, the shape is (n_samples, + n_features), or (n_samples, n_samples) if metric='precomputed'. y : Ignored - not used, present for API consistency by convention. + Not used, present for API consistency by convention. Returns ------- @@ -290,13 +292,13 @@ def predict(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. 
Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. """ if not self.novelty: @@ -315,14 +317,14 @@ def _predict(self, X=None): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X : array-like of shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. If None, makes prediction on the training data without considering them as their own neighbors. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. """ check_is_fitted(self) @@ -352,13 +354,13 @@ def decision_function(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - shifted_opposite_lof_scores : array, shape (n_samples,) + shifted_opposite_lof_scores : ndarray of shape (n_samples,) The shifted opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers. @@ -388,13 +390,13 @@ def _decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - shifted_opposite_lof_scores : array, shape (n_samples,) + shifted_opposite_lof_scores : ndarray of shape (n_samples,) The shifted opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers. @@ -419,13 +421,13 @@ def score_samples(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - opposite_lof_scores : array, shape (n_samples,) + opposite_lof_scores : ndarray of shape (n_samples,) The opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. """ @@ -455,13 +457,13 @@ def _score_samples(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - opposite_lof_scores : array, shape (n_samples,) + opposite_lof_scores : ndarray of shape (n_samples,) The opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. """ @@ -487,17 +489,17 @@ def _local_reachability_density(self, distances_X, neighbors_indices): Parameters ---------- - distances_X : array, shape (n_queries, self.n_neighbors) + distances_X : ndarray of shape (n_queries, self.n_neighbors) Distances to the neighbors (in the training samples `self._fit_X`) of each query point to compute the LRD. - neighbors_indices : array, shape (n_queries, self.n_neighbors) + neighbors_indices : ndarray of shape (n_queries, self.n_neighbors) Neighbors indices (of each query point) among training samples self._fit_X. 
Returns ------- - local_reachability_density : array, shape (n_queries,) + local_reachability_density : ndarray of shape (n_queries,) The local reachability density of each sample. """ dist_k = self._distances_fit_X_[neighbors_indices, diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index ea90a43b3b36f..3c9ddbbd411d0 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -39,11 +39,12 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): Parameters ---------- - n_components : int, optional (default=None) + n_components : int, default=None Preferred dimensionality of the projected space. If None it will be set to ``n_features``. - init : string or numpy array, optional (default='auto') + init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \ + (n_features_a, n_features_b), default='auto' Initialization of the linear transformation. Possible options are 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). @@ -83,32 +84,32 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. - warm_start : bool, optional, (default=False) + warm_start : bool, default=False If True and :meth:`fit` has been called before, the solution of the previous call to :meth:`fit` is used as the initial linear transformation (``n_components`` and ``init`` will be ignored). - max_iter : int, optional (default=50) + max_iter : int, default=50 Maximum number of iterations in the optimization. - tol : float, optional (default=1e-5) + tol : float, default=1e-5 Convergence tolerance for the optimization. - callback : callable, optional (default=None) + callback : callable, default=None If not None, this function is called after every iteration of the optimizer, taking as arguments the current solution (flattened transformation matrix) and the number of iterations. This might be useful in case one wants to examine or store the transformation found after each iteration. - verbose : int, optional (default=0) + verbose : int, default=0 If 0, no progress messages will be printed. If 1, progress messages will be printed to stdout. If > 1, progress messages will be printed and the ``disp`` parameter of :func:`scipy.optimize.minimize` will be set to ``verbose - 2``. - random_state : int or numpy.RandomState or None, optional (default=None) + random_state : int or numpy.RandomState, default=None A pseudo random number generator object or a seed for it if int. If ``init='random'``, ``random_state`` is used to initialize the random transformation. If ``init='pca'``, ``random_state`` is passed as an @@ -116,7 +117,7 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): Attributes ---------- - components_ : array, shape (n_components, n_features) + components_ : ndarray of shape (n_components, n_features) The linear transformation learned during fitting. n_iter_ : int @@ -176,10 +177,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The training samples. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) The corresponding training labels. Returns @@ -244,12 +245,12 @@ def transform(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data samples. 
Returns ------- - X_embedded: array, shape (n_samples, n_components) + X_embedded: ndarray of shape (n_samples, n_components) The data samples transformed. Raises @@ -268,22 +269,22 @@ def _validate_params(self, X, y): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The training samples. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) The corresponding training labels. Returns ------- - X : array, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The validated training samples. - y : array, shape (n_samples,) + y : ndarray of shape (n_samples,) The validated training labels, encoded to be integers in the range(0, n_classes). - init : string or numpy array of shape (n_features_a, n_features_b) + init : str or ndarray of shape (n_features_a, n_features_b) The validated initialization of the linear transformation. Raises @@ -377,18 +378,18 @@ def _initialize(self, X, y, init): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The training samples. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) The training labels. - init : string or numpy array of shape (n_features_a, n_features_b) + init : str or ndarray of shape (n_features_a, n_features_b) The validated initialization of the linear transformation. Returns ------- - transformation : array, shape (n_components, n_features) + transformation : ndarray of shape (n_components, n_features) The initialized linear transformation. """ @@ -443,7 +444,7 @@ def _callback(self, transformation): Parameters ---------- - transformation : array, shape=(n_components * n_features,) + transformation : ndarray of shape (n_components * n_features,) The solution computed by the optimizer in this iteration. """ if self.callback is not None: @@ -456,14 +457,14 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): Parameters ---------- - transformation : array, shape (n_components * n_features,) + transformation : ndarray of shape (n_components * n_features,) The raveled linear transformation on which to compute loss and evaluate gradient. - X : array, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The training samples. - same_class_mask : array, shape (n_samples, n_samples) + same_class_mask : ndarray of shape (n_samples, n_samples) A mask where ``mask[i, j] == 1`` if ``X[i]`` and ``X[j]`` belong to the same class, and ``0`` otherwise. @@ -472,7 +473,7 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): loss : float The loss computed for the given transformation. - gradient : array, shape (n_components * n_features,) + gradient : ndarray of shape (n_components * n_features,) The new (flattened) gradient of the loss. """ diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index ce26db87b370a..3eefb7b4fbf58 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -30,7 +30,7 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): Parameters ---------- - metric : string, or callable + metric : str or callable The metric to use when calculating distance between instances in a feature array. 
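A minimal sketch of typical usage of this classifier (the toy data and the predicted label are illustrative only)::

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestCentroid
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> clf = NearestCentroid().fit(X, y)
    >>> print(clf.predict([[-0.8, -1]]))  # closest to the centroid of class 1
    [1]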
If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its @@ -41,7 +41,7 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): If the "manhattan" metric is provided, this centroid is the median and for all other metrics, the centroid is now set to be the mean. - shrink_threshold : float, optional (default = None) + shrink_threshold : float, default=None Threshold for shrinking centroids to remove features. Attributes @@ -96,7 +96,7 @@ def fit(self, X, y): Training vector, where n_samples is the number of samples and n_features is the number of features. Note that centroid shrinking cannot be used with sparse matrices. - y : array, shape = [n_samples] + y : array-like of shape (n_samples,) Target values (integers) """ if self.metric == 'precomputed': diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index e8eafacf9effe..00d8f10c8880d 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -34,10 +34,10 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors : int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries. - weights : str or callable + weights : {'uniform', 'distance'} or callable, default='uniform' weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood @@ -51,7 +51,7 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Uniform weights are used by default. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -63,18 +63,18 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -83,10 +83,10 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, must be square during fit. X may be a :term:`Glossary `, in which case only "nonzero" elements may be considered neighbors. - metric_params : dict, optional (default = None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` @@ -95,7 +95,7 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Attributes ---------- - effective_metric_ : string or callable + effective_metric_ : str or callable The distance metric to use. It will be same as the `metric` parameter or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to 'minkowski' and `p` parameter set to 2. @@ -160,14 +160,14 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of int, shape = [n_queries] or [n_queries, n_outputs] - Target values + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int + Target values. """ X = check_array(X, accept_sparse='csr') @@ -209,11 +209,11 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Parameters ---------- - radius : float, optional (default = 1.0) + radius : float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries. - weights : str or callable + weights : {'uniform', 'distance'} or callable, default='uniform' weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood @@ -227,7 +227,7 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Uniform weights are used by default. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -239,18 +239,18 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -259,10 +259,10 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, must be square during fit. X may be a :term:`Glossary `, in which case only "nonzero" elements may be considered neighbors. - metric_params : dict, optional (default = None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -270,7 +270,7 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Attributes ---------- - effective_metric_ : string or callable + effective_metric_ : str or callable The distance metric to use. 
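For context, a small sketch of how these two neighbors-based regressors are typically used; the toy data and outputs are illustrative::

    >>> from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
    >>> X = [[0], [1], [2], [3]]
    >>> y = [0, 0, 1, 1]
    >>> knn = KNeighborsRegressor(n_neighbors=2).fit(X, y)
    >>> print(knn.predict([[1.5]]))  # mean of the two nearest targets
    [0.5]
    >>> rnr = RadiusNeighborsRegressor(radius=1.0).fit(X, y)
    >>> print(rnr.predict([[1.5]]))  # mean of targets within radius 1.0
    [0.5]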
It will be same as the `metric` parameter or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to 'minkowski' and `p` parameter set to 2. @@ -324,14 +324,15 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ + X : array-like of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of float, shape = [n_queries] or [n_queries, n_outputs] - Target values + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ + dtype=double + Target values. """ X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 79599791a96a1..20be4f636c2a4 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -15,14 +15,14 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors : int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries. - radius : float, optional (default = 1.0) + radius : float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` @@ -34,13 +34,13 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default 'minkowski' + metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -49,16 +49,16 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, must be square during fit. X may be a :term:`Glossary `, in which case only "nonzero" elements may be considered neighbors. - p : integer, optional (default = 2) + p : int, default=2 Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional (default = None) + metric_params : dict, default=None Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -66,7 +66,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Attributes ---------- - effective_metric_ : string + effective_metric_ : str Metric used to compute distances to neighbors. 
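A minimal sketch of querying this estimator with both ``kneighbors`` and ``radius_neighbors`` (data and indices shown are illustrative)::

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
    >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4).fit(samples)
    >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
    array([[2, 0]]...)
    >>> rng = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
    >>> np.asarray(rng[0][0])  # only sample 2 lies within the radius
    array(2)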
effective_metric_params_ : dict From a1b6ff8515d4f616f9d54f7c72cc6cad211c8e59 Mon Sep 17 00:00:00 2001 From: tamirlan1 Date: Mon, 6 Jan 2020 06:47:22 -0800 Subject: [PATCH 182/448] DOC improve naive_bayes module following doc guideline (#16019) --- sklearn/naive_bayes.py | 141 ++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 22bd339cbd6b0..8c10ae54303e7 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -45,7 +45,7 @@ def _joint_log_likelihood(self, X): """Compute the unnormalized posterior log probability of X I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of - shape [n_classes, n_samples]. + shape (n_classes, n_samples). Input is passed to _joint_log_likelihood as-is by predict, predict_proba and predict_log_proba. @@ -131,32 +131,32 @@ class GaussianNB(_BaseNB): Parameters ---------- - priors : array-like, shape (n_classes,) + priors : array-like of shape (n_classes,) Prior probabilities of the classes. If specified the priors are not adjusted according to the data. - var_smoothing : float, optional (default=1e-9) + var_smoothing : float, default=1e-9 Portion of the largest variance of all features that is added to variances for calculation stability. Attributes ---------- - class_count_ : array, shape (n_classes,) + class_count_ : ndarray of shape (n_classes,) number of training samples observed in each class. - class_prior_ : array, shape (n_classes,) + class_prior_ : ndarray of shape (n_classes,) probability of each class. - classes_ : array, shape (n_classes,) + classes_ : ndarray of shape (n_classes,) class labels known to the classifier epsilon_ : float absolute additive value to variances - sigma_ : array, shape (n_classes, n_features) + sigma_ : ndarray of shape (n_classes, n_features) variance of each feature per class - theta_ : array, shape (n_classes, n_features) + theta_ : ndarray of shape (n_classes, n_features) mean of each feature per class Examples @@ -186,14 +186,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - sample_weight : array-like, shape (n_samples,), optional (default=None) + sample_weight : array-like of shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). .. versionadded:: 0.17 @@ -233,21 +233,21 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): weights were given, this should contain the sum of sample weights represented in old mean and variance. - mu : array-like, shape (number of Gaussians,) + mu : array-like of shape (number of Gaussians,) Means for Gaussians in original set. - var : array-like, shape (number of Gaussians,) + var : array-like of shape (number of Gaussians,) Variances for Gaussians in original set. - sample_weight : array-like, shape (n_samples,), optional (default=None) + sample_weight : array-like of shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). Returns ------- - total_mu : array-like, shape (number of Gaussians,) + total_mu : array-like of shape (number of Gaussians,) Updated mean for each Gaussian over the combined set. 
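The combination rule documented here follows the standard parallel mean/variance update; a rough, unweighted sketch is given below (the helper name and exact code are illustrative, not the library implementation)::

    import numpy as np

    def update_mean_variance(n_past, mu, var, X):
        # Merge the statistics of a past batch (n_past, mu, var) with a new
        # batch X, without revisiting the past samples.
        if n_past == 0:
            return np.mean(X, axis=0), np.var(X, axis=0)
        n_new = X.shape[0]
        n_total = n_past + n_new
        new_mu = np.mean(X, axis=0)
        new_var = np.var(X, axis=0)
        # The combined mean is the sample-weighted average of both means.
        total_mu = (n_past * mu + n_new * new_mu) / n_total
        # Combine the sums of squared deviations, correcting for the shift
        # between the two batch means.
        old_ssd = n_past * var
        new_ssd = n_new * new_var
        correction = (n_past * n_new / n_total) * (mu - new_mu) ** 2
        total_var = (old_ssd + new_ssd + correction) / n_total
        return total_mu, total_var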
- total_var : array-like, shape (number of Gaussians,) + total_var : array-like of shape (number of Gaussians,) Updated variance for each Gaussian over the combined set. """ if X.shape[0] == 0: @@ -301,20 +301,20 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - classes : array-like, shape (n_classes,), optional (default=None) + classes : array-like of shape (n_classes,), default=None List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - sample_weight : array-like, shape (n_samples,), optional (default=None) + sample_weight : array-like of shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). .. versionadded:: 0.17 @@ -332,24 +332,24 @@ def _partial_fit(self, X, y, classes=None, _refit=False, Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - classes : array-like, shape (n_classes,), optional (default=None) + classes : array-like of shape (n_classes,), default=None List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - _refit : bool, optional (default=False) + _refit : bool, default=False If true, act as though this were the first time we called _partial_fit (ie, throw away any past fitting and start over). - sample_weight : array-like, shape (n_samples,), optional (default=None) + sample_weight : array-like of shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). Returns @@ -531,7 +531,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): y : array-like of shape (n_samples,) Target values. - classes : array-like of shape (n_classes) (default=None) + classes : array-like of shape (n_classes), default=None List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted @@ -673,44 +673,44 @@ class MultinomialNB(_BaseDiscreteNB): Parameters ---------- - alpha : float, optional (default=1.0) + alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). - fit_prior : boolean, optional (default=True) + fit_prior : bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used. - class_prior : array-like, size (n_classes,), optional (default=None) + class_prior : array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified the priors are not adjusted according to the data. Attributes ---------- - class_count_ : array, shape (n_classes,) + class_count_ : ndarray of shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. 
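A minimal usage sketch for this classifier on random count data (the data and the predicted label are illustrative only)::

    >>> import numpy as np
    >>> from sklearn.naive_bayes import MultinomialNB
    >>> rng = np.random.RandomState(1)
    >>> X = rng.randint(5, size=(6, 100))
    >>> y = np.array([1, 2, 3, 4, 5, 6])
    >>> clf = MultinomialNB().fit(X, y)
    >>> print(clf.predict(X[2:3]))
    [3]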
- class_log_prior_ : array, shape (n_classes, ) + class_log_prior_ : ndarray of shape (n_classes, ) Smoothed empirical log probability for each class. - classes_ : array, shape (n_classes,) + classes_ : ndarray of shape (n_classes,) Class labels known to the classifier - coef_ : array, shape (n_classes, n_features) + coef_ : ndarray of shape (n_classes, n_features) Mirrors ``feature_log_prob_`` for interpreting MultinomialNB as a linear model. - feature_count_ : array, shape (n_classes, n_features) + feature_count_ : ndarray of shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. - feature_log_prob_ : array, shape (n_classes, n_features) + feature_log_prob_ : ndarray of shape (n_classes, n_features) Empirical log probability of features given a class, ``P(x_i|y)``. - intercept_ : array, shape (n_classes, ) + intercept_ : ndarray of shape (n_classes, ) Mirrors ``class_log_prior_`` for interpreting MultinomialNB as a linear model. @@ -782,16 +782,16 @@ class ComplementNB(_BaseDiscreteNB): Parameters ---------- - alpha : float, optional (default=1.0) + alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). - fit_prior : boolean, optional (default=True) + fit_prior : bool, default=True Only used in edge case with a single class in the training set. - class_prior : array-like, size (n_classes,), optional (default=None) + class_prior : array-like of shape (n_classes,), default=None Prior probabilities of the classes. Not used. - norm : boolean, optional (default=False) + norm : bool, default=False Whether or not a second normalization of the weights is performed. The default behavior mirrors the implementations found in Mahout and Weka, which do not follow the full algorithm described in Table 9 of the @@ -799,26 +799,26 @@ class ComplementNB(_BaseDiscreteNB): Attributes ---------- - class_count_ : array, shape (n_classes,) + class_count_ : ndarray of shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. - class_log_prior_ : array, shape (n_classes, ) + class_log_prior_ : ndarray of shape (n_classes,) Smoothed empirical log probability for each class. Only used in edge case with a single class in the training set. - classes_ : array, shape (n_classes,) + classes_ : ndarray of shape (n_classes,) Class labels known to the classifier - feature_all_ : array, shape (n_features,) + feature_all_ : ndarray of shape (n_features,) Number of samples encountered for each feature during fitting. This value is weighted by the sample weight when provided. - feature_count_ : array, shape (n_classes, n_features) + feature_count_ : ndarray of shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. - feature_log_prob_ : array, shape (n_classes, n_features) + feature_log_prob_ : ndarray of shape (n_classes, n_features) Empirical weights for class complements. n_features_ : int @@ -893,46 +893,45 @@ class BernoulliNB(_BaseDiscreteNB): Parameters ---------- - alpha : float, optional (default=1.0) + alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). - binarize : float or None, optional (default=0.0) + binarize : float or None, default=0.0 Threshold for binarizing (mapping to booleans) of sample features. 
If None, input is presumed to already consist of binary vectors. - fit_prior : bool, optional (default=True) + fit_prior : bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used. - class_prior : array-like, size=[n_classes,], optional (default=None) + class_prior : array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified the priors are not adjusted according to the data. Attributes ---------- - class_count_ : array, shape = [n_classes] + class_count_ : ndarray of shape (n_classes) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. - class_log_prior_ : array, shape = [n_classes] + class_log_prior_ : ndarray of shape (n_classes) Log probability of each class (smoothed). - classes_ : array, shape (n_classes,) + classes_ : ndarray of shape (n_classes,) Class labels known to the classifier - feature_count_ : array, shape = [n_classes, n_features] + feature_count_ : ndarray of shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. - feature_log_prob_ : array, shape = [n_classes, n_features] + feature_log_prob_ : ndarray of shape (n_classes, n_features) Empirical log probability of features given a class, P(x_i|y). n_features_ : int Number of features of each sample. - Examples -------- >>> import numpy as np @@ -1020,36 +1019,36 @@ class CategoricalNB(_BaseDiscreteNB): Parameters ---------- - alpha : float, optional (default=1.0) + alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). - fit_prior : boolean, optional (default=True) + fit_prior : bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used. - class_prior : array-like, size (n_classes,), optional (default=None) + class_prior : array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified the priors are not adjusted according to the data. Attributes ---------- - category_count_ : list of arrays, len n_features + category_count_ : list of arrays of shape (n_features,) Holds arrays of shape (n_classes, n_categories of respective feature) for each feature. Each array provides the number of samples encountered for each class and category of the specific feature. - class_count_ : array, shape (n_classes,) + class_count_ : ndarray of shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. - class_log_prior_ : array, shape (n_classes, ) + class_log_prior_ : ndarray of shape (n_classes,) Smoothed empirical log probability for each class. - classes_ : array, shape (n_classes,) + classes_ : ndarray of shape (n_classes,) Class labels known to the classifier - feature_log_prob_ : list of arrays, len n_features + feature_log_prob_ : list of arrays of shape (n_features,) Holds arrays of shape (n_classes, n_categories of respective feature) for each feature. Each array provides the empirical log probability of categories given the respective feature and class, ``P(x_i|y)``. 
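A minimal sketch of fitting this estimator on ordinally encoded categories, for instance produced by ``OrdinalEncoder`` (the data and output are illustrative)::

    >>> import numpy as np
    >>> from sklearn.naive_bayes import CategoricalNB
    >>> rng = np.random.RandomState(1)
    >>> X = rng.randint(5, size=(6, 100))
    >>> y = np.array([1, 2, 3, 4, 5, 6])
    >>> clf = CategoricalNB().fit(X, y)
    >>> print(clf.predict(X[2:3]))
    [3]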
@@ -1081,7 +1080,7 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. Here, each feature of X is assumed to be from a different categorical distribution. @@ -1090,10 +1089,10 @@ def fit(self, X, y, sample_weight=None): total number of categories for the given feature. This can, for instance, be achieved with the help of OrdinalEncoder. - y : array-like, shape = [n_samples] + y : array-like of shape (n_samples,) Target values. - sample_weight : array-like, shape = [n_samples], (default=None) + sample_weight : array-like of shape (n_samples), default=None Weights applied to individual samples (1. for unweighted). Returns @@ -1118,7 +1117,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. Here, each feature of X is assumed to be from a different categorical distribution. @@ -1127,16 +1126,16 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): total number of categories for the given feature. This can, for instance, be achieved with the help of OrdinalEncoder. - y : array-like, shape = [n_samples] + y : array-like of shape (n_samples) Target values. - classes : array-like, shape = [n_classes] (default=None) + classes : array-like of shape (n_classes), default=None List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - sample_weight : array-like, shape = [n_samples], (default=None) + sample_weight : array-like of shape (n_samples), default=None Weights applied to individual samples (1. for unweighted). Returns From 730d1e726b9fb59811639896c5f8c26bb2ee0628 Mon Sep 17 00:00:00 2001 From: Venkatachalam N Date: Tue, 7 Jan 2020 02:05:28 +0530 Subject: [PATCH 183/448] [MRG] BUG fix RidgeClassifierCV to accept custom score (#14848) * handle_clf_ridgeGCV * included test case * undo_lambda_rm * crtd_commet * check_y * include_comp_proba * fix_format * added more comment for is_clf condtn * adding param is_clf * adding super * format fixes * init params * moving param 2 inint * using is_classifier * removing super * ENH do not allocate local arrays in Ridge*CV of store_cv_vales is False * add whats new * TST check multiple metrics for RidgeCV * linting * update whats new * missing doc when solving conflict * missing doc * style * correct whats new * fix * address agramfort comments * fix Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v0.23.rst | 5 +++ sklearn/linear_model/_ridge.py | 48 +++++++++++++++----- sklearn/linear_model/tests/test_ridge.py | 57 ++++++++++++++++++++++-- 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b476c34b380cc..53c416c506614 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -100,6 +100,11 @@ Changelog :class:`linear_model.RidgeClassifierCV`. :pr:`15653` by :user:`Jérôme Dockès `. +- |Fix| Fixed a bug in :class:`linear_model.RidgeClassifierCV` to pass a + specific scoring strategy. Before the internal estimator outputs score + instead of predictions. 
+ :pr:`14848` by :user:`Venkatachalam N `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index ec2f29dbb2317..2973bc70b3282 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -19,7 +19,7 @@ from ._base import LinearClassifierMixin, LinearModel, _rescale_data from ._sag import sag_solver -from ..base import RegressorMixin, MultiOutputMixin +from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_X_y @@ -1057,8 +1057,8 @@ def _matmat(self, v): return res -class _IdentityEstimator: - """Hack to call a scorer when we already have the predictions.""" +class _IdentityRegressor: + """Fake regressor which will directly output the prediction.""" def decision_function(self, y_predict): return y_predict @@ -1067,8 +1067,21 @@ def predict(self, y_predict): return y_predict +class _IdentityClassifier(LinearClassifierMixin): + """Fake classifier which will directly output the prediction. + + We inherit from LinearClassifierMixin to get the proper shape for the + output `y`. + """ + def __init__(self, classes): + self.classes_ = classes + + def decision_function(self, y_predict): + return y_predict + + class _RidgeGCV(LinearModel): - """Ridge regression with built-in Generalized Cross-Validation + """Ridge regression with built-in Generalized Cross-Validation. It allows efficient Leave-One-Out cross-validation. @@ -1113,7 +1126,8 @@ class _RidgeGCV(LinearModel): def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, copy_X=True, - gcv_mode=None, store_cv_values=False): + gcv_mode=None, store_cv_values=False, + is_clf=False): self.alphas = np.asarray(alphas) self.fit_intercept = fit_intercept self.normalize = normalize @@ -1121,6 +1135,7 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), self.copy_X = copy_X self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values + self.is_clf = is_clf def _decomp_diag(self, v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) @@ -1502,11 +1517,20 @@ def fit(self, X, y, sample_weight=None): self.cv_values_[:, i] = squared_errors.ravel() else: predictions = y - (c / G_inverse_diag) - alpha_score = scorer( - _IdentityEstimator(), predictions.ravel(), y.ravel()) if self.store_cv_values: self.cv_values_[:, i] = predictions.ravel() + if self.is_clf: + identity_estimator = _IdentityClassifier( + classes=np.arange(n_y) + ) + predictions_, y_ = predictions, y.argmax(axis=1) + else: + identity_estimator = _IdentityRegressor() + predictions_, y_ = predictions.ravel(), y.ravel() + + alpha_score = scorer(identity_estimator, predictions_, y_) + if (best_score is None) or (alpha_score > best_score): best_coef, best_score, best_alpha = c, alpha_score, alpha @@ -1576,7 +1600,8 @@ def fit(self, X, y, sample_weight=None): normalize=self.normalize, scoring=self.scoring, gcv_mode=self.gcv_mode, - store_cv_values=self.store_cv_values) + store_cv_values=self.store_cv_values, + is_clf=is_classifier(self)) estimator.fit(X, y, sample_weight=sample_weight) self.alpha_ = estimator.alpha_ self.best_score_ = estimator.best_score_ @@ -1588,7 +1613,8 @@ def fit(self, X, y, sample_weight=None): " are incompatible") parameters = {'alpha': self.alphas} solver = 'sparse_cg' if sparse.issparse(X) else 'auto' - gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept, + model = RidgeClassifier if 
is_classifier(self) else Ridge + gs = GridSearchCV(model(fit_intercept=self.fit_intercept, normalize=self.normalize, solver=solver), parameters, cv=cv, scoring=self.scoring) @@ -1716,7 +1742,6 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): RidgeClassifier : Ridge classifier RidgeClassifierCV : Ridge classifier with built-in cross validation """ - pass class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): @@ -1876,7 +1901,8 @@ def fit(self, X, y, sample_weight=None): sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) - _BaseRidgeCV.fit(self, X, Y, sample_weight=sample_weight) + target = Y if self.cv is None else y + _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) return self @property diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index a92e830aba66e..c1f7bb86a7fcf 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -60,6 +60,14 @@ SPARSE_FILTER = lambda X: sp.csr_matrix(X) +def _accuracy_callable(y_test, y_pred): + return np.mean(y_test == y_pred) + + +def _mean_squared_error_callable(y_test, y_pred): + return ((y_test - y_pred) ** 2).mean() + + @pytest.mark.parametrize('solver', ("svd", "sparse_cg", "cholesky", "lsqr", "sag")) def test_ridge(solver): @@ -726,6 +734,38 @@ def _test_ridge_classifiers(filter_): assert np.mean(y_iris == y_pred) >= 0.8 +@pytest.mark.parametrize("scoring", [None, "accuracy", _accuracy_callable]) +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("filter_", [DENSE_FILTER, SPARSE_FILTER]) +def test_ridge_classifier_with_scoring(filter_, scoring, cv): + # non-regression test for #14672 + # check that RidgeClassifierCV works with all sort of scoring and + # cross-validation + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + clf = RidgeClassifierCV(scoring=scoring_, cv=cv) + # Smoke test to check that fit/predict does not raise error + clf.fit(filter_(X_iris), y_iris).predict(filter_(X_iris)) + + +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("filter_", [DENSE_FILTER, SPARSE_FILTER]) +def test_ridge_regression_custom_scoring(filter_, cv): + # check that custom scoring is working as expected + # check the tie breaking strategy (keep the first alpha tried) + + def _dummy_score(y_test, y_pred): + return 0.42 + + alphas = np.logspace(-2, 2, num=5) + clf = RidgeClassifierCV( + alphas=alphas, scoring=make_scorer(_dummy_score), cv=cv + ) + clf.fit(filter_(X_iris), y_iris) + assert clf.best_score_ == pytest.approx(0.42) + # In case of tie score, the first alphas will be kept + assert clf.alpha_ == pytest.approx(alphas[0]) + + def _test_tolerance(filter_): ridge = Ridge(tol=1e-5, fit_intercept=False) ridge.fit(filter_(X_diabetes), y_diabetes) @@ -845,7 +885,9 @@ def test_class_weights_cv(): assert_array_equal(reg.predict([[-.2, 2]]), np.array([-1])) -@pytest.mark.parametrize("scoring", [None, 'neg_mean_squared_error']) +@pytest.mark.parametrize( + "scoring", [None, 'neg_mean_squared_error', _mean_squared_error_callable] +) def test_ridgecv_store_cv_values(scoring): rng = np.random.RandomState(42) @@ -855,7 +897,9 @@ def test_ridgecv_store_cv_values(scoring): alphas = [1e-1, 1e0, 1e1] n_alphas = len(alphas) - r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True, scoring=scoring) + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True, scoring=scoring_) # with 
len(y.shape) == 1 y = rng.randn(n_samples) @@ -873,7 +917,8 @@ def test_ridgecv_store_cv_values(scoring): r.fit, x, y) -def test_ridge_classifier_cv_store_cv_values(): +@pytest.mark.parametrize("scoring", [None, 'accuracy', _accuracy_callable]) +def test_ridge_classifier_cv_store_cv_values(scoring): x = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = np.array([1, 1, 1, -1, -1]) @@ -882,7 +927,11 @@ def test_ridge_classifier_cv_store_cv_values(): alphas = [1e-1, 1e0, 1e1] n_alphas = len(alphas) - r = RidgeClassifierCV(alphas=alphas, cv=None, store_cv_values=True) + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeClassifierCV( + alphas=alphas, cv=None, store_cv_values=True, scoring=scoring_ + ) # with len(y.shape) == 1 n_targets = 1 From bdf1ae9e4a739abf06076e67dc453774ffc886a7 Mon Sep 17 00:00:00 2001 From: huangk10 <42799131+huangk10@users.noreply.github.com> Date: Tue, 7 Jan 2020 06:40:16 +0800 Subject: [PATCH 184/448] ENH Add a optional fit_param to enable custom MultiOutput fit process (#15959) --- doc/whats_new/v0.22.rst | 8 ++++++++ sklearn/multioutput.py | 25 +++++++++++++++++-------- sklearn/tests/test_multioutput.py | 27 +++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index d5d7fddf7417d..bbd35676defcf 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -104,6 +104,14 @@ Changelog deprecated. This change is made to restore some backward compatibility with the behavior of this utility in version 0.21. :pr:`15947` by `Thomas Fan`_. +:mod:`sklearn.multioutput` +.......................... + +- |Feature| :func:`multioutput.MultiOutputRegressor.fit` and + :func:`multioutput.MultiOutputClassifier.fit` now can accept `fit_params` + to pass to the `estimator.fit` method of each step. :issue:`15953` + :pr:`15959` by :user:`Ke Huang `. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index a6e8fc3c5dc16..945957cdc12a7 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -24,7 +24,8 @@ from .model_selection import cross_val_predict from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method -from .utils.validation import check_is_fitted, has_fit_parameter +from .utils.validation import (check_is_fitted, has_fit_parameter, + _check_fit_params) from .utils.multiclass import check_classification_targets from .utils import deprecated @@ -32,12 +33,12 @@ "ClassifierChain", "RegressorChain"] -def _fit_estimator(estimator, X, y, sample_weight=None): +def _fit_estimator(estimator, X, y, sample_weight=None, **fit_params): estimator = clone(estimator) if sample_weight is not None: - estimator.fit(X, y, sample_weight=sample_weight) + estimator.fit(X, y, sample_weight=sample_weight, **fit_params) else: - estimator.fit(X, y) + estimator.fit(X, y, **fit_params) return estimator @@ -121,7 +122,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): sample_weight, first_time) for i in range(y.shape[1])) return self - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, **fit_params): """ Fit the model to data. Fit a separate model for each output variable. @@ -139,6 +140,9 @@ def fit(self, X, y, sample_weight=None): Only supported if the underlying regressor supports sample weights. 
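For context, a rough sketch of fitting a multi-output regressor; with this change, any additional keyword arguments would be forwarded to each underlying ``fit`` in the same way as ``sample_weight``. The estimator choice and shapes below are illustrative::

    >>> import numpy as np
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import Ridge
    >>> from sklearn.multioutput import MultiOutputRegressor
    >>> X, y = make_regression(n_targets=3, random_state=0)
    >>> reg = MultiOutputRegressor(Ridge())
    >>> reg = reg.fit(X, y, sample_weight=np.ones(X.shape[0]))
    >>> reg.predict(X[:1]).shape
    (1, 3)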
+ **fit_params : dict of string -> object + Parameters passed to the ``estimator.fit`` method of each step. + Returns ------- self : object @@ -164,9 +168,12 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Underlying estimator does not support" " sample weights.") + fit_params_validated = _check_fit_params(X, fit_params) + self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_estimator)( - self.estimator, X, y[:, i], sample_weight) + self.estimator, X, y[:, i], sample_weight, + **fit_params_validated) for i in range(y.shape[1])) return self @@ -338,7 +345,7 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): def __init__(self, estimator, n_jobs=None): super().__init__(estimator, n_jobs) - def fit(self, X, Y, sample_weight=None): + def fit(self, X, Y, sample_weight=None, **fit_params): """Fit the model to data matrix X and targets Y. Parameters @@ -351,12 +358,14 @@ def fit(self, X, Y, sample_weight=None): Sample weights. If None, then samples are equally weighted. Only supported if the underlying classifier supports sample weights. + **fit_params : dict of string -> object + Parameters passed to the ``estimator.fit`` method of each step. Returns ------- self : object """ - super().fit(X, Y, sample_weight) + super().fit(X, Y, sample_weight, **fit_params) self.classes_ = [estimator.classes_ for estimator in self.estimators_] return self diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 6256f72a4b0b3..4a528716d181f 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -30,6 +30,7 @@ from sklearn.base import ClassifierMixin from sklearn.utils import shuffle from sklearn.model_selection import GridSearchCV +from sklearn.dummy import DummyRegressor, DummyClassifier def test_multi_target_regression(): @@ -561,3 +562,29 @@ class A(MultiOutputEstimator, MultiOutputRegressor): with pytest.warns(FutureWarning, match="is deprecated in version 0.22"): A(SGDRegressor(random_state=0, max_iter=5)) + + +class DummyRegressorWithFitParams(DummyRegressor): + def fit(self, X, y, sample_weight=None, **fit_params): + self._fit_params = fit_params + return super().fit(X, y, sample_weight) + + +class DummyClassifierWithFitParams(DummyClassifier): + def fit(self, X, y, sample_weight=None, **fit_params): + self._fit_params = fit_params + return super().fit(X, y, sample_weight) + + +@pytest.mark.parametrize( + "estimator, dataset", + [(MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")), + datasets.make_multilabel_classification()), + (MultiOutputRegressor(DummyRegressorWithFitParams()), + datasets.make_regression(n_targets=3))]) +def test_multioutput_estimator_with_fit_params(estimator, dataset): + X, y = dataset + some_param = np.zeros_like(X) + estimator.fit(X, y, some_param=some_param) + for dummy_estimator in estimator.estimators_: + assert 'some_param' in dummy_estimator._fit_params From a847cdebd624a00b6fb21a4f941466727aeb81f2 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta <8859981+sid21g@users.noreply.github.com> Date: Wed, 8 Jan 2020 16:31:33 +0530 Subject: [PATCH 185/448] DOC Correct numpy link in Coding Guidelines page (#16049) --- doc/developers/develop.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index ead6286d98083..2211ab153197a 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -601,7 +601,7 @@ In addition, we add the following guidelines: find bugs in 
scikit-learn. * Use the `numpy docstring standard - `_ in all your docstrings. + `_ in all your docstrings. A good example of code that we like can be found `here From 3f9f7d5f7ddbb63c403428a7f3227f6849254d42 Mon Sep 17 00:00:00 2001 From: Marie Douriez Date: Wed, 8 Jan 2020 06:47:22 -0800 Subject: [PATCH 186/448] DOC improve random_state docstring in _logistic.py (#15729) --- sklearn/linear_model/_logistic.py | 33 +++++++++---------------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7ea3f1b6566d7..41c06de1b9df2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -585,12 +585,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Default changed from 'ovr' to 'auto' in 0.22. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag' or - 'liblinear'. + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. check_input : bool, default=True If False, the input arrays X and y will not be checked. @@ -922,12 +918,8 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, binary*. 'multinomial' is unavailable when solver='liblinear'. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag' and - 'liblinear'. + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. max_squared_sum : float, default=None Maximum squared sum of X over samples. Used only in SAG solver. @@ -1098,12 +1090,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, *class_weight='balanced'* random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag' or - 'liblinear'. + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. 
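Pinning ``random_state`` makes runs with these stochastic solvers reproducible; a minimal sketch (the dataset and predictions are illustrative)::

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = LogisticRegression(solver='liblinear', random_state=0).fit(X, y)
    >>> clf.predict(X[:2, :])
    array([0, 0])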
solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ default='lbfgs' @@ -1355,7 +1343,6 @@ def fit(self, X, y, sample_weight=None): accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) - n_samples, n_features = X.shape multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_)) @@ -1431,6 +1418,7 @@ def fit(self, X, y, sample_weight=None): fold_coefs_, _, n_iter_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] + n_features = X.shape[1] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] else: @@ -1656,12 +1644,9 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Default changed from 'ovr' to 'auto' in 0.22. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when `solver='sag'` or `solver='liblinear'`. + Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data. Note that this only applies to the solver and not the cross-validation - generator. + generator. See :term:`Glossary ` for details. l1_ratios : list of float, default=None The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. From b0c5a008d661a5dfe08a5cbf05cd617b9c02c63b Mon Sep 17 00:00:00 2001 From: dbauer9 <52041142+dbauer9@users.noreply.github.com> Date: Wed, 8 Jan 2020 16:11:24 +0100 Subject: [PATCH 187/448] DOC improve random_state docstring in for tree-based models (#15264) --- sklearn/tree/_classes.py | 62 ++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e56b2e9a269c0..eade7c9e56ad5 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -650,11 +650,17 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. @@ -767,13 +773,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. - The features are always randomly permuted at each split. 
Therefore, - the best found split may vary, even with the same training data and - ``max_features=n_features``, if the improvement of the criterion is - identical for several splits enumerated during the search of the best - split. To obtain a deterministic behaviour during fitting, - ``random_state`` has to be fixed. - References ---------- @@ -1030,11 +1029,17 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. @@ -1118,13 +1123,6 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. - The features are always randomly permuted at each split. Therefore, - the best found split may vary, even with the same training data and - ``max_features=n_features``, if the improvement of the criterion is - identical for several splits enumerated during the search of the best - split. To obtain a deterministic behaviour during fitting, - ``random_state`` has to be fixed. - References ---------- @@ -1319,11 +1317,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used to pick randomly the `max_features` used at each split. + See :term:`Glossary ` for details. max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. @@ -1549,11 +1545,9 @@ class ExtraTreeRegressor(DecisionTreeRegressor): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
+ random_state : int, RandomState instance, default=None + Used to pick randomly the `max_features` used at each split. + See :term:`Glossary ` for details. min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity From ceed3eb49047e4604bd8d350c958676883d68b4b Mon Sep 17 00:00:00 2001 From: Marie Douriez Date: Wed, 8 Jan 2020 07:57:54 -0800 Subject: [PATCH 188/448] DOC improve random_state docsting in _ridge and small clean-up (#15728) --- sklearn/linear_model/_ridge.py | 56 ++++++++++++++-------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2973bc70b3282..067e00421234e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -134,7 +134,7 @@ def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3): def _solve_cholesky(X, y, alpha): # w = inv(X^t X + alpha*Id) * X.T y - n_samples, n_features = X.shape + n_features = X.shape[1] n_targets = y.shape[1] A = safe_sparse_dot(X.T, X, dense_output=True) @@ -275,8 +275,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge - coefficients. More stable for singular matrices than - 'cholesky'. + coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution via a Cholesky decomposition of @@ -301,7 +300,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', All last five solvers support both dense and sparse data. However, only - 'sag' and 'sparse_cg' supports sparse input when`fit_intercept` is + 'sag' and 'sparse_cg' supports sparse input when `fit_intercept` is True. .. versionadded:: 0.17 @@ -323,11 +322,8 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', information depending on the solver used. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag'. + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. return_n_iter : bool, default=False If True, the method also returns `n_iter`, the actual number of @@ -488,8 +484,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', coef_, n_iter_, _ = sag_solver( X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, - init, - is_saga=solver == 'saga') + init, is_saga=solver == 'saga') if return_intercept: coef[i] = coef_[:-1] intercept[i] = coef_[-1] @@ -659,8 +654,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge - coefficients. More stable for singular matrices than - 'cholesky'. + coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. 
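
A minimal sketch of the stochastic-solver behaviour documented in the hunk
above: with ``solver='sag'`` (or ``'saga'``) the data are shuffled while
fitting, so an integer ``random_state`` makes repeated calls reproducible.
The data shape, penalty and seed below are illustrative choices, not part of
the patch::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import ridge_regression

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    # Same integer seed -> same shuffling -> identical coefficients.
    coef_a = ridge_regression(X, y, alpha=1.0, solver='sag', random_state=42)
    coef_b = ridge_regression(X, y, alpha=1.0, solver='sag', random_state=42)
    assert np.allclose(coef_a, coef_b)
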
@@ -683,7 +677,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): scaler from sklearn.preprocessing. All last five solvers support both dense and sparse data. However, only - 'sparse_cg' supports sparse input when `fit_intercept` is True. + 'sag' and 'sparse_cg' supports sparse input when `fit_intercept` is + True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. @@ -691,14 +686,11 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): SAGA solver. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag'. + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. .. versionadded:: 0.17 - *random_state* to support Stochastic Average Gradient. + `random_state` to support Stochastic Average Gradient. Attributes ---------- @@ -822,8 +814,7 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge - coefficients. More stable for singular matrices than - 'cholesky'. + coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. @@ -851,11 +842,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): SAGA solver. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``solver`` == 'sag'. + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. Attributes ---------- @@ -1137,11 +1125,13 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), self.store_cv_values = store_cv_values self.is_clf = is_clf - def _decomp_diag(self, v_prime, Q): + @staticmethod + def _decomp_diag(v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) return (v_prime * Q ** 2).sum(axis=-1) - def _diag_dot(self, D, B): + @staticmethod + def _diag_dot(D, B): # compute dot(diag(D), B) if len(B.shape) > 1: # handle case where B is > 1-d @@ -1336,7 +1326,7 @@ def _eigen_decompose_covariance(self, X, y, sqrt_sw): cov[-1] = 0 cov[:, -1] = 0 cov[-1, -1] = sqrt_sw.dot(sqrt_sw) - nullspace_dim = max(0, X.shape[1] - X.shape[0]) + nullspace_dim = max(0, n_features - n_samples) eigvals, V = linalg.eigh(cov) # remove eigenvalues and vectors in the null space of X^T.X eigvals = eigvals[nullspace_dim:] @@ -1469,8 +1459,6 @@ def fit(self, X, y, sample_weight=None): "alphas must be positive. 
Got {} containing some " "negative or null value instead.".format(self.alphas)) - n_samples, n_features = X.shape - X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) @@ -1488,11 +1476,13 @@ def fit(self, X, y, sample_weight=None): decompose = self._svd_decompose_design_matrix solve = self._solve_svd_design_matrix + n_samples = X.shape[0] + if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) sqrt_sw = np.sqrt(sample_weight) else: - sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) + sqrt_sw = np.ones(n_samples, dtype=X.dtype) X_mean, *decomposition = decompose(X, y, sqrt_sw) From 7898d6b80101caa1e8d4a3911eca242f3b86dde5 Mon Sep 17 00:00:00 2001 From: Sarat Addepalli Date: Wed, 8 Jan 2020 21:39:33 +0530 Subject: [PATCH 189/448] DOC improve random_state docstrings in the linear_model module (#11900) --- sklearn/linear_model/_coordinate_descent.py | 80 ++++++++------------ sklearn/linear_model/_passive_aggressive.py | 22 +++--- sklearn/linear_model/_perceptron.py | 10 +-- sklearn/linear_model/_ransac.py | 9 +-- sklearn/linear_model/_sag.py | 10 +-- sklearn/linear_model/_stochastic_gradient.py | 16 ++-- sklearn/linear_model/_theil_sen.py | 9 +-- 7 files changed, 65 insertions(+), 91 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 30ccb0c9f702f..96c001bb7b794 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -577,13 +577,11 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -857,13 +855,11 @@ class Lasso(ElasticNet): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -1310,13 +1306,11 @@ class LassoCV(RegressorMixin, LinearModelCV): positive : bool, optional If positive, restrict regression coefficients to be positive - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -1484,13 +1478,11 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -1662,13 +1654,11 @@ class MultiTaskElasticNet(Lasso): initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -1848,13 +1838,11 @@ class MultiTaskLasso(MultiTaskElasticNet): initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. 
+ Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -2013,13 +2001,11 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration @@ -2189,13 +2175,11 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random - feature to update. If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random' + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. selection : str, default 'cyclic' If set to 'random', a random coefficient is updated every iteration diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index c83a8161c3890..ed0b1c68e1d25 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -73,12 +73,11 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. warm_start : bool, optional When set to True, reuse the solution of the previous call to fit as @@ -319,12 +318,11 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): If the difference between the current prediction and the correct label is below this threshold, the model is not updated. - random_state : int, RandomState instance or None, optional, default=None - The seed of the pseudo random number generator to use when shuffling - the data. 
If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. warm_start : bool, optional When set to True, reuse the solution of the previous call to fit as diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 157083c010390..ff50f6ebbc06e 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -53,11 +53,10 @@ class Perceptron(BaseSGDClassifier): for more details. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. early_stopping : bool, default=False Whether to use early stopping to terminate training when validation. @@ -144,6 +143,7 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. """ + def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 40ebb3a08420f..0363032359524 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -149,11 +149,10 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, If the loss on a sample is greater than the ``residual_threshold``, then this sample is classified as an outlier. - random_state : int, RandomState instance or None, optional, default None - The generator used to initialize the centers. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + The generator used to initialize the centers. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index c5cd88fe6710a..9fe6f076f5145 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -151,12 +151,10 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., verbose : integer, optional The verbosity level. - random_state : int, RandomState instance or None, optional, default None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. 
+ random_state : int, RandomState instance, default=None + Used when shuffling the data. Pass an int for reproducible output + across multiple function calls. + See :term:`Glossary `. check_input : bool, default True If False, the input arrays X and y will not be checked. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index a9775a4ae850e..2ca7eedd98a06 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -809,11 +809,9 @@ class SGDClassifier(BaseSGDClassifier): for more details. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. learning_rate : str, default='optimal' The learning rate schedule: @@ -1417,11 +1415,9 @@ class SGDRegressor(BaseSGDRegressor): and the correct label are ignored if they are less than this threshold. random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. learning_rate : string, default='invscaling' The learning rate schedule: diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 9adf8109a10ef..35cb3b8f25471 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -240,12 +240,11 @@ class TheilSenRegressor(RegressorMixin, LinearModel): tol : float, optional, default 1.e-3 Tolerance when calculating spatial median. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None A random number generator instance to define the state of the random - permutations generator. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is the - random number generator; If None, the random number generator is the - RandomState instance used by `np.random`. + permutations generator. Pass an int for reproducible output across + multiple function calls. + See :term:`Glossary ` n_jobs : int or None, optional (default=None) Number of CPUs to use during the cross validation. 
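
The wording adopted throughout this patch, "Pass an int for reproducible
output across multiple function calls", can be exercised directly. For
example, ``SGDClassifier`` shuffles the training data when ``shuffle=True``
(the default), and an integer seed pins that shuffling down; the dataset and
seed below are arbitrary illustration values::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=100, n_features=5, random_state=0)
    # Two fits with the same seed see the data in the same order and
    # therefore end up with identical weights.
    clf_a = SGDClassifier(random_state=7).fit(X, y)
    clf_b = SGDClassifier(random_state=7).fit(X, y)
    assert np.allclose(clf_a.coef_, clf_b.coef_)
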
From 801f729498ab19b18e7130376cc5fb83ae4c27b5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Jan 2020 18:51:51 +0100 Subject: [PATCH 190/448] DOC improve docstring random_state in cluster module (#16056) --- sklearn/cluster/_kmeans.py | 8 ++++---- sklearn/cluster/_mean_shift.py | 2 +- sklearn/cluster/_spectral.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 3bfc6328b9ac1..92dae7a4d7726 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -238,7 +238,7 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', tol : float, optional The relative increment in the results before declaring convergence. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. @@ -377,7 +377,7 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, precompute_distances : boolean, default: True Precompute distances (faster but takes more memory). - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. @@ -580,7 +580,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, init : {'k-means++', 'random' or ndarray or callable} optional Method for initialization - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. @@ -1147,7 +1147,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, the distances of each sample to its closest center. May not be None when random_reassign is True. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization and to pick new clusters amongst observations with uniform probability. Use an int to make the randomness deterministic. diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 0b1a1f99c26de..b5c69f6b92f16 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -45,7 +45,7 @@ def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0, n_samples : int, optional The number of samples to use. If not given, all samples are used. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None The generator used to randomly select the samples from input points for bandwidth estimation. Use an int to make the randomness deterministic. 
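
A short sketch of the two seeding behaviours documented just above, the
k-means centroid initialisation and the subsample drawn by
``estimate_bandwidth`` (cluster count, data and seed values are illustrative
only)::

    from sklearn.cluster import KMeans, estimate_bandwidth
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=150, centers=3, random_state=0)
    # Integer seed -> deterministic centroid initialisation, stable labels.
    km = KMeans(n_clusters=3, random_state=42).fit(X)
    # Integer seed -> the same subsample is drawn on every call.
    bw = estimate_bandwidth(X, quantile=0.3, n_samples=50, random_state=1)
    print(km.cluster_centers_.round(2))
    print(round(bw, 3))
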
diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index b6c5586e75b47..34ece7688ec92 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -38,7 +38,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, Maximum number of iterations to attempt in rotation and partition matrix search if machine precision convergence is not reached - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for rotation matrix initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. @@ -194,7 +194,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by the K-Means initialization. Use an int to make the randomness @@ -310,7 +310,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): n_components : integer, optional, default=n_clusters Number of eigen vectors to use for the spectral embedding - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by the K-Means initialization. Use an int to make the randomness From cb86aae1edabce5e93f36070d223ed2591930b31 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Jan 2020 18:52:31 +0100 Subject: [PATCH 191/448] DOC improve random_state docstring of datasets module (#16057) --- sklearn/datasets/_base.py | 2 +- sklearn/datasets/_covtype.py | 2 +- sklearn/datasets/_kddcup99.py | 2 +- sklearn/datasets/_olivetti_faces.py | 2 +- sklearn/datasets/_rcv1.py | 2 +- sklearn/datasets/_samples_generator.py | 40 +++++++++++++------------- sklearn/datasets/_twenty_newsgroups.py | 2 +- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 334e0a72b47c6..bdb5cfe8772ca 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -154,7 +154,7 @@ def load_files(container_path, description=None, categories=None, contains characters not of the given `encoding`. Passed as keyword argument 'errors' to bytes.decode. - random_state : int, RandomState instance or None (default=0) + random_state : int, RandomState instance or None, default=0 Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 233b19678f6de..f9fab853adc84 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -66,7 +66,7 @@ def fetch_covtype(data_home=None, download_if_missing=True, If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. 
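
The loaders above use ``random_state`` for shuffling the returned samples, so
the convention is easy to check without downloading anything;
``sklearn.utils.shuffle`` follows the same seeding rule. The array and seed
below are arbitrary::

    import numpy as np
    from sklearn.utils import shuffle

    idx = np.arange(10)
    # The same integer seed yields the same permutation on every call.
    print(shuffle(idx, random_state=0))
    print(shuffle(idx, random_state=0))
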
diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index cfda9cfaeca84..0a8121521ac82 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -76,7 +76,7 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, shuffle : bool, default=False Whether to shuffle dataset. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling and for selection of abnormal samples if `subset='SA'`. Pass an int for reproducible output across multiple function calls. diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index 004f26b464836..f88f088e82e8b 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -61,7 +61,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, If True the order of the dataset is shuffled to avoid having images of the same person grouped. - random_state : int, RandomState instance or None (default=0) + random_state : int, RandomState instance or None, default=0 Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 887a8271eae5e..0836fe1249271 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -111,7 +111,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 10c87d988c324..d57dfc9bda999 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -124,7 +124,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, shuffle : boolean, optional (default=True) Shuffle the samples and the features. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -320,7 +320,7 @@ def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, probabilities of features given classes, from which the data was drawn. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -437,7 +437,7 @@ def make_hastie_10_2(n_samples=12000, random_state=None): n_samples : int, optional (default=12000) The number of samples. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. 
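
For the sample generators documented in this hunk and the ones that follow,
"reproducible output across multiple function calls" simply means that two
calls with the same integer seed return identical arrays; the sizes and seed
below are arbitrary illustration values::

    import numpy as np
    from sklearn.datasets import make_classification

    X1, y1 = make_classification(n_samples=50, random_state=123)
    X2, y2 = make_classification(n_samples=50, random_state=123)
    assert np.array_equal(X1, X2) and np.array_equal(y1, y2)
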
@@ -528,7 +528,7 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, coef : boolean, optional (default=False) If True, the coefficients of the underlying linear model are returned. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -615,7 +615,7 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, noise : double or None (default=None) Standard deviation of Gaussian noise added to the data. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling and noise. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -685,7 +685,7 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): noise : double or None (default=None) Standard deviation of Gaussian noise added to the data. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling and noise. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -764,7 +764,7 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, shuffle : boolean, optional (default=True) Shuffle the samples. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -901,7 +901,7 @@ def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): noise : float, optional (default=0.0) The standard deviation of the gaussian noise applied to the output. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -962,7 +962,7 @@ def make_friedman2(n_samples=100, noise=0.0, random_state=None): noise : float, optional (default=0.0) The standard deviation of the gaussian noise applied to the output. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1027,7 +1027,7 @@ def make_friedman3(n_samples=100, noise=0.0, random_state=None): noise : float, optional (default=0.0) The standard deviation of the gaussian noise applied to the output. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1103,7 +1103,7 @@ def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, The relative importance of the fat noisy tail of the singular values profile. 
- random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1156,7 +1156,7 @@ def make_sparse_coded_signal(n_samples, n_components, n_features, n_nonzero_coefs : int number of active (non-zero) coefficients in each sample - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1215,7 +1215,7 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): n_features : int, optional (default=10) The number of features. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1255,7 +1255,7 @@ def make_spd_matrix(n_dim, random_state=None): n_dim : int The matrix dimension. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1304,7 +1304,7 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, largest_coef : float between 0 and 1, optional (default=0.9) The value of the largest coefficient. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1365,7 +1365,7 @@ def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): noise : float, optional (default=0.0) The standard deviation of the gaussian noise. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1417,7 +1417,7 @@ def make_s_curve(n_samples=100, noise=0.0, random_state=None): noise : float, optional (default=0.0) The standard deviation of the gaussian noise. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1480,7 +1480,7 @@ def make_gaussian_quantiles(mean=None, cov=1., n_samples=100, shuffle : boolean, optional (default=True) Shuffle the samples. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1568,7 +1568,7 @@ def make_biclusters(shape, n_clusters, noise=0.0, minval=10, shuffle : boolean, optional (default=True) Shuffle the samples. 
- random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1659,7 +1659,7 @@ def make_checkerboard(shape, n_clusters, noise=0.0, minval=10, shuffle : boolean, optional (default=True) Shuffle the samples. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index 5d43aa7c558ad..a1edc08019c85 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -184,7 +184,7 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, make the assumption that the samples are independent and identically distributed (i.i.d.), such as stochastic gradient descent. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. From cd6e4d52d9bcd8d2152b97c9cf8902eeec4642ca Mon Sep 17 00:00:00 2001 From: Sarat Addepalli Date: Wed, 8 Jan 2020 23:23:01 +0530 Subject: [PATCH 192/448] DOC improve random_state docstring from decomposition module (#11902) * [MRG] Fix random_state docstrings in decomposition module * Add more details, and specify reproducibility * [MRG] docs for random_state in decomposition module Update the wording for reproducible results. * [MRG] Update doc for optional parameter to match numpy doc style * apply new style Co-authored-by: Guillaume Lemaitre --- sklearn/decomposition/_dict_learning.py | 45 ++++++++++++----------- sklearn/decomposition/_factor_analysis.py | 9 ++--- sklearn/decomposition/_fastica.py | 20 +++++----- sklearn/decomposition/_kernel_pca.py | 9 ++--- sklearn/decomposition/_lda.py | 8 ++-- sklearn/decomposition/_nmf.py | 39 ++++++++++---------- sklearn/decomposition/_pca.py | 9 ++--- sklearn/decomposition/_sparse_pca.py | 19 +++++----- sklearn/decomposition/_truncated_svd.py | 9 ++--- 9 files changed, 80 insertions(+), 87 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index a2f3f601f4127..f19305dbfc272 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -361,11 +361,10 @@ def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, Whether to compute and return the residual sum of squares corresponding to the computed solution. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. positive : boolean, optional Whether to enforce positivity when finding the dictionary. 
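
The same seeding convention is documented for the mini-batch variant further
down in this patch, where the seed also drives the dictionary initialisation
and the shuffling of the data. A small sketch (data shape, number of atoms
and seed are arbitrary choices, not taken from the patch)::

    import numpy as np
    from sklearn.decomposition import MiniBatchDictionaryLearning

    X = np.random.RandomState(0).randn(30, 8)
    # Identical seeds give identical initialisation, shuffling and updates,
    # hence identical learned atoms.
    est_a = MiniBatchDictionaryLearning(n_components=4, random_state=42).fit(X)
    est_b = MiniBatchDictionaryLearning(n_components=4, random_state=42).fit(X)
    assert np.allclose(est_a.components_, est_b.components_)
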
@@ -483,10 +482,9 @@ def dict_learning(X, n_components, alpha, max_iter=100, tol=1e-8, To control the verbosity of the procedure. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. return_n_iter : bool Whether or not to return the number of iterations. @@ -690,10 +688,11 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, initialization. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. return_inner_stats : boolean, optional Return the inner statistics A (dictionary covariance) and B @@ -1132,11 +1131,12 @@ class DictionaryLearning(SparseCodingMixin, BaseEstimator): its negative part and its positive part. This can improve the performance of downstream classifiers. - random_state : int, RandomState instance or None, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, optional (default=None) + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. positive_code : bool, default=False Whether to enforce positivity when finding the code. @@ -1323,10 +1323,11 @@ class MiniBatchDictionaryLearning(SparseCodingMixin, BaseEstimator): performance of downstream classifiers. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. positive_code : bool Whether to enforce positivity when finding the code. diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 14f0648d937bc..f9bb249c8a057 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -89,11 +89,10 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): Number of iterations for the power method. 3 by default. 
Only used if ``svd_method`` equals 'randomized' - random_state : int, RandomState instance or None, optional (default=0) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Only used when ``svd_method`` equals 'randomized'. + random_state : int, RandomState instance, default=None + Only used when ``svd_method`` equals 'randomized'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index c191f5e41ab41..44e665556b805 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -202,11 +202,11 @@ def my_g(x): Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.'s is used. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. return_X_mean : bool, optional If True, X_mean is returned too. @@ -341,11 +341,11 @@ def my_g(x): w_init : None of an (n_components, n_components) ndarray The mixing matrix to be used to initialize the algorithm. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 169b0942e74bd..a8559f341591b 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -76,11 +76,10 @@ class KernelPCA(TransformerMixin, BaseEstimator): When n_components is None, this parameter is ignored and components with zero eigenvalues are removed regardless. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``eigen_solver`` == 'arpack'. + random_state : int, RandomState instance, default=None + Used when ``eigen_solver`` == 'arpack'. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. .. 
versionadded:: 0.18 diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 8fcb51896d190..48b52df811734 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -222,11 +222,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): verbose : int, optional (default=0) Verbosity level. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 6d5509611cefd..9458390c88414 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -287,11 +287,10 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, eps : float Truncate all values less then this in output to zero. - random_state : int, RandomState instance or None, optional, default: None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'. + random_state : int, RandomState instance, default=None + Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. Returns ------- @@ -472,11 +471,11 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0, shuffle : boolean, default: False If true, randomize the order of coordinates in the CD solver. - random_state : int, RandomState instance or None, optional, default: None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used to randomize the coordinates in the CD solver, when + ``shuffle`` is set to ``True``. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. Returns ------- @@ -963,11 +962,11 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, Select whether the regularization affects the components (H), the transformation (W), both or none of them. - random_state : int, RandomState instance or None, optional, default: None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used for NMF initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. verbose : integer, default: 0 The verbosity level. @@ -1156,11 +1155,11 @@ class NMF(TransformerMixin, BaseEstimator): max_iter : integer, default: 200 Maximum number of iterations before timing out. 
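The NMF entries in this hunk describe the seed as driving both initialisation (when ``init`` is 'random' or 'nndsvdar') and Coordinate Descent; a small sketch of the resulting reproducibility (matrix sizes are arbitrary)::

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(1).randn(20, 6))
    # With init='random' the seed drives the random W/H initialisation,
    # so refitting with the same int gives the same factorisation.
    W1 = NMF(n_components=3, init='random', random_state=0,
             max_iter=500).fit_transform(X)
    W2 = NMF(n_components=3, init='random', random_state=0,
             max_iter=500).fit_transform(X)
    assert np.allclose(W1, W2)
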
- random_state : int, RandomState instance or None, optional, default: None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used for initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. alpha : double, default: 0. Constant that multiplies the regularization terms. Set it to zero to diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e3fcf2dfcc6bb..faa083e099b5e 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -189,11 +189,10 @@ class PCA(_BasePCA): .. versionadded:: 0.18.0 - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + random_state : int, RandomState instance, default=None + Used when ``svd_solver`` == 'arpack' or 'randomized'. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. .. versionadded:: 0.18.0 diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 3e31994d6894d..98aecc4a43db8 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -79,11 +79,10 @@ class SparsePCA(TransformerMixin, BaseEstimator): verbose : int Controls the verbosity; the higher, the more messages. Defaults to 0. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used during dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. normalize_components : 'deprecated' This parameter does not have any effect. The components are always @@ -282,11 +281,11 @@ class MiniBatchSparsePCA(SparsePCA): Lasso solution (linear_model.Lasso). Lars will be faster if the estimated components are sparse. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used for random shuffling when ``shuffle`` is set to ``True``, + during online dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. normalize_components : 'deprecated' This parameter does not have any effect. 
The components are always diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 73e4dfbe9f547..e3bddd23c4de8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -56,11 +56,10 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): `~sklearn.utils.extmath.randomized_svd` to handle sparse matrices that may have large slowly decaying spectrum. - random_state : int, RandomState instance or None, optional, default = None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Used during randomized svd. Pass an int for reproducible results across + multiple function calls. + See :term:`Glossary `. tol : float, optional Tolerance for ARPACK. 0 means machine precision. Ignored by randomized From d367ea474c6e60602ced053d383810e5fc8dde62 Mon Sep 17 00:00:00 2001 From: Madhura Jayaratne Date: Thu, 9 Jan 2020 18:43:53 +1100 Subject: [PATCH 193/448] MNT Avoid calling check_array/check_X_y twice (#16044) --- sklearn/naive_bayes.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 8c10ae54303e7..0b8ec518d2f28 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1146,21 +1146,15 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): sample_weight=sample_weight) def _check_X(self, X): - # FIXME: we can avoid calling check_array twice after #14872 is merged. - # X = check_array(X, y, dtype='int', accept_sparse=False, - # force_all_finite=True) - X = check_array(X, accept_sparse=False, force_all_finite=True) - X = check_array(X, dtype='int') + X = check_array(X, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X def _check_X_y(self, X, y): - # FIXME: we can avoid calling check_array twice after #14872 is merged. 
- # X, y = check_array(X, y, dtype='int', accept_sparse=False, - # force_all_finite=True) - X, y = check_X_y(X, y, accept_sparse=False, force_all_finite=True) - X, y = check_X_y(X, y, dtype='int') + X, y = check_X_y(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y From 40f668c03891caaeda231279fe12cbb95b198cbe Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Thu, 9 Jan 2020 14:41:21 +0530 Subject: [PATCH 194/448] MNT replace self.solver with solver in _ridge.py (#16054) --- sklearn/linear_model/_ridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 067e00421234e..15469391388b1 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -573,7 +573,7 @@ def fit(self, X, y, sample_weight=None): if solver == 'sag' and sparse.issparse(X) and self.fit_intercept: self.coef_, self.n_iter_, self.intercept_ = _ridge_regression( X, y, alpha=self.alpha, sample_weight=sample_weight, - max_iter=self.max_iter, tol=self.tol, solver=self.solver, + max_iter=self.max_iter, tol=self.tol, solver='sag', random_state=self.random_state, return_n_iter=True, return_intercept=True, check_input=False) # add the offset which was subtracted by _preprocess_data From eb1244963dea7d54ce5262aa67fe2476a68a4b18 Mon Sep 17 00:00:00 2001 From: Mandy Gu <32079152+happilyeverafter95@users.noreply.github.com> Date: Thu, 9 Jan 2020 04:54:45 -0500 Subject: [PATCH 195/448] DOC improve random_state docstring random modue (#15576) --- sklearn/utils/random.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index d59d578ff1a1d..524c90406e6eb 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -36,11 +36,9 @@ def _random_choice_csc(n_samples, classes, class_probability=None, Optional (default=None). Class distribution of each column. If None the uniform distribution is assumed. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Controls the randomness of the sampled classes. + See :term:`Glossary `. Returns ------- From 4fe4d27eb1b88cc799199f2176ac734bbc16ca31 Mon Sep 17 00:00:00 2001 From: Thomas Schmitt Date: Thu, 9 Jan 2020 15:59:30 +0100 Subject: [PATCH 196/448] DOC Add version_added label for impute module (#15549) --- sklearn/impute/_base.py | 6 ++++++ sklearn/impute/_iterative.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index c952831d85e1f..6ef34b8312e23 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -120,6 +120,10 @@ class SimpleImputer(_BaseImputer): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer` + estimator which is now removed. + Parameters ---------- missing_values : number, string, np.nan (default) or None @@ -463,6 +467,8 @@ class MissingIndicator(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. 
versionadded:: 0.20 + Parameters ---------- missing_values : number, string, np.nan (default) or None diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 7983b8dbe4062..fb3d4470c8e6e 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -34,6 +34,8 @@ class IterativeImputer(_BaseImputer): Read more in the :ref:`User Guide `. + .. versionadded:: 0.21 + .. note:: This estimator is still **experimental** for now: the predictions From afe6e513a454a325de5e1ba0f1dc4db8928522b6 Mon Sep 17 00:00:00 2001 From: Titus Christian Date: Fri, 10 Jan 2020 01:02:28 +0700 Subject: [PATCH 197/448] DOC improve svm module docstring per guidlines (#16060) --- sklearn/svm/_base.py | 59 ++++----- sklearn/svm/_bounds.py | 12 +- sklearn/svm/_classes.py | 270 ++++++++++++++++++++-------------------- 3 files changed, 174 insertions(+), 167 deletions(-) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index d327e0fef26e4..e83ee8ada9d57 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -109,17 +109,17 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. For kernel="precomputed", the expected shape of X is (n_samples, n_samples). - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values (class labels in classification, real numbers in regression) - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,), default=None Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. @@ -304,13 +304,13 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) For kernel="precomputed", the expected shape of X is (n_samples_test, n_samples_train). Returns ------- - y_pred : array, shape (n_samples,) + y_pred : ndarray of shape (n_samples,) """ X = self._validate_for_predict(X) predict = self._sparse_predict if self._sparse else self._dense_predict @@ -377,11 +377,11 @@ def _decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Returns ------- - X : array-like, shape (n_samples, n_class * (n_class-1) / 2) + X : array-like of shape (n_samples, n_class * (n_class-1) / 2) Returns the decision function of the sample for each class in the model. """ @@ -539,11 +539,11 @@ def decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Returns ------- - X : array-like, shape (n_samples, n_classes * (n_classes-1) / 2) + X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2) Returns the decision function of the sample for each class in the model. If decision_function_shape='ovr', the shape is (n_samples, @@ -572,13 +572,14 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) For kernel="precomputed", the expected shape of X is - [n_samples_test, n_samples_train] + (n_samples_test, n_samples_train). 
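The precomputed-kernel shapes spelled out in these docstrings can be checked with a toy Gram matrix; this is only a sketch, with made-up data::

    import numpy as np
    from sklearn.svm import SVC

    rng = np.random.RandomState(0)
    X_train, y_train = rng.randn(6, 2), np.array([0, 0, 0, 1, 1, 1])
    X_test = rng.randn(3, 2)

    clf = SVC(kernel='precomputed')
    # Training Gram matrix has shape (n_samples_train, n_samples_train).
    clf.fit(X_train @ X_train.T, y_train)
    # Test kernel has shape (n_samples_test, n_samples_train).
    print(clf.predict(X_test @ X_train.T))
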
Returns ------- - y_pred : array, shape (n_samples,) + y_pred : ndarray of shape (n_samples,) Class labels for samples in X. """ check_is_fitted(self) @@ -615,13 +616,13 @@ def predict_proba(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) For kernel="precomputed", the expected shape of X is [n_samples_test, n_samples_train] Returns ------- - T : array-like, shape (n_samples, n_classes) + T : ndarray of shape (n_samples, n_classes) Returns the probability of the sample for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute :term:`classes_`. @@ -654,13 +655,14 @@ def predict_log_proba(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) For kernel="precomputed", the expected shape of X is - [n_samples_test, n_samples_train] + (n_samples_test, n_samples_train). Returns ------- - T : array-like, shape (n_samples, n_classes) + T : ndarray of shape (n_samples, n_classes) Returns the log-probabilities of the sample for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute :term:`classes_`. @@ -804,11 +806,11 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target vector relative to X C : float @@ -825,7 +827,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, In order to avoid this, one should increase the intercept_scaling. such that the feature vector becomes [x, intercept_scaling]. - class_weight : {dict, 'balanced'}, optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same @@ -835,7 +837,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - penalty : str, {'l1', 'l2'} + penalty : {'l1', 'l2'} The norm of the penalty used in regularization. dual : bool @@ -850,14 +852,14 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, tol : float Stopping condition. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - multi_class : str, {'ovr', 'crammer_singer'} + multi_class : {'ovr', 'crammer_singer'}, default='ovr' `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` optimizes a joint objective over all classes. 
While `crammer_singer` is interesting from an theoretical perspective @@ -866,21 +868,22 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, If `crammer_singer` is chosen, the options loss, penalty and dual will be ignored. - loss : str, {'logistic_regression', 'hinge', 'squared_hinge', - 'epsilon_insensitive', 'squared_epsilon_insensitive} + loss : {'logistic_regression', 'hinge', 'squared_hinge', \ + 'epsilon_insensitive', 'squared_epsilon_insensitive}, \ + default='logistic_regression' The loss function used to fit the model. - epsilon : float, optional (default=0.1) + epsilon : float, default=0.1 Epsilon parameter in the epsilon-insensitive loss function. Note that the value of this parameter depends on the scale of the target variable y. If unsure, set epsilon=0. - sample_weight : array-like, optional + sample_weight : array-like of shape (n_samples,), default=None Weights assigned to each sample. Returns ------- - coef_ : ndarray, shape (n_features, n_features + 1) + coef_ : ndarray of shape (n_features, n_features + 1) The coefficient vector got by minimizing the objective function. intercept_ : float diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index c60f0cd033213..1e1ed8939ce5f 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -21,23 +21,23 @@ def l1_min_c(X, y, loss='squared_hinge', fit_intercept=True, Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. - y : array, shape = [n_samples] - Target vector relative to X + y : array-like of shape (n_samples,) + Target vector relative to X. - loss : {'squared_hinge', 'log'}, default 'squared_hinge' + loss : {'squared_hinge', 'log'}, default='squared_hinge' Specifies the loss function. With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss). With 'log' it is the loss of logistic regression models. - fit_intercept : bool, default: True + fit_intercept : bool, default=True Specifies if the intercept should be fitted by the model. It must match the fit() method parameter. - intercept_scaling : float, default: 1 + intercept_scaling : float, default=1.0 when fit_intercept is True, instance vector x becomes [x, intercept_scaling], i.e. a "synthetic" feature with constant value equals to diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 50c2356142ae2..c8a27a12e72e5 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -26,28 +26,28 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, Parameters ---------- - penalty : str, 'l1' or 'l2' (default='l2') + penalty : {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse. - loss : str, 'hinge' or 'squared_hinge' (default='squared_hinge') + loss : {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. - dual : bool, (default=True) + dual : bool, default=True Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. 
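``l1_min_c``, documented in this patch, pairs naturally with an L1-penalised ``LinearSVC``: it returns the smallest ``C`` below which the fitted coefficients are all zero. A rough sketch on the iris data (the 10x factor is arbitrary)::

    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC, l1_min_c

    X, y = load_iris(return_X_y=True)
    c_min = l1_min_c(X, y, loss='squared_hinge')
    # Just above c_min at least one coefficient becomes non-zero.
    clf = LinearSVC(penalty='l1', dual=False, C=10 * c_min,
                    max_iter=10000).fit(X, y)
    print(c_min, (clf.coef_ != 0).any())
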
- C : float, optional (default=1.0) + C : float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. - multi_class : str, 'ovr' or 'crammer_singer' (default='ovr') + multi_class : {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``"ovr"`` trains n_classes one-vs-rest classifiers, while @@ -58,12 +58,12 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, If ``"crammer_singer"`` is chosen, the options loss, penalty and dual will be ignored. - fit_intercept : bool, optional (default=True) + fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered). - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1 When self.fit_intercept is True, instance vector x becomes ``[x, self.intercept_scaling]``, i.e. a "synthetic" feature with constant value equals to @@ -74,7 +74,7 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : {dict, 'balanced'}, optional + class_weight : dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. @@ -82,12 +82,12 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. - verbose : int, (default=0) + verbose : int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data for the dual coordinate descent (if ``dual=True``). When ``dual=False`` the underlying implementation of :class:`LinearSVC` @@ -97,23 +97,23 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, None, the random number generator is the RandomState instance used by `np.random`. - max_iter : int, (default=1000) + max_iter : int, default=1000 The maximum number of iterations to be run. Attributes ---------- - coef_ : array, shape = [1, n_features] if n_classes == 2 \ -else [n_classes, n_features] + coef_ : ndarray of shape (1, n_features) if n_classes == 2 \ + else (n_classes, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. ``coef_`` is a readonly property derived from ``raw_coef_`` that follows the internal memory layout of liblinear. - intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) Constants in decision function. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The unique classes labels. n_iter_ : int @@ -267,29 +267,30 @@ class LinearSVR(RegressorMixin, LinearModel): Parameters ---------- - epsilon : float, optional (default=0.0) + epsilon : float, default=0.0 Epsilon parameter in the epsilon-insensitive loss function. 
Note that the value of this parameter depends on the scale of the target variable y. If unsure, set ``epsilon=0``. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. - C : float, optional (default=1.0) + C : float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. - loss : string, optional (default='epsilon_insensitive') + loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, \ + default='epsilon_insensitive' Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the squared epsilon-insensitive loss ('squared_epsilon_insensitive') is the L2 loss. - fit_intercept : boolean, optional (default=True) + fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered). - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1. When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equals to @@ -300,35 +301,36 @@ class LinearSVR(RegressorMixin, LinearModel): To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - dual : bool, (default=True) + dual : bool, default=True Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. - verbose : int, (default=0) + verbose : int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_iter : int, (default=1000) + max_iter : int, default=1000 The maximum number of iterations to be run. Attributes ---------- - coef_ : array, shape = [n_features] if n_classes == 2 else [n_classes, n_features] + coef_ : ndarray of shape (n_features) if n_classes == 2 \ + else (n_classes, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. `coef_` is a readonly property derived from `raw_coef_` that follows the internal memory layout of liblinear. - intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) Constants in decision function. n_iter_ : int @@ -402,6 +404,7 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + An instance of the estimator. """ # FIXME Remove l1/l2 support in 0.23 ---------------------------------- msg = ("loss='%s' has been deprecated in favor of " @@ -456,12 +459,12 @@ class SVC(BaseSVC): Parameters ---------- - C : float, optional (default=1.0) + C : float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. 
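``LinearSVR.fit`` above now documents its return value; a compact usage sketch may help (synthetic data, coefficients chosen arbitrarily)::

    import numpy as np
    from sklearn.svm import LinearSVR

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100)
    # fit returns the estimator itself, so calls can be chained.
    reg = LinearSVR(epsilon=0.0, C=1.0, max_iter=10000).fit(X, y)
    print(reg.coef_.shape, reg.intercept_.shape)  # (3,) and (1,)
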
- kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. @@ -469,11 +472,11 @@ class SVC(BaseSVC): used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -483,26 +486,26 @@ class SVC(BaseSVC): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - shrinking : boolean, optional (default=True) + shrinking : bool, default=True Whether to use the shrinking heuristic. - probability : boolean, optional (default=False) + probability : bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance for stopping criterion. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - class_weight : {dict, 'balanced'}, optional + class_weight : dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. @@ -510,15 +513,15 @@ class SVC(BaseSVC): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. - decision_function_shape : 'ovo', 'ovr', default='ovr' + decision_function_shape : {'ovo', 'ovr'}, default='ovr' Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape @@ -534,7 +537,7 @@ class SVC(BaseSVC): .. versionchanged:: 0.17 Deprecated *decision_function_shape='ovo' and None*. - break_ties : bool, optional (default=False) + break_ties : bool, default=False If true, ``decision_function_shape='ovr'``, and number of classes > 2, :term:`predict` will break ties according to the confidence values of :term:`decision_function`; otherwise the first class among the tied @@ -543,7 +546,7 @@ class SVC(BaseSVC): .. versionadded:: 0.22 - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator used when shuffling the data for probability estimates. 
If int, random_state is the seed used by the random number generator; If RandomState instance, @@ -552,23 +555,23 @@ class SVC(BaseSVC): Attributes ---------- - support_ : array-like of shape (n_SV) + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like of shape (n_SV, n_features) + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - n_support_ : array-like, dtype=int32, shape = [n_class] + n_support_ : ndarray of shape (n_class,), dtype=int32 Number of support vectors for each class. - dual_coef_ : array, shape = [n_class-1, n_SV] + dual_coef_ : ndarray of shape (n_class-1, n_SV) Coefficients of the support vector in the decision function. For multiclass, coefficient for all 1-vs-1 classifiers. The layout of the coefficients in the multiclass case is somewhat non-trivial. See the section about multi-class classification in the SVM section of the User Guide for details. - coef_ : array, shape = [n_class * (n_class-1) / 2, n_features] + coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. @@ -581,11 +584,11 @@ class SVC(BaseSVC): fit_status_ : int 0 if correctly fitted, 1 otherwise (will raise warning) - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. - probA_ : array, shape = [n_class * (n_class-1) / 2] - probB_ : array, shape = [n_class * (n_class-1) / 2] + probA_ : ndarray of shape (n_class * (n_class-1) / 2) + probB_ : ndarray of shape (n_class * (n_class-1) / 2) If `probability=True`, it corresponds to the parameters learned in Platt scaling to produce probability estimates from decision values. If `probability=False`, it's an empty array. Platt scaling uses the @@ -665,23 +668,23 @@ class NuSVC(BaseSVC): Parameters ---------- - nu : float, optional (default=0.5) + nu : float, default=0.5 An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. - kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -691,41 +694,41 @@ class NuSVC(BaseSVC): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - shrinking : boolean, optional (default=True) + shrinking : bool, default=True Whether to use the shrinking heuristic. - probability : boolean, optional (default=False) + probability : bool, default=False Whether to enable probability estimates. 
This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance for stopping criterion. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - class_weight : {dict, 'balanced'}, optional + class_weight : {dict, 'balanced'}, default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies as ``n_samples / (n_classes * np.bincount(y))`` - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. - decision_function_shape : 'ovo', 'ovr', default='ovr' + decision_function_shape : {'ovo', 'ovr'}, default='ovr' Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape @@ -740,7 +743,7 @@ class NuSVC(BaseSVC): .. versionchanged:: 0.17 Deprecated *decision_function_shape='ovo' and None*. - break_ties : bool, optional (default=False) + break_ties : bool, default=False If true, ``decision_function_shape='ovr'``, and number of classes > 2, :term:`predict` will break ties according to the confidence values of :term:`decision_function`; otherwise the first class among the tied @@ -749,7 +752,7 @@ class NuSVC(BaseSVC): .. versionadded:: 0.22 - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator used when shuffling the data for probability estimates. If int, random_state is the seed used by the random number generator; If RandomState instance, @@ -758,23 +761,23 @@ class NuSVC(BaseSVC): Attributes ---------- - support_ : array-like of shape (n_SV) + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like of shape (n_SV, n_features) + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - n_support_ : array-like, dtype=int32, shape = [n_class] + n_support_ : ndarray of shape (n_class), dtype=int32 Number of support vectors for each class. - dual_coef_ : array, shape = [n_class-1, n_SV] + dual_coef_ : ndarray of shape (n_class-1, n_SV) Coefficients of the support vector in the decision function. For multiclass, coefficient for all 1-vs-1 classifiers. The layout of the coefficients in the multiclass case is somewhat non-trivial. See the section about multi-class classification in the SVM section of the User Guide for details. - coef_ : array, shape = [n_class * (n_class-1) / 2, n_features] + coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. @@ -784,13 +787,13 @@ class NuSVC(BaseSVC): intercept_ : ndarray of shape (n_class * (n_class-1) / 2,) Constants in decision function. 
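A short, hedged demonstration of the ``nu`` parameter documented above (the toy data follows the style of the class docstring example and is not part of this patch)::

    import numpy as np
    from sklearn.svm import NuSVC

    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]], dtype=float)
    y = np.array([1, 1, 2, 2])
    # nu upper-bounds the fraction of margin errors and lower-bounds
    # the fraction of support vectors.
    clf = NuSVC(nu=0.5, kernel='rbf', gamma='scale').fit(X, y)
    print(clf.n_support_, clf.classes_)
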
- classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The unique classes labels. fit_status_ : int 0 if correctly fitted, 1 if the algorithm did not converge. - probA_ : ndarray, shape of (n_class * (n_class-1) / 2,) + probA_ : ndarray of shape (n_class * (n_class-1) / 2,) probB_ : ndarray of shape (n_class * (n_class-1) / 2,) If `probability=True`, it corresponds to the parameters learned in Platt scaling to produce probability estimates from decision values. @@ -873,18 +876,18 @@ class SVR(RegressorMixin, BaseLibSVM): Parameters ---------- - kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -894,50 +897,50 @@ class SVR(RegressorMixin, BaseLibSVM): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance for stopping criterion. - C : float, optional (default=1.0) + C : float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. - epsilon : float, optional (default=0.1) + epsilon : float, default=0.1 Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value. - shrinking : boolean, optional (default=True) + shrinking : bool, default=True Whether to use the shrinking heuristic. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. Attributes ---------- - support_ : array-like of shape (n_SV) + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like of shape (n_SV, n_features) + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - dual_coef_ : array, shape = [1, n_SV] + dual_coef_ : ndarray of shape (1, n_SV) Coefficients of the support vector in the decision function. - coef_ : array, shape = [1, n_features] + coef_ : ndarray of shape (1, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. 
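For the SVR attributes whose shapes are rewritten above, a sketch showing where ``support_vectors_`` and ``dual_coef_`` come from (synthetic 1-D regression data)::

    import numpy as np
    from sklearn.svm import SVR

    rng = np.random.RandomState(0)
    X = np.sort(rng.rand(40, 1), axis=0)
    y = np.sin(4 * X).ravel() + 0.05 * rng.randn(40)
    svr = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X, y)
    # Samples outside the epsilon-tube become support vectors.
    print(svr.support_vectors_.shape, svr.dual_coef_.shape)
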
@@ -947,7 +950,7 @@ class SVR(RegressorMixin, BaseLibSVM): fit_status_ : int 0 if correctly fitted, 1 otherwise (will raise warning) - intercept_ : array, shape = [1] + intercept_ : ndarray of shape (1,) Constants in decision function. Examples @@ -1005,26 +1008,26 @@ class NuSVR(RegressorMixin, BaseLibSVM): Parameters ---------- - nu : float, optional + nu : float, default=0.5 An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken. - C : float, optional (default=1.0) + C : float, default=1.0 Penalty parameter C of the error term. - kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -1034,46 +1037,46 @@ class NuSVR(RegressorMixin, BaseLibSVM): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - shrinking : boolean, optional (default=True) + shrinking : bool, default=True Whether to use the shrinking heuristic. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance for stopping criterion. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. Attributes ---------- - support_ : array-like of shape (n_SV) + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like of shape (n_SV, n_features) + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - dual_coef_ : array, shape = [1, n_SV] + dual_coef_ : ndarray of shape (1, n_SV) Coefficients of the support vector in the decision function. - coef_ : array, shape = [1, n_features] + coef_ : ndarray of shape (1, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. `coef_` is readonly property derived from `dual_coef_` and `support_vectors_`. - intercept_ : array, shape = [1] + intercept_ : ndarray of shape (1,) Constants in decision function. Examples @@ -1128,18 +1131,18 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): Parameters ---------- - kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. 
If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -1149,52 +1152,52 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - tol : float, optional + tol : float, default=1e-3 Tolerance for stopping criterion. - nu : float, optional + nu : float, default=0.5 An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken. - shrinking : boolean, optional + shrinking : bool, default=True Whether to use the shrinking heuristic. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. Attributes ---------- - support_ : array-like of shape (n_SV) + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like of shape (n_SV, n_features) + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - dual_coef_ : array, shape = [1, n_SV] + dual_coef_ : ndarray of shape (1, n_SV) Coefficients of the support vectors in the decision function. - coef_ : array, shape = [1, n_features] + coef_ : ndarray of shape (1, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. `coef_` is readonly property derived from `dual_coef_` and `support_vectors_` - intercept_ : array, shape = [1,] + intercept_ : ndarray of shape (1,) Constant in the decision function. offset_ : float @@ -1229,16 +1232,15 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', random_state=None) def fit(self, X, y=None, sample_weight=None, **params): - """ - Detects the soft boundary of the set of samples X. + """Detects the soft boundary of the set of samples X. Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Set of samples, where n_samples is the number of samples and n_features is the number of features. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,), default=None Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. @@ -1266,11 +1268,12 @@ def decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. 
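The relation between ``score_samples``, ``decision_function`` and ``offset_`` encoded in this patch can be verified directly; a sketch with arbitrary data::

    import numpy as np
    from sklearn.svm import OneClassSVM

    X = np.random.RandomState(0).randn(50, 2)
    oc = OneClassSVM(kernel='rbf', nu=0.1, gamma='scale').fit(X)
    # score_samples is simply the decision function shifted by offset_.
    assert np.allclose(oc.score_samples(X),
                       oc.decision_function(X) + oc.offset_)
    print(np.unique(oc.predict(X)))  # values in {-1, +1}
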
Returns ------- - dec : array-like, shape (n_samples,) + dec : ndarray of shape (n_samples,) Returns the decision function of the samples. """ dec = self._decision_function(X).ravel() @@ -1281,30 +1284,31 @@ def score_samples(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - score_samples : array-like, shape (n_samples,) + score_samples : ndarray of shape (n_samples,) Returns the (unshifted) scoring function of the samples. """ return self.decision_function(X) + self.offset_ def predict(self, X): - """ - Perform classification on samples in X. + """Perform classification on samples in X. For a one-class model, +1 or -1 is returned. Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) For kernel="precomputed", the expected shape of X is - [n_samples_test, n_samples_train] + (n_samples_test, n_samples_train). Returns ------- - y_pred : array, shape (n_samples,) + y_pred : ndarray of shape (n_samples,) Class labels for samples in X. """ y = super().predict(X) From 50d3fe963c3d2823b2ba2a534b8a50dba620fc5e Mon Sep 17 00:00:00 2001 From: ngshya Date: Thu, 9 Jan 2020 19:12:25 +0100 Subject: [PATCH 198/448] FIX avoid division by 0 warning in LabelPropagation (#15946) --- sklearn/semi_supervised/_label_propagation.py | 1 + .../semi_supervised/tests/test_label_propagation.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 665b50dcfa507..a07b717d6f932 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -290,6 +290,7 @@ def fit(self, X, y): self.n_iter_ += 1 normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 self.label_distributions_ /= normalizer # set the transduction item diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index d983ab854948b..015f6fa191853 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -157,6 +157,18 @@ def test_convergence_warning(): assert_no_warnings(mdl.fit, X, y) +def test_label_propagation_non_zero_normalizer(): + # check that we don't divide by zero in case of null normalizer + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/15946 + X = np.array([[100., 100.], [100., 100.], [0., 0.], [0., 0.]]) + y = np.array([0, 1, -1, -1]) + mdl = label_propagation.LabelSpreading(kernel='knn', + max_iter=100, + n_neighbors=1) + assert_no_warnings(mdl.fit, X, y) + + def test_predict_sparse_callable_kernel(): # This is a non-regression test for #15866 From 5a02de89574dd7eca600b1f612ab148fd613c024 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 20:15:59 +0100 Subject: [PATCH 199/448] =?UTF-8?q?ENH=20Add=20verbose=20option=20to=20Vot?= =?UTF-8?q?ingClassifier=20and=20VotingRegre=E2=80=A6=20(#16069)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/whats_new/v0.23.rst | 10 +++++++++ sklearn/ensemble/_base.py | 8 ++++--- sklearn/ensemble/_voting.py | 32 ++++++++++++++++++++++----- sklearn/ensemble/tests/test_voting.py | 24 ++++++++++++++++++++ 4 files 
changed, 65 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 53c416c506614..b23a1fab704bf 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -67,6 +67,16 @@ Changelog `ValueError` for arguments `n_classes < 1` OR `length < 1`. :pr:`16006` by :user:`Rushabh Vasani `. +:mod:`sklearn.ensemble` +................................. + +- |API| Added boolean `verbose` flag to classes: + :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. + :pr:`15991` by :user:`Sam Bail `, + :user:`Hanna Bruce MacDonald `, + :user:`Reshama Shaikh `, and + :user:`Chiara Marmo `. + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index a7f018b94c54d..30a690c974d3a 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -15,12 +15,13 @@ from ..base import is_classifier, is_regressor from ..base import BaseEstimator from ..base import MetaEstimatorMixin -from ..utils import Bunch +from ..utils import Bunch, _print_elapsed_time from ..utils import check_random_state from ..utils.metaestimators import _BaseComposition -def _parallel_fit_estimator(estimator, X, y, sample_weight=None): +def _parallel_fit_estimator(estimator, X, y, sample_weight=None, + message_clsname=None, message=None): """Private function used to fit an estimator within a job.""" if sample_weight is not None: try: @@ -33,7 +34,8 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None): ) from exc raise else: - estimator.fit(X, y) + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y) return estimator diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 838d2440a9e4d..b52be0158473a 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -39,6 +39,11 @@ class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): instead. """ + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return '(%d of %d) Processing %s' % (idx, total, name) + @property def _weights_not_none(self): """Get the weights of not `None` estimators.""" @@ -63,9 +68,14 @@ def fit(self, X, y, sample_weight=None): % (len(self.weights), len(self.estimators))) self.estimators_ = Parallel(n_jobs=self.n_jobs)( - delayed(_parallel_fit_estimator)(clone(clf), X, y, - sample_weight=sample_weight) - for clf in clfs if clf not in (None, 'drop') + delayed(_parallel_fit_estimator)( + clone(clf), X, y, + sample_weight=sample_weight, + message_clsname='Voting', + message=self._log_message(names[idx], + idx + 1, len(clfs)) + ) + for idx, clf in enumerate(clfs) if clf not in (None, 'drop') ) self.named_estimators_ = Bunch() @@ -122,6 +132,10 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): flatten_transform=False, it returns (n_classifiers, n_samples, n_classes). + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. 
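Mirroring the new ``test_voting_verbose`` test further down, a sketch of the flag in use (data and estimators taken from that test)::

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression

    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])
    clf = VotingClassifier(
        estimators=[('lr', LogisticRegression(random_state=123)),
                    ('rf', RandomForestClassifier(random_state=123))],
        verbose=True)
    # Each base estimator logs a line such as
    # '[Voting] ............. (1 of 2) Processing lr, total=   0.0s'
    clf.fit(X, y)
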
+ Attributes ---------- estimators_ : list of classifiers @@ -176,13 +190,14 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): (6, 6) """ - def __init__(self, estimators, voting='hard', weights=None, n_jobs=None, - flatten_transform=True): + def __init__(self, estimators, voting='hard', weights=None, + n_jobs=None, flatten_transform=True, verbose=False): super().__init__(estimators=estimators) self.voting = voting self.weights = weights self.n_jobs = n_jobs self.flatten_transform = flatten_transform + self.verbose = verbose def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -346,6 +361,10 @@ class VotingRegressor(RegressorMixin, _BaseVoting): ``-1`` means using all processors. See :term:`Glossary ` for more details. + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. + Attributes ---------- estimators_ : list of regressors @@ -376,10 +395,11 @@ class VotingRegressor(RegressorMixin, _BaseVoting): [ 3.3 5.7 11.8 19.7 28. 40.3] """ - def __init__(self, estimators, weights=None, n_jobs=None): + def __init__(self, estimators, weights=None, n_jobs=None, verbose=False): super().__init__(estimators=estimators) self.weights = weights self.n_jobs = n_jobs + self.verbose = verbose def fit(self, X, y, sample_weight=None): """Fit the estimators. diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 8619296536964..61b106cbeedff 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -1,6 +1,7 @@ """Testing for the VotingClassifier and VotingRegressor""" import pytest +import re import numpy as np from sklearn.utils._testing import assert_almost_equal, assert_array_equal @@ -513,6 +514,29 @@ def test_check_estimators_voting_estimator(estimator): check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) +@pytest.mark.parametrize( + "estimator", + [VotingRegressor( + estimators=[('lr', LinearRegression()), + ('rf', RandomForestRegressor(random_state=123))], + verbose=True), + VotingClassifier( + estimators=[('lr', LogisticRegression(random_state=123)), + ('rf', RandomForestClassifier(random_state=123))], + verbose=True)] +) +def test_voting_verbose(estimator, capsys): + + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + pattern = (r'\[Voting\].*\(1 of 2\) Processing lr, total=.*\n' + r'\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$') + + estimator.fit(X, y) + assert re.match(pattern, capsys.readouterr()[0]) + + # TODO: Remove in 0.24 when None is removed in Voting* @pytest.mark.parametrize( "Voter, BaseEstimator", From a4ec379e81843a24ba8e1fd355b18950570cdbfa Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Thu, 9 Jan 2020 23:34:10 +0100 Subject: [PATCH 200/448] MNT remove unused variables in elkan algorithm (#16082) --- sklearn/cluster/_k_means_elkan.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 87d32d1e47858..640e6c00105bb 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -161,14 +161,12 @@ def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, labels, lower_bounds, upper_bounds, n_samples, n_features, n_clusters) cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8) - cdef np.uint8_t[:] points_to_update = np.zeros(n_samples, dtype=np.uint8) cdef np.ndarray[floating, ndim=2, mode='c'] new_centers if max_iter <= 0: raise 
ValueError('Number of iterations should be a positive number' ', got %d instead' % max_iter) - col_indices = np.arange(center_half_distances.shape[0], dtype=np.int) for iteration in range(max_iter): if verbose: print("start iteration") From 1c422ca736fc595148cbe606ffeb426755863b3c Mon Sep 17 00:00:00 2001 From: Madhura Jayaratne Date: Fri, 10 Jan 2020 14:03:58 +1100 Subject: [PATCH 201/448] DOC Update docs to indicate shuffle=True still maintains sample order within each split (#16085) --- sklearn/model_selection/_split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index bb0643e8c8edb..e03d9aa29b3ac 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -378,6 +378,7 @@ class KFold(_BaseKFold): shuffle : boolean, optional Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. random_state : int, RandomState instance or None, optional, default=None If int, random_state is the seed used by the random number generator; @@ -584,6 +585,7 @@ class StratifiedKFold(_BaseKFold): shuffle : boolean, optional Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. random_state : int, RandomState instance or None, optional, default=None If int, random_state is the seed used by the random number generator; From 63f2ccacb879a01a2a8a4e22db65ab5b7acd5683 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 10 Jan 2020 18:15:16 +0200 Subject: [PATCH 202/448] DOC Incorrectly-rendered entries in what's new (#16091) --- doc/whats_new/v0.23.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b23a1fab704bf..343414edece81 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -89,7 +89,7 @@ Changelog ............................... - |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. - :pr: `15503` by :user:`Sam Dixon` . + :pr:`15503` by :user:`Sam Dixon `. :mod:`sklearn.linear_model` ........................... @@ -126,7 +126,7 @@ Changelog - |Fix| :func: `cross_val_predict` supports `method="predict_proba"` when `y=None`. - :pr: `15918` by :user: `Luca Kubin `. + :pr:`15918` by :user:`Luca Kubin `. :mod:`sklearn.preprocessing` ............................ From 5609eb4d3ecd85253816b811c8b91da20282dc9e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 10 Jan 2020 17:50:19 +0100 Subject: [PATCH 203/448] DOC Rearrange status of the Consortium members (#16089) --- doc/about.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/about.rst b/doc/about.rst index 2008d96af0045..9926f62dcc824 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -139,7 +139,7 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. :target: https://www.axa.fr/ .. |bnp| image:: images/bnp.png - :width: 170pt + :width: 150pt :target: https://www.bnpparibascardif.com/ .. |fujitsu| image:: images/fujitsu.png @@ -175,15 +175,17 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. 
+---------+----------+ | | +---------+----------+ - | |axa| ||fujitsu| | + | |axa| | |bnp| | +---------+----------+ - | |bnp| | + ||fujitsu|| |intel| | +---------+----------+ - | |intel| | |nvidia| | + | | + +---------+----------+ + ||dataiku|| |nvidia| | +---------+----------+ | | +---------+----------+ - ||dataiku|| |inria| | + | |inria| | +---------+----------+ .. raw:: html From e3ee768a0c2c9cbec6b6a1ad095541fb34863ce0 Mon Sep 17 00:00:00 2001 From: Qizhi Jiang Date: Sun, 12 Jan 2020 01:42:10 +0800 Subject: [PATCH 204/448] DOC Docstring improvements to confusion_matrix and splitters (#16072) --- sklearn/metrics/_classification.py | 5 +- sklearn/model_selection/_search.py | 18 +++--- sklearn/model_selection/_split.py | 80 ++++++++++++-------------- sklearn/model_selection/_validation.py | 19 +++--- 4 files changed, 58 insertions(+), 64 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index cba7f2c2e8fc8..5e56816ab726f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -232,7 +232,10 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, Returns ------- C : ndarray of shape (n_classes, n_classes) - Confusion matrix. + Confusion matrix whose i-th row and j-th + column entry indicates the number of + samples with true label being i-th class + and prediced label being j-th class. References ---------- diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 934ec0df6b116..ede292c2b6261 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -204,13 +204,12 @@ class ParameterSampler: n_iter : integer Number of parameter settings that are produced. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Returns ------- @@ -1296,13 +1295,12 @@ class RandomizedSearchCV(BaseSearchCV): verbose : integer Controls the verbosity: the higher, the more messages. - random_state : int, RandomState instance or None, optional, default=None + random_state : int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e03d9aa29b3ac..6bab796fb54a7 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -380,12 +380,12 @@ class KFold(_BaseKFold): Whether to shuffle the data before splitting into batches. 
Note that the samples within each split will not be shuffled. - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Only used when ``shuffle`` is True. This should be left + random_state : int, RandomState instance or None, default=None + Only used when ``shuffle`` is True. This should be left to None if ``shuffle`` is False. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -587,12 +587,12 @@ class StratifiedKFold(_BaseKFold): Whether to shuffle each class's samples before splitting into batches. Note that the samples within each split will not be shuffled. - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Only used when ``shuffle`` is True. This should be left + random_state : int, RandomState instance or None, default=None + Only used when ``shuffle`` is True. This should be left to None if ``shuffle`` is False. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -1090,11 +1090,10 @@ class _RepeatedSplits(metaclass=ABCMeta): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. **cvargs : additional params Constructor parameters for cv. Must not contain random_state @@ -1195,11 +1194,9 @@ class RepeatedKFold(_RepeatedSplits): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls.See :term:`Glossary `. Examples -------- @@ -1249,9 +1246,10 @@ class RepeatedStratifiedKFold(_RepeatedSplits): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : None, int or RandomState, default=None - Random state to be used to generate random state for each - repetition. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -1389,11 +1387,10 @@ class ShuffleSplit(BaseShuffleSplit): int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. 
- random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -1491,11 +1488,10 @@ class GroupShuffleSplit(ShuffleSplit): int, represents the absolute number of train groups. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -1604,11 +1600,10 @@ class StratifiedShuffleSplit(BaseShuffleSplit): int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. Examples -------- @@ -2048,11 +2043,10 @@ def train_test_split(*arrays, **options): int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. shuffle : boolean, optional (default=True) Whether or not to shuffle the data before splitting. If shuffle=False diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index f841484ce8eb0..6bbe6e0c5ce95 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1003,11 +1003,10 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=0) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. verbose : integer, optional The verbosity level. 
@@ -1173,11 +1172,11 @@ def learning_curve(estimator, X, y, groups=None, Whether to shuffle training data before taking prefixes of it based on``train_sizes``. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``shuffle`` is True. + random_state : int, RandomState instance or None, default=None + Used when ``shuffle`` is True. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. From 073de7c6e56eb7fc4f1a4fac5f66ef94fea58d78 Mon Sep 17 00:00:00 2001 From: Madhura Jayaratne Date: Sun, 12 Jan 2020 11:40:25 +1100 Subject: [PATCH 205/448] FIX Fix string formatting in the error message in CategoricalNB (#16090) --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/naive_bayes.py | 2 +- sklearn/tests/test_naive_bayes.py | 5 +++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 343414edece81..6d398ceef3cbd 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -128,6 +128,14 @@ Changelog when `y=None`. :pr:`15918` by :user:`Luca Kubin `. +:mod:`sklearn.naive_bayes` +............................. + +- |Fix| A correctly formatted error message is shown in + :class:`naive_bayes.CategoricalNB` when the number of features in the input + differs between `predict` and `fit`. + :pr:`16090` by :user:`Madhura Jayaratne `. + :mod:`sklearn.preprocessing` ............................ 
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 0b8ec518d2f28..dd1d9586db6e1 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1205,7 +1205,7 @@ def _update_feature_log_prob(self, alpha): def _joint_log_likelihood(self, X): if not X.shape[1] == self.n_features_: raise ValueError("Expected input with %d features, got %d instead" - .format(self.n_features_, X.shape[1])) + % (self.n_features_, X.shape[1])) jll = np.zeros((X.shape[0], self.class_count_.shape[0])) for i in range(self.n_features_): indices = X[:, i] diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 1c00438eb8ab9..b4470a7ed49e5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -667,6 +667,11 @@ def test_categoricalnb(): assert_raise_message(ValueError, error_msg, clf.predict, X) assert_raise_message(ValueError, error_msg, clf.fit, X, y) + # Check error is raised for incorrect X + X = np.array([[1, 4, 1], [2, 5, 6]]) + msg = "Expected input with 2 features, got 3 instead" + assert_raise_message(ValueError, msg, clf.predict, X) + # Test alpha X3_test = np.array([[2, 5]]) # alpha=1 increases the count of all categories by one so the final From c643a40a494b9a24bcf116cb957c42395459adbd Mon Sep 17 00:00:00 2001 From: ELNS <57490926+EverLookNeverSee@users.noreply.github.com> Date: Sun, 12 Jan 2020 07:24:07 +0330 Subject: [PATCH 206/448] DOC Corrects random_state default value in FactorAnalysis (#16098) --- sklearn/decomposition/_factor_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index f9bb249c8a057..21bf89ae056d8 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -89,7 +89,7 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): Number of iterations for the power method. 3 by default. Only used if ``svd_method`` equals 'randomized' - random_state : int, RandomState instance, default=None + random_state : int, RandomState instance, default=0 Only used when ``svd_method`` equals 'randomized'. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. 
From e75490380338e89598ad77bede8360ee3dad6545 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 12 Jan 2020 04:55:23 -0500 Subject: [PATCH 207/448] MNT Uploads coverage in pylatest_pip_openblas_pandas CI (#16092) --- build_tools/azure/posix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index f5c4a023b4c39..195e95051f1e1 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -40,7 +40,7 @@ jobs: - script: | build_tools/azure/test_pytest_soft_dependency.sh displayName: 'Test Soft Dependency' - condition: and(eq(variables['CHECK_PYTEST_SOFT_DEPENDENCY'], 'true'), eq(variables['DISTRIB'], 'conda')) + condition: eq(variables['CHECK_PYTEST_SOFT_DEPENDENCY'], 'true') - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' @@ -49,7 +49,7 @@ jobs: condition: succeededOrFailed() - script: | build_tools/azure/upload_codecov.sh - condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), eq(variables['DISTRIB'], 'conda')) + condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) From 133d1ce6d2ae88165dfe9c0e3d81c48e276cc493 Mon Sep 17 00:00:00 2001 From: Samesh Lakhotia <43701530+sameshl@users.noreply.github.com> Date: Sun, 12 Jan 2020 17:10:40 +0530 Subject: [PATCH 208/448] DOC Add link for detail explanation in glossary/estimato-tags (#14616) --- doc/glossary.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/glossary.rst b/doc/glossary.rst index e259fa69745bc..70dd230c7ecd3 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -374,6 +374,8 @@ General Concepts the data needs to be indexed on both axes, while other data is indexed only on the first axis. + For more detailed info, see :ref:`estimator_tags`. + feature features feature vector From dd93e0106162983421975faed17800cb83463680 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Mon, 13 Jan 2020 11:18:03 +0100 Subject: [PATCH 209/448] DOC Add missing backquote in LogisticRegression docstring (#16106) --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 41c06de1b9df2..4b541884eece8 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1157,7 +1157,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only - used if ``penalty='elasticnet'`. Setting ``l1_ratio=0`` is equivalent + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. 
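Editor's note (not part of the patch series): the ``l1_ratio`` parameter whose docstring is touched in the hunk above controls the Elastic-Net mix between L1 and L2 penalties. A minimal usage sketch, assuming the ``'saga'`` solver (the only solver that accepts ``penalty='elasticnet'``) and an arbitrary synthetic dataset::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # Illustrative data only; any binary classification dataset works here.
    X, y = make_classification(n_samples=200, n_features=20, random_state=0)

    # l1_ratio=0.0 behaves like penalty='l2', l1_ratio=1.0 like penalty='l1';
    # intermediate values combine both penalties.
    clf = LogisticRegression(penalty='elasticnet', solver='saga',
                             l1_ratio=0.5, max_iter=5000)
    clf.fit(X, y)
    print(clf.score(X, y))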
From 594fc8502251b31c3d028fdefc125b8a94f5c4b1 Mon Sep 17 00:00:00 2001 From: SHUBH CHATTERJEE Date: Mon, 13 Jan 2020 22:40:32 +1100 Subject: [PATCH 210/448] DOC fix docstring in semi_supervised module following doc guideline (#16042) --- sklearn/semi_supervised/_label_propagation.py | 114 +++++++++--------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index a07b717d6f932..d40b5008db33c 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -76,29 +76,29 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): Parameters ---------- - kernel : {'knn', 'rbf', callable} + kernel : {'knn', 'rbf'} or callable, default='rbf' String identifier for kernel function to use or the kernel function itself. Only 'rbf' and 'knn' strings are valid inputs. The function - passed should take two inputs, each of shape [n_samples, n_features], - and return a [n_samples, n_samples] shaped weight matrix + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. - gamma : float - Parameter for rbf kernel + gamma : float, default=20 + Parameter for rbf kernel. - n_neighbors : integer > 0 - Parameter for knn kernel + n_neighbors : int, default=7 + Parameter for knn kernel. Need to be strictly positive. - alpha : float - Clamping factor + alpha : float, default=1.0 + Clamping factor. - max_iter : integer - Change maximum number of iterations allowed + max_iter : int, default=30 + Change maximum number of iterations allowed. - tol : float + tol : float, default=1e-3 Convergence tolerance: threshold to consider the system at steady - state + state. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -158,11 +158,12 @@ def predict(self, X): Parameters ---------- X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - y : array_like, shape = [n_samples] - Predictions for input data + y : ndarray of shape (n_samples,) + Predictions for input data. """ probas = self.predict_proba(X) return self.classes_[np.argmax(probas, axis=1)].ravel() @@ -177,12 +178,13 @@ def predict_proba(self, X): Parameters ---------- X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - probabilities : array, shape = [n_samples, n_classes] + probabilities : ndarray of shape (n_samples, n_classes) Normalized probability distributions across - class labels + class labels. """ check_is_fitted(self) @@ -211,15 +213,15 @@ def fit(self, X, y): Parameters ---------- X : array-like of shape (n_samples, n_features) - A {n_samples by n_samples} size matrix will be created from this + A matrix of shape (n_samples, n_samples) will be created from this. - y : array_like, shape = [n_samples] - n_labeled_samples (unlabeled points are marked as -1) - All unlabeled samples will be transductively assigned labels + y : array-like of shape (n_samples,) + `n_labeled_samples` (unlabeled points are marked as -1) + All unlabeled samples will be transductively assigned labels. Returns ------- - self : returns an instance of self. 
+ self : object """ X, y = check_X_y(X, y) self.X_ = X @@ -307,26 +309,26 @@ class LabelPropagation(BaseLabelPropagation): Parameters ---------- - kernel : {'knn', 'rbf', callable} + kernel : {'knn', 'rbf'} or callable, default='rbf' String identifier for kernel function to use or the kernel function itself. Only 'rbf' and 'knn' strings are valid inputs. The function - passed should take two inputs, each of shape [n_samples, n_features], - and return a [n_samples, n_samples] shaped weight matrix. + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. - gamma : float - Parameter for rbf kernel + gamma : float, default=20 + Parameter for rbf kernel. - n_neighbors : integer > 0 - Parameter for knn kernel + n_neighbors : int, default=7 + Parameter for knn kernel which need to be strictly positive. - max_iter : integer - Change maximum number of iterations allowed + max_iter : int, default=1000 + Change maximum number of iterations allowed. - tol : float + tol : float, 1e-3 Convergence tolerance: threshold to consider the system at steady - state + state. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -334,16 +336,16 @@ class LabelPropagation(BaseLabelPropagation): Attributes ---------- - X_ : array, shape = [n_samples, n_features] + X_ : ndarray of shape (n_samples, n_features) Input array. - classes_ : array, shape = [n_classes] + classes_ : ndarray of shape (n_classes,) The distinct labels used in classifying instances. - label_distributions_ : array, shape = [n_samples, n_classes] + label_distributions_ : ndarray of shape (n_samples, n_classes) Categorical distribution for each item. - transduction_ : array, shape = [n_samples] + transduction_ : ndarray of shape (n_samples) Label assigned to each item via the transduction. n_iter_ : int @@ -413,33 +415,33 @@ class LabelSpreading(BaseLabelPropagation): Parameters ---------- - kernel : {'knn', 'rbf', callable} + kernel : {'knn', 'rbf'} or callable, default='rbf' String identifier for kernel function to use or the kernel function itself. Only 'rbf' and 'knn' strings are valid inputs. The function - passed should take two inputs, each of shape [n_samples, n_features], - and return a [n_samples, n_samples] shaped weight matrix + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. - gamma : float - parameter for rbf kernel + gamma : float, default=20 + Parameter for rbf kernel. - n_neighbors : integer > 0 - parameter for knn kernel + n_neighbors : int, default=7 + Parameter for knn kernel which is a strictly positive integer. - alpha : float + alpha : float, default=0.2 Clamping factor. A value in (0, 1) that specifies the relative amount that an instance should adopt the information from its neighbors as opposed to its initial label. alpha=0 means keeping the initial label information; alpha=1 means replacing all initial information. - max_iter : integer - maximum number of iterations allowed + max_iter : int, default=30 + Maximum number of iterations allowed. - tol : float + tol : float, default=1e-3 Convergence tolerance: threshold to consider the system at steady - state + state. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -447,16 +449,16 @@ class LabelSpreading(BaseLabelPropagation): Attributes ---------- - X_ : array, shape = [n_samples, n_features] + X_ : ndarray of shape (n_samples, n_features) Input array. - classes_ : array, shape = [n_classes] + classes_ : ndarray of shape (n_classes,) The distinct labels used in classifying instances. - label_distributions_ : array, shape = [n_samples, n_classes] + label_distributions_ : ndarray of shape (n_samples, n_classes) Categorical distribution for each item. - transduction_ : array, shape = [n_samples] + transduction_ : ndarray of shape (n_samples,) Label assigned to each item via the transduction. n_iter_ : int From 9355b3c5d963ef54d47eea044e4e001f2d9d210b Mon Sep 17 00:00:00 2001 From: Madhura Jayaratne Date: Mon, 13 Jan 2020 23:23:27 +1100 Subject: [PATCH 211/448] DOC Add Gitter details and remove IRC details (#16107) --- doc/support.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/support.rst b/doc/support.rst index 5dd52c01030f0..75f9f4cee9a6d 100644 --- a/doc/support.rst +++ b/doc/support.rst @@ -73,16 +73,13 @@ Note: gists are git cloneable repositories and thus you can use git to push datafiles to them. -.. _irc: +.. _gitter: -IRC +Gitter === -Some developers like to hang out on channel ``#scikit-learn`` on -``irc.freenode.net``. - -If you do not have an IRC client or are behind a firewall this web -client works fine: https://webchat.freenode.net +Some developers like to hang out on scikit-learn Gitter room: +https://gitter.im/scikit-learn/scikit-learn. .. _documentation_resources: From 4953ec36a41694d38f8d92fd48c7049cdd82e0ed Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Jan 2020 13:43:50 +0100 Subject: [PATCH 212/448] MNT remove more deprecations for 0.23 (#15860) * removed warn_on_dtype * removed parameters to check_is_fitted * all_estimators parameters * deprecated n_components attribute in AgglomerativeClustering * change default of base.score for multioutput * removed lots of useless decorators? 
* changed default of copy in quantil_transform * removed six.py * nmf default value of init param * raise error instead of warning in LinearDiscriminantAnalysis * removed label param in hamming_loss * updated method parameter of power_transform * pep8 * changed default value of min_impurity_split * removed assert_false and assert_true * added and fixed versionchanged directives * reset min_impurity_split default to None * fixed LDA issue * fixed some test * more docstrings updates * set min_impurity_decrease for test to pass * upate docstring example * fixed doctest * removed multiouput.score since it's now consistent with the default * deprecate least_angle parameter combination * remove support for l1 or l2 loss in svm * removed linear_assignment.py * add test --- sklearn/linear_model/_least_angle.py | 18 +- .../linear_model/tests/test_least_angle.py | 9 +- sklearn/multioutput.py | 38 --- sklearn/svm/_classes.py | 26 -- sklearn/svm/tests/test_svm.py | 33 -- sklearn/utils/linear_assignment_.py | 284 ------------------ sklearn/utils/tests/test_linear_assignment.py | 64 ---- 7 files changed, 12 insertions(+), 460 deletions(-) delete mode 100644 sklearn/utils/linear_assignment_.py delete mode 100644 sklearn/utils/tests/test_linear_assignment.py diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index b0be830eb76c6..0a9a67844a3f3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -47,12 +47,6 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, Input data. Note that if X is None then the Gram matrix must be specified, i.e., cannot be None or False. - .. deprecated:: 0.21 - - The use of ``X`` is ``None`` in combination with ``Gram`` is not - ``None`` will be removed in v0.23. Use :func:`lars_path_gram` - instead. - y : None or array-like of shape (n_samples,) Input targets. @@ -67,11 +61,6 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, matrix is precomputed from the given X, if there are more samples than features. - .. deprecated:: 0.21 - - The use of ``X`` is ``None`` in combination with ``Gram`` is not - None will be removed in v0.23. Use :func:`lars_path_gram` instead. - max_iter : int, default=500 Maximum number of iterations to perform, set to infinity for no limit. @@ -155,9 +144,10 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, """ if X is None and Gram is not None: - warnings.warn('Use lars_path_gram to avoid passing X and y. ' - 'The current option will be removed in v0.23.', - FutureWarning) + raise ValueError( + 'X cannot be None if Gram is not None' + 'Use lars_path_gram to avoid passing X and y.' 
+ ) return _lars_path_solver( X=X, y=y, Xy=Xy, Gram=Gram, n_samples=None, max_iter=max_iter, alpha_min=alpha_min, method=method, copy_X=copy_X, diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 2b7ed5a83b8d8..6e7c1fb37096a 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -15,7 +15,8 @@ from sklearn.utils._testing import TempMemmap from sklearn.exceptions import ConvergenceWarning from sklearn import linear_model, datasets -from sklearn.linear_model._least_angle import _lars_path_residues, LassoLarsIC +from sklearn.linear_model._least_angle import _lars_path_residues +from sklearn.linear_model import LassoLarsIC, lars_path # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -730,3 +731,9 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): y = X[:, 2] lasso_lars.fit(X, y, copy_X=copy_X) assert copy_X == np.array_equal(X, X_copy) + + +def test_X_none_gram_not_none(): + with pytest.raises(ValueError, + match="X cannot be None if Gram is not None"): + lars_path(X=None, y=[1], Gram='not None') diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 945957cdc12a7..90e393e19503a 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -265,44 +265,6 @@ def partial_fit(self, X, y, sample_weight=None): super().partial_fit( X, y, sample_weight=sample_weight) - # XXX Remove this method in 0.23 - def score(self, X, y, sample_weight=None): - """Returns the coefficient of determination R^2 of the prediction. - - The coefficient R^2 is defined as (1 - u/v), where u is the residual - sum of squares ((y_true - y_pred) ** 2).sum() and v is the regression - sum of squares ((y_true - y_true.mean()) ** 2).sum(). - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). A constant model that always - predicts the expected value of y, disregarding the input features, - would get a R^2 score of 0.0. - - Notes - ----- - R^2 is calculated by weighting all the targets equally using - `multioutput='uniform_average'`. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Test samples. - - y : array-like, shape (n_samples) or (n_samples, n_outputs) - True values for X. - - sample_weight : array-like, shape [n_samples], optional - Sample weights. - - Returns - ------- - score : float - R^2 of self.predict(X) wrt. y. - """ - # XXX remove in 0.19 when r2_score default for multioutput changes - from .metrics import r2_score - return r2_score(y, self.predict(X), sample_weight=sample_weight, - multioutput='uniform_average') - class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): """Multi target classification diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index c8a27a12e72e5..698acb6ae68b3 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -214,18 +214,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - # FIXME Remove l1/l2 support in 0.23 ---------------------------------- - msg = ("loss='%s' has been deprecated in favor of " - "loss='%s' as of 0.16. 
Backward compatibility" - " for the loss='%s' will be removed in %s") - - if self.loss in ('l1', 'l2'): - old_loss = self.loss - self.loss = {'l1': 'hinge', 'l2': 'squared_hinge'}.get(self.loss) - warnings.warn(msg % (old_loss, self.loss, old_loss, '0.23'), - FutureWarning) - # --------------------------------------------------------------------- - if self.C < 0: raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) @@ -406,20 +394,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - # FIXME Remove l1/l2 support in 0.23 ---------------------------------- - msg = ("loss='%s' has been deprecated in favor of " - "loss='%s' as of 0.16. Backward compatibility" - " for the loss='%s' will be removed in %s") - - if self.loss in ('l1', 'l2'): - old_loss = self.loss - self.loss = {'l1': 'epsilon_insensitive', - 'l2': 'squared_epsilon_insensitive' - }.get(self.loss) - warnings.warn(msg % (old_loss, self.loss, old_loss, '0.23'), - FutureWarning) - # --------------------------------------------------------------------- - if self.C < 0: raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index b38a4697577a3..d789be7f26383 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -736,39 +736,6 @@ def test_linearsvc_parameters(): svm.LinearSVC(loss="l3").fit(X, y) -# FIXME remove in 0.23 -def test_linearsvx_loss_penalty_deprecations(): - X, y = [[0.0], [1.0]], [0, 1] - - msg = ("loss='%s' has been deprecated in favor of " - "loss='%s' as of 0.16. Backward compatibility" - " for the %s will be removed in %s") - - # LinearSVC - # loss l1 --> hinge - assert_warns_message(FutureWarning, - msg % ("l1", "hinge", "loss='l1'", "0.23"), - svm.LinearSVC(loss="l1").fit, X, y) - - # loss l2 --> squared_hinge - assert_warns_message(FutureWarning, - msg % ("l2", "squared_hinge", "loss='l2'", "0.23"), - svm.LinearSVC(loss="l2").fit, X, y) - - # LinearSVR - # loss l1 --> epsilon_insensitive - assert_warns_message(FutureWarning, - msg % ("l1", "epsilon_insensitive", "loss='l1'", - "0.23"), - svm.LinearSVR(loss="l1").fit, X, y) - - # loss l2 --> squared_epsilon_insensitive - assert_warns_message(FutureWarning, - msg % ("l2", "squared_epsilon_insensitive", - "loss='l2'", "0.23"), - svm.LinearSVR(loss="l2").fit, X, y) - - def test_linear_svx_uppercase_loss_penality_raises_error(): # Check if Upper case notation raises error at _fit_liblinear # which is called by fit diff --git a/sklearn/utils/linear_assignment_.py b/sklearn/utils/linear_assignment_.py deleted file mode 100644 index b396d90fb27bc..0000000000000 --- a/sklearn/utils/linear_assignment_.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -Solve the unique lowest-cost assignment problem using the -Hungarian algorithm (also known as Munkres algorithm). - -""" -# Based on original code by Brain Clapper, adapted to NumPy by Gael Varoquaux. -# Heavily refactored by Lars Buitinck. - -# Copyright (c) 2008 Brian M. Clapper , Gael Varoquaux -# Author: Brian M. Clapper, Gael Varoquaux -# LICENSE: BSD - -import numpy as np -import warnings - - -# Deprecation warning for module -warnings.warn( - "The linear_assignment_ module is deprecated in 0.21 " - "and will be removed from 0.23. Use " - "scipy.optimize.linear_sum_assignment instead.", - FutureWarning) - - -def linear_assignment(X): - """Solve the linear assignment problem using the Hungarian algorithm. 
- - The problem is also known as maximum weight matching in bipartite graphs. - The method is also known as the Munkres or Kuhn-Munkres algorithm. - - Parameters - ---------- - X : array - The cost matrix of the bipartite graph - - Returns - ------- - indices : array - The pairs of (row, col) indices in the original array giving - the original ordering. - - References - ---------- - - 1. http://www.public.iastate.edu/~ddoty/HungarianAlgorithm.html - - 2. Harold W. Kuhn. The Hungarian Method for the assignment problem. - *Naval Research Logistics Quarterly*, 2:83-97, 1955. - - 3. Harold W. Kuhn. Variants of the Hungarian method for assignment - problems. *Naval Research Logistics Quarterly*, 3: 253-258, 1956. - - 4. Munkres, J. Algorithms for the Assignment and Transportation Problems. - *Journal of the Society of Industrial and Applied Mathematics*, - 5(1):32-38, March, 1957. - - 5. https://en.wikipedia.org/wiki/Hungarian_algorithm - """ - indices = _hungarian(X).tolist() - indices.sort() - # Re-force dtype to ints in case of empty list - indices = np.array(indices, dtype=int) - # Make sure the array is 2D with 2 columns. - # This is needed when dealing with an empty list - indices.shape = (-1, 2) - return indices - - -class _HungarianState: - """State of one execution of the Hungarian algorithm. - - Parameters - ---------- - cost_matrix : 2D matrix - The cost matrix. Does not have to be square. - """ - - def __init__(self, cost_matrix): - cost_matrix = np.atleast_2d(cost_matrix) - - # If there are more rows (n) than columns (m), then the algorithm - # will not be able to work correctly. Therefore, we - # transpose the cost function when needed. Just have to - # remember to swap the result columns back later. - transposed = (cost_matrix.shape[1] < cost_matrix.shape[0]) - if transposed: - self.C = (cost_matrix.T).copy() - else: - self.C = cost_matrix.copy() - self.transposed = transposed - - # At this point, m >= n. - n, m = self.C.shape - self.row_uncovered = np.ones(n, dtype=np.bool) - self.col_uncovered = np.ones(m, dtype=np.bool) - self.Z0_r = 0 - self.Z0_c = 0 - self.path = np.zeros((n + m, 2), dtype=int) - self.marked = np.zeros((n, m), dtype=int) - - def _clear_covers(self): - """Clear all covered matrix cells""" - self.row_uncovered[:] = True - self.col_uncovered[:] = True - - -def _hungarian(cost_matrix): - """The Hungarian algorithm. - - Calculate the Munkres solution to the classical assignment problem and - return the indices for the lowest-cost pairings. - - Parameters - ---------- - cost_matrix : 2D matrix - The cost matrix. Does not have to be square. - - Returns - ------- - indices : 2D array of indices - The pairs of (row, col) indices in the original array giving - the original ordering. - """ - warnings.warn( - "The linear_assignment function is deprecated in 0.21 " - "and will be removed from 0.23. Use " - "scipy.optimize.linear_sum_assignment instead.", - FutureWarning) - - state = _HungarianState(cost_matrix) - - # No need to bother with assignments if one of the dimensions - # of the cost matrix is zero-length. - step = None if 0 in cost_matrix.shape else _step1 - - while step is not None: - step = step(state) - - # Look for the starred columns - results = np.array(np.where(state.marked == 1)).T - - # We need to swap the columns because we originally - # did a transpose on the input cost matrix. 
- if state.transposed: - results = results[:, ::-1] - - return results - - -# Individual steps of the algorithm follow, as a state machine: they return -# the next step to be taken (function to be called), if any. - -def _step1(state): - """Steps 1 and 2 in the Wikipedia page.""" - - # Step1: For each row of the matrix, find the smallest element and - # subtract it from every element in its row. - state.C -= state.C.min(axis=1)[:, np.newaxis] - # Step2: Find a zero (Z) in the resulting matrix. If there is no - # starred zero in its row or column, star Z. Repeat for each element - # in the matrix. - for i, j in zip(*np.where(state.C == 0)): - if state.col_uncovered[j] and state.row_uncovered[i]: - state.marked[i, j] = 1 - state.col_uncovered[j] = False - state.row_uncovered[i] = False - - state._clear_covers() - return _step3 - - -def _step3(state): - """ - Cover each column containing a starred zero. If n columns are covered, - the starred zeros describe a complete set of unique assignments. - In this case, Go to DONE, otherwise, Go to Step 4. - """ - marked = (state.marked == 1) - state.col_uncovered[np.any(marked, axis=0)] = False - - if marked.sum() < state.C.shape[0]: - return _step4 - - -def _step4(state): - """ - Find a noncovered zero and prime it. If there is no starred zero - in the row containing this primed zero, Go to Step 5. Otherwise, - cover this row and uncover the column containing the starred - zero. Continue in this manner until there are no uncovered zeros - left. Save the smallest uncovered value and Go to Step 6. - """ - # We convert to int as numpy operations are faster on int - C = (state.C == 0).astype(np.int) - covered_C = C * state.row_uncovered[:, np.newaxis] - covered_C *= state.col_uncovered.astype(dtype=np.int, copy=False) - n = state.C.shape[0] - m = state.C.shape[1] - while True: - # Find an uncovered zero - row, col = np.unravel_index(np.argmax(covered_C), (n, m)) - if covered_C[row, col] == 0: - return _step6 - else: - state.marked[row, col] = 2 - # Find the first starred element in the row - star_col = np.argmax(state.marked[row] == 1) - if not state.marked[row, star_col] == 1: - # Could not find one - state.Z0_r = row - state.Z0_c = col - return _step5 - else: - col = star_col - state.row_uncovered[row] = False - state.col_uncovered[col] = True - covered_C[:, col] = C[:, col] * ( - state.row_uncovered.astype(dtype=np.int, copy=False)) - covered_C[row] = 0 - - -def _step5(state): - """ - Construct a series of alternating primed and starred zeros as follows. - Let Z0 represent the uncovered primed zero found in Step 4. - Let Z1 denote the starred zero in the column of Z0 (if any). - Let Z2 denote the primed zero in the row of Z1 (there will always be one). - Continue until the series terminates at a primed zero that has no starred - zero in its column. Unstar each starred zero of the series, star each - primed zero of the series, erase all primes and uncover every line in the - matrix. Return to Step 3 - """ - count = 0 - path = state.path - path[count, 0] = state.Z0_r - path[count, 1] = state.Z0_c - - while True: - # Find the first starred element in the col defined by - # the path. 
- row = np.argmax(state.marked[:, path[count, 1]] == 1) - if not state.marked[row, path[count, 1]] == 1: - # Could not find one - break - else: - count += 1 - path[count, 0] = row - path[count, 1] = path[count - 1, 1] - - # Find the first prime element in the row defined by the - # first path step - col = np.argmax(state.marked[path[count, 0]] == 2) - if state.marked[row, col] != 2: - col = -1 - count += 1 - path[count, 0] = path[count - 1, 0] - path[count, 1] = col - - # Convert paths - for i in range(count + 1): - if state.marked[path[i, 0], path[i, 1]] == 1: - state.marked[path[i, 0], path[i, 1]] = 0 - else: - state.marked[path[i, 0], path[i, 1]] = 1 - - state._clear_covers() - # Erase all prime markings - state.marked[state.marked == 2] = 0 - return _step3 - - -def _step6(state): - """ - Add the value found in Step 4 to every element of each covered row, - and subtract it from every element of each uncovered column. - Return to Step 4 without altering any stars, primes, or covered lines. - """ - # the smallest uncovered value in the matrix - if np.any(state.row_uncovered) and np.any(state.col_uncovered): - minval = np.min(state.C[state.row_uncovered], axis=0) - minval = np.min(minval[state.col_uncovered]) - state.C[np.logical_not(state.row_uncovered)] += minval - state.C[:, state.col_uncovered] -= minval - return _step4 diff --git a/sklearn/utils/tests/test_linear_assignment.py b/sklearn/utils/tests/test_linear_assignment.py deleted file mode 100644 index 2f9399e68606c..0000000000000 --- a/sklearn/utils/tests/test_linear_assignment.py +++ /dev/null @@ -1,64 +0,0 @@ -# Author: Brian M. Clapper, G Varoquaux -# License: BSD - -# TODO #0.23: Remove this test module as the methods being tested -# have been replaced by SciPy methods - -import numpy as np -import pytest - - -@pytest.mark.filterwarnings( - "ignore::FutureWarning") -def test_hungarian(): - from sklearn.utils.linear_assignment_ import _hungarian - matrices = [ - # Square - ([[400, 150, 400], - [400, 450, 600], - [300, 225, 300]], - 850 # expected cost - ), - - # Rectangular variant - ([[400, 150, 400, 1], - [400, 450, 600, 2], - [300, 225, 300, 3]], - 452 # expected cost - ), - - # Square - ([[10, 10, 8], - [9, 8, 1], - [9, 7, 4]], - 18 - ), - - # Rectangular variant - ([[10, 10, 8, 11], - [9, 8, 1, 1], - [9, 7, 4, 10]], - 15 - ), - - # n == 2, m == 0 matrix - ([[], []], - 0 - ), - ] - - for cost_matrix, expected_total in matrices: - cost_matrix = np.array(cost_matrix) - indexes = _hungarian(cost_matrix) - total_cost = 0 - for r, c in indexes: - x = cost_matrix[r, c] - total_cost += x - assert expected_total == total_cost - - indexes = _hungarian(cost_matrix.T) - total_cost = 0 - for c, r in indexes: - x = cost_matrix[r, c] - total_cost += x - assert expected_total == total_cost From 19479d7af1711f1bb403eca1c02eebf212999091 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 13 Jan 2020 07:55:51 -0500 Subject: [PATCH 213/448] DOC Updates kernel function to be consistent (#16099) --- doc/modules/density.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/density.rst b/doc/modules/density.rst index b6dbe5e49abbb..1070b9fbf9f1b 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -93,7 +93,7 @@ Given this kernel form, the density estimate at a point :math:`y` within a group of points :math:`x_i; i=1\cdots N` is given by: .. 
math:: - \rho_K(y) = \sum_{i=1}^{N} K((y - x_i) / h) + \rho_K(y) = \sum_{i=1}^{N} K(y - x_i; h) The bandwidth here acts as a smoothing parameter, controlling the tradeoff between bias and variance in the result. A large bandwidth leads to a very From d2febf2c3214609cbc7d53e094fd7d6e483dbc25 Mon Sep 17 00:00:00 2001 From: meyer89 Date: Mon, 13 Jan 2020 16:35:39 +0100 Subject: [PATCH 214/448] ENH Reduce memory footprint in MLP when shuffle activated (#14075) --- .../neural_network/_multilayer_perceptron.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 51af0e33139dd..9cc66bedb46ce 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -21,6 +21,7 @@ from ..preprocessing import LabelBinarizer from ..utils import gen_batches, check_random_state from ..utils import shuffle +from ..utils import _safe_indexing from ..utils import check_array, check_X_y, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot @@ -503,6 +504,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, y_val = None n_samples = X.shape[0] + sample_idx = np.arange(n_samples, dtype=int) if self.batch_size == 'auto': batch_size = min(200, n_samples) @@ -512,12 +514,24 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, try: for it in range(self.max_iter): if self.shuffle: - X, y = shuffle(X, y, random_state=self._random_state) + # Only shuffle the sample indices instead of X and y to + # reduce the memory footprint. These indices will be used + # to slice the X and y. + sample_idx = shuffle(sample_idx, + random_state=self._random_state) + accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): - activations[0] = X[batch_slice] + if self.shuffle: + X_batch = _safe_indexing(X, sample_idx[batch_slice]) + y_batch = y[sample_idx[batch_slice]] + else: + X_batch = X[batch_slice] + y_batch = y[batch_slice] + + activations[0] = X_batch batch_loss, coef_grads, intercept_grads = self._backprop( - X[batch_slice], y[batch_slice], activations, deltas, + X_batch, y_batch, activations, deltas, coef_grads, intercept_grads) accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start) @@ -664,7 +678,7 @@ def _predict(self, X): y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. 
""" - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc']) # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes @@ -928,7 +942,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1336,7 +1350,7 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) From 8e61534f1087703f476414d8dbd3688282f8eebf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 13 Jan 2020 18:19:14 +0100 Subject: [PATCH 215/448] MNT create issue templates with automatic tagging (#16053) --- .github/ISSUE_TEMPLATE/blank_template.md | 10 ++++ .github/ISSUE_TEMPLATE/bug_report.md | 69 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/doc_improvement.md | 20 +++++++ .github/ISSUE_TEMPLATE/feature_request.md | 22 ++++++++ .github/ISSUE_TEMPLATE/usage_question.md | 20 +++++++ 5 files changed, 141 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/blank_template.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/doc_improvement.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/ISSUE_TEMPLATE/usage_question.md diff --git a/.github/ISSUE_TEMPLATE/blank_template.md b/.github/ISSUE_TEMPLATE/blank_template.md new file mode 100644 index 0000000000000..d46ae9e50b18f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/blank_template.md @@ -0,0 +1,10 @@ +--- +name: Other +about: For all other issues to reach the community... +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000000..9144526ec8185 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,69 @@ +--- +name: Bug report +about: Create a report to help us reproduce and correct the bug +title: '' +labels: Bug +assignees: '' + +--- + +Before submitting a bug, please make sure the issue hasn't been already +addressed by searching through the past issues. + +#### Describe the bug + + +#### Steps/Code to Reproduce + + +``` +Sample code to reproduce the problem +``` + +#### Expected Results + + +#### Actual Results + + +#### Versions + + + + diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md new file mode 100644 index 0000000000000..0cbf3fdb4963c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.md @@ -0,0 +1,20 @@ +--- +name: Documentation improvement +about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. 
+title: '' +labels: Documentation, help wanted +assignees: '' + +--- + +#### Describe the issue linked to the documentation + + + +#### Suggest a potential alternative/fix + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000..b2ff110d69a04 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,22 @@ +--- +name: Feature request +about: Suggest a new algorithm, enhancement to an existing algorithm, etc. +title: '' +labels: New Feature +assignees: '' + +--- + + + +#### Describe the workflow you want to enable + +#### Describe your proposed solution + +#### Describe alternatives you've considered, if relevant + +#### Additional context diff --git a/.github/ISSUE_TEMPLATE/usage_question.md b/.github/ISSUE_TEMPLATE/usage_question.md new file mode 100644 index 0000000000000..1b0dd8ef8340a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/usage_question.md @@ -0,0 +1,20 @@ +--- +name: Usage question +about: If you have a usage question +title: '' +labels: Question +assignees: '' + +--- + + From 0d9961517362656f9c073c3839fb64af6f30bedf Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 14 Jan 2020 09:59:53 +1100 Subject: [PATCH 216/448] Delete ISSUE_TEMPLATE.md --- ISSUE_TEMPLATE.md | 57 ----------------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 ISSUE_TEMPLATE.md diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md deleted file mode 100644 index c8ce3e4905b37..0000000000000 --- a/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,57 +0,0 @@ - - - - -#### Description - - -#### Steps/Code to Reproduce - - -#### Expected Results - - -#### Actual Results - - -#### Versions - - - - From fa5817b5d038b0cb6067907a5e9ee8fae1b95c19 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 14 Jan 2020 10:03:31 +1100 Subject: [PATCH 217/448] MNT create config.yml for new issue templates --- .github/ISSUE_TEMPLATE/config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000..3ba13e0cec6cb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false From 562c08b7e0550cdff31a41683647ed2ac19f8200 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 14 Jan 2020 02:24:59 +0100 Subject: [PATCH 218/448] MNT remoe deprecated _refresh_cache usage (joblib related) (#16108) --- sklearn/datasets/_base.py | 28 ------------ sklearn/datasets/_california_housing.py | 5 +- sklearn/datasets/_covtype.py | 7 +-- sklearn/datasets/_kddcup99.py | 7 +-- sklearn/datasets/_olivetti_faces.py | 5 +- sklearn/datasets/_rcv1.py | 13 ++---- sklearn/datasets/_species_distributions.py | 5 +- sklearn/datasets/tests/test_base.py | 53 ---------------------- 8 files changed, 11 insertions(+), 112 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index bdb5cfe8772ca..62bffb947a8ff 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -919,31 +919,3 @@ def _fetch_remote(remote, dirname=None): "file may be corrupted.".format(file_path, checksum, remote.checksum)) return file_path - - -def _refresh_cache(files, compress): - # TODO: REMOVE in v0.23 - import joblib - msg = "sklearn.externals.joblib is deprecated in 0.21" - with warnings.catch_warnings(record=True) as warns: - data = tuple([joblib.load(f) for f in files]) - - refresh_needed = any([str(x.message).startswith(msg) for 
x in warns]) - - other_warns = [w for w in warns if not str(w.message).startswith(msg)] - for w in other_warns: - warnings.warn(message=w.message, category=w.category) - - if refresh_needed: - try: - for value, path in zip(data, files): - joblib.dump(value, path, compress=compress) - except IOError: - message = ("This dataset will stop being loadable in scikit-learn " - "version 0.23 because it references a deprecated " - "import path. Consider removing the following files " - "and allowing it to be cached anew:\n%s" - % ("\n".join(files))) - warnings.warn(message=message, category=FutureWarning) - - return data[0] if len(data) == 1 else data diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index c71ebf3871b75..958184369b63d 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -35,7 +35,6 @@ from ._base import _fetch_remote from ._base import _pkl_filepath from ._base import RemoteFileMetadata -from ._base import _refresh_cache from ..utils import Bunch # The original data can be found at: @@ -146,9 +145,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, remove(archive_path) else: - cal_housing = _refresh_cache([filepath], 6) - # TODO: Revert to the following line in v0.23 - # cal_housing = joblib.load(filepath) + cal_housing = joblib.load(filepath) feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"] diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index f9fab853adc84..367ec1f9e2970 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -25,7 +25,6 @@ from . import get_data_home from ._base import _fetch_remote from ._base import RemoteFileMetadata -from ._base import _refresh_cache from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state @@ -126,10 +125,8 @@ def fetch_covtype(data_home=None, download_if_missing=True, try: X, y except NameError: - X, y = _refresh_cache([samples_path, targets_path], 9) - # TODO: Revert to the following two lines in v0.23 - # X = joblib.load(samples_path) - # y = joblib.load(targets_path) + X = joblib.load(samples_path) + y = joblib.load(targets_path) if shuffle: ind = np.arange(X.shape[0]) diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 0a8121521ac82..4585df8b0fb8b 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -20,7 +20,6 @@ from ._base import _fetch_remote from . 
import get_data_home from ._base import RemoteFileMetadata -from ._base import _refresh_cache from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method @@ -293,10 +292,8 @@ def _fetch_brute_kddcup99(data_home=None, try: X, y except NameError: - X, y = _refresh_cache([samples_path, targets_path], 0) - # TODO: Revert to the following two lines in v0.23 - # X = joblib.load(samples_path) - # y = joblib.load(targets_path) + X = joblib.load(samples_path) + y = joblib.load(targets_path) return Bunch(data=X, target=y) diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index f88f088e82e8b..d1a9805b495f2 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -24,7 +24,6 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ._base import _pkl_filepath -from ._base import _refresh_cache from ..utils import check_random_state, Bunch # The original data can be found at: @@ -110,9 +109,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, joblib.dump(faces, filepath, compress=6) del mfile else: - faces = _refresh_cache([filepath], 6) - # TODO: Revert to the following line in v0.23 - # faces = joblib.load(filepath) + faces = joblib.load(filepath) # We want floating point data, but float32 is enough (there is only # one byte of precision in the original uint8s anyway) diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 0836fe1249271..d930a347b7f7c 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -22,7 +22,6 @@ from ._base import _pkl_filepath from ._base import _fetch_remote from ._base import RemoteFileMetadata -from ._base import _refresh_cache from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch @@ -190,10 +189,8 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, f.close() remove(f.name) else: - X, sample_id = _refresh_cache([samples_path, sample_id_path], 9) - # TODO: Revert to the following two lines in v0.23 - # X = joblib.load(samples_path) - # sample_id = joblib.load(sample_id_path) + X = joblib.load(samples_path) + sample_id = joblib.load(sample_id_path) # load target (y), categories, and sample_id_bis if download_if_missing and (not exists(sample_topics_path) or @@ -246,10 +243,8 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) else: - y, categories = _refresh_cache([sample_topics_path, topics_path], 9) - # TODO: Revert to the following two lines in v0.23 - # y = joblib.load(sample_topics_path) - # categories = joblib.load(topics_path) + y = joblib.load(sample_topics_path) + categories = joblib.load(topics_path) if subset == 'all': pass diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 99dc192af755b..7800dfce2c190 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -51,7 +51,6 @@ from ._base import RemoteFileMetadata from ..utils import Bunch from ._base import _pkl_filepath -from ._base import _refresh_cache # The original data can be found at: # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip @@ -260,8 +259,6 @@ def fetch_species_distributions(data_home=None, **extra_params) joblib.dump(bunch, archive_path, compress=9) else: - bunch = 
_refresh_cache([archive_path], 9) - # TODO: Revert to the following line in v0.23 - # bunch = joblib.load(archive_path) + bunch = joblib.load(archive_path) return bunch diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 3a0ad41ced969..7f56217e93455 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -24,7 +24,6 @@ from sklearn.datasets import load_boston from sklearn.datasets import load_wine from sklearn.utils import Bunch -from sklearn.datasets._base import _refresh_cache from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.externals._pilutil import pillow_installed @@ -277,55 +276,3 @@ def test_bunch_dir(): # check that dir (important for autocomplete) shows attributes data = load_iris() assert "data" in dir(data) - - -def test_refresh_cache(monkeypatch): - # uses pytests monkeypatch fixture - # https://docs.pytest.org/en/latest/monkeypatch.html - - def _load_warn(*args, **kwargs): - # raise the warning from "externals.joblib.__init__.py" - # this is raised when a file persisted by the old joblib is loaded now - msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be " - "removed in 0.23. Please import this functionality directly " - "from joblib, which can be installed with: pip install joblib. " - "If this warning is raised when loading pickled models, you " - "may need to re-serialize those models with scikit-learn " - "0.21+.") - warnings.warn(msg, FutureWarning) - return 0 - - def _load_warn_unrelated(*args, **kwargs): - warnings.warn("unrelated warning", FutureWarning) - return 0 - - def _dump_safe(*args, **kwargs): - pass - - def _dump_raise(*args, **kwargs): - # this happens if the file is read-only and joblib.dump fails to write - # on it. 
- raise IOError() - - # test if the dataset spesific warning is raised if load raises the joblib - # warning, and dump fails to dump with new joblib - monkeypatch.setattr(joblib, "load", _load_warn) - monkeypatch.setattr(joblib, "dump", _dump_raise) - msg = "This dataset will stop being loadable in scikit-learn" - with pytest.warns(FutureWarning, match=msg): - _refresh_cache('test', 0) - - # make sure no warning is raised if load raises the warning, but dump - # manages to dump the new data - monkeypatch.setattr(joblib, "load", _load_warn) - monkeypatch.setattr(joblib, "dump", _dump_safe) - with pytest.warns(None) as warns: - _refresh_cache('test', 0) - assert len(warns) == 0 - - # test if an unrelated warning is still passed through and not suppressed - # by _refresh_cache - monkeypatch.setattr(joblib, "load", _load_warn_unrelated) - monkeypatch.setattr(joblib, "dump", _dump_safe) - with pytest.warns(FutureWarning, match="unrelated warning"): - _refresh_cache('test', 0) From 54344b117554862a54ac50ed42669703c671a253 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta <8859981+sid21g@users.noreply.github.com> Date: Tue, 14 Jan 2020 12:44:56 +0530 Subject: [PATCH 219/448] DOC Correct docstring definition for log_loss function (#16037) --- sklearn/metrics/_classification.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 5e56816ab726f..15d5aa6687131 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2091,8 +2091,9 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, This is the loss function used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative - log-likelihood of the true labels given a probabilistic classifier's - predictions. The log loss is only defined for two or more labels. + log-likelihood of a logistic model that returns ``y_pred`` probabilities + for its training data ``y_true``. + The log loss is only defined for two or more labels. 
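As a small check of the definition above (the probabilities below are made up for the example), the metric agrees with a direct computation of the average negative log-likelihood::

    import numpy as np
    from sklearn.metrics import log_loss

    y_true = [1, 0, 1, 1]
    y_prob = [0.9, 0.2, 0.7, 0.99]  # predicted probability that y = 1
    manual = -np.mean([yt * np.log(yp) + (1 - yt) * np.log(1 - yp)
                       for yt, yp in zip(y_true, y_prob)])
    print(np.isclose(manual, log_loss(y_true, y_prob)))  # True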
For a single sample with true label yt in {0,1} and estimated probability yp that yt = 1, the log loss is From b84428e79602a9760a6c06a77531eeb70893b500 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 14 Jan 2020 06:44:04 -0500 Subject: [PATCH 220/448] TST Updates test for deprecation in pandas.SparseArray (#16040) --- sklearn/utils/tests/test_multiclass.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 022252d0c4836..428c6afdd18f9 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -3,6 +3,7 @@ import scipy.sparse as sp from itertools import product import pytest +from distutils.version import LooseVersion from scipy.sparse import issparse from scipy.sparse import csc_matrix @@ -291,7 +292,12 @@ def test_type_of_target(): def test_type_of_target_pandas_sparse(): pd = pytest.importorskip("pandas") - y = pd.SparseArray([1, np.nan, np.nan, 1, np.nan]) + if LooseVersion(pd.__version__) >= '0.25': + pd_sparse_array = pd.arrays.SparseArray + else: + pd_sparse_array = pd.SparseArray + + y = pd_sparse_array([1, np.nan, np.nan, 1, np.nan]) msg = "y cannot be class 'SparseSeries' or 'SparseArray'" with pytest.raises(ValueError, match=msg): type_of_target(y) From 88eadf00ac5c32c3094f5c47f88b90ee20aa5d01 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 22:12:17 +0100 Subject: [PATCH 221/448] DOC Better docstring and User Guide for PDPs (#16114) --- doc/modules/partial_dependence.rst | 86 +++++++++++--- sklearn/inspection/_partial_dependence.py | 135 +++++++++++----------- 2 files changed, 135 insertions(+), 86 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index a538f1156b748..612dbcefce01d 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -92,22 +92,76 @@ generated. The ``values`` field returned by used in the grid for each target feature. They also correspond to the axis of the plots. -For each value of the 'target' features in the ``grid`` the partial -dependence function needs to marginalize the predictions of the estimator -over all possible values of the 'complement' features. With the ``'brute'`` -method, this is done by replacing every target feature value of ``X`` by those -in the grid, and computing the average prediction. - -In decision trees this can be evaluated efficiently without reference to the -training data (``'recursion'`` method). For each grid point a weighted tree -traversal is performed: if a split node involves a 'target' feature, the -corresponding left or right branch is followed, otherwise both branches are -followed, each branch is weighted by the fraction of training samples that -entered that branch. Finally, the partial dependence is given by a weighted -average of all visited leaves. Note that with the ``'recursion'`` method, -``X`` is only used to generate the grid, not to compute the averaged -predictions. The averaged predictions will always be computed on the data with -which the trees were trained. +Mathematical Definition +^^^^^^^^^^^^^^^^^^^^^^^ + +Let :math:`X_S` be the set of target features (i.e. the `features` parameter) +and let :math:`X_C` be its complement. + +The partial dependence of the response :math:`f` at a point :math:`x_S` is +defined as: + +.. 
math:: + + pd_{X_S}(x_S) &\overset{def}{=} \mathbb{E}_{X_C}\left[ f(x_S, X_C) \right]\\ + &= \int f(x_S, x_C) p(x_C) dx_C, + +where :math:`f(x_S, x_C)` is the response function (:term:`predict`, +:term:`predict_proba` or :term:`decision_function`) for a given sample whose +values are defined by :math:`x_S` for the features in :math:`X_S`, and by +:math:`x_C` for the features in :math:`X_C`. Note that :math:`x_S` and +:math:`x_C` may be tuples. + +Computing this integral for various values of :math:`x_S` produces a plot as +above. + +Computation methods +^^^^^^^^^^^^^^^^^^^ + +There are two main methods to approximate the integral above, namely the +'brute' and 'recursion' methods. The `method` parameter controls which method +to use. + +The 'brute' method is a generic method that works with any estimator. It +approximates the above integral by computing an average over the data `X`: + +.. math:: + + pd_{X_S}(x_S) \approx \frac{1}{n_\text{samples}} \sum_{i=1}^n f(x_S, x_C^{(i)}), + +where :math:`x_C^{(i)}` is the value of the i-th sample for the features in +:math:`X_C`. For each value of :math:`x_S`, this method requires a full pass +over the dataset `X` which is computationally intensive. + +The 'recursion' method is faster than the 'brute' method, but it is only +supported by some tree-based estimators. It is computed as follows. For a +given point :math:`x_S`, a weighted tree traversal is performed: if a split +node involves a 'target' feature, the corresponding left or right branch is +followed; otherwise both branches are followed, each branch being weighted +by the fraction of training samples that entered that branch. Finally, the +partial dependence is given by a weighted average of all the visited leaves +values. + +With the 'brute' method, the parameter `X` is used both for generating the +grid of values :math:`x_S` and the complement feature values :math:`x_C`. +However with the 'recursion' method, `X` is only used for the grid values: +implicitly, the :math:`x_C` values are those of the training data. + +By default, the 'recursion' method is used on tree-based estimators that +support it, and 'brute' is used for the rest. + +.. _pdp_method_differences: + +.. note:: + + While both methods should be close in general, they might differ in some + specific settings. The 'brute' method assumes the existence of the + data points :math:`(x_S, x_C^{(i)})`. When the features are correlated, + such artificial samples may have a very low probability mass. The 'brute' + and 'recursion' methods will likely disagree regarding the value of the + partial dependence, because they will treat these unlikely + samples differently. Remember, however, that the primary assumption for + interpreting PDPs is that the features should be independent. .. rubric:: Footnotes diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 12233a766969c..2f42dece3f40a 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -188,6 +188,23 @@ def partial_dependence(estimator, X, features, response_method='auto', Read more in the :ref:`User Guide `. + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + 'recursion' method (used by default) will not account for the `init` + predictor of the boosting process. 
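To see the two computation methods side by side, a small sketch on a synthetic regression problem (the data and tolerance are illustrative, not part of this change)::

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.inspection import partial_dependence

    rng = np.random.RandomState(0)
    X = rng.rand(200, 3)
    y = X[:, 0] + 2 * X[:, 1] + 0.1 * rng.rand(200)
    est = GradientBoostingRegressor(random_state=0).fit(X, y)

    pd_brute, _ = partial_dependence(est, X, features=[0], method='brute')
    pd_rec, _ = partial_dependence(est, X, features=[0], method='recursion')
    # With the default (constant) init, the two methods are expected to
    # agree up to a constant offset in the target response, so the
    # centered curves should be close.
    diff = (pd_brute - pd_brute.mean()) - (pd_rec - pd_rec.mean())
    print(np.abs(diff).max())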
In practice, this will produce + the same values as 'brute' up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for 'recursion' because the + offset will be sample-dependent. It is preferable to use the 'brute' + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + Parameters ---------- estimator : BaseEstimator @@ -196,9 +213,10 @@ def partial_dependence(estimator, X, features, response_method='auto', Multioutput-multiclass classifiers are not supported. X : {array-like or dataframe} of shape (n_samples, n_features) - ``X`` is used both to generate a grid of values for the - ``features``, and to compute the averaged predictions when - method is 'brute'. + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is 'brute'. features : array-like of {int, str} The feature (e.g. `[0]`) or pair of interacting features @@ -225,34 +243,24 @@ def partial_dependence(estimator, X, features, response_method='auto', method : str, optional (default='auto') The method used to calculate the averaged predictions: - - 'recursion' is only supported for gradient boosting estimator (namely - :class:`GradientBoostingClassifier`, - :class:`GradientBoostingRegressor`, - :class:`HistGradientBoostingClassifier`, - :class:`HistGradientBoostingRegressor`) + - 'recursion' is only supported for some tree-based estimators (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`) but is more efficient in terms of speed. - With this method, ``X`` is only used to build the - grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predictor of - the boosting process, which may lead to incorrect values (see - warning below). With this method, the target response of a + With this method, the target response of a classifier is always the decision function, not the predicted probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. - - 'auto': + - 'auto': the 'recursion' is used for estimators that support it, + and 'brute' is used otherwise. - - 'recursion' is used for - :class:`GradientBoostingClassifier` - and - :class:`GradientBoostingRegressor` - if ``init=None``, and for - :class:`HistGradientBoostingClassifier` - and - :class:`HistGradientBoostingRegressor`. - - 'brute' is used for all other estimators. + Please see :ref:`this note ` for + differences between the 'brute' and 'recursion' method. Returns ------- @@ -286,21 +294,6 @@ def partial_dependence(estimator, X, features, response_method='auto', See also -------- sklearn.inspection.plot_partial_dependence: Plot partial dependence - - Warnings - -------- - The 'recursion' method only works for gradient boosting estimators, and - unlike the 'brute' method, it does not account for the ``init`` - predictor of the boosting process. 
In practice this will produce the - same values as 'brute' up to a constant offset in the target response, - provided that ``init`` is a consant estimator (which is the default). - However, as soon as ``init`` is not a constant estimator, the partial - dependence values are incorrect for 'recursion'. This is not relevant for - :class:`HistGradientBoostingClassifier - ` and - :class:`HistGradientBoostingRegressor - `, which do not have an - ``init`` parameter. """ if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError( @@ -425,6 +418,8 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. + Read more in the :ref:`User Guide `. + .. note:: :func:`plot_partial_dependence` does not support using the same axes @@ -441,7 +436,22 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, >>> disp2 = plot_partial_dependence(est, X, ... ax=disp1.axes_) # doctest: +SKIP - Read more in the :ref:`User Guide `. + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + 'recursion' method (used by default) will not account for the `init` + predictor of the boosting process. In practice, this will produce + the same values as 'brute' up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for 'recursion' because the + offset will be sample-dependent. It is preferable to use the 'brute' + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. Parameters ---------- @@ -451,8 +461,10 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, Multioutput-multiclass classifiers are not supported. X : {array-like or dataframe} of shape (n_samples, n_features) - The data to use to build the grid of values on which the dependence - will be evaluated. This is usually the training data. + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is 'brute'. features : list of {int, str, pair of int, pair of str} The target features for which to create the PDPs. @@ -499,28 +511,26 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, for the PDP axes. Must be in [0, 1]. 
method : str, optional (default='auto') - The method to use to calculate the partial dependence predictions: + The method used to calculate the averaged predictions: - - 'recursion' is only supported for gradient boosting estimator (namely - :class:`GradientBoostingClassifier`, - :class:`GradientBoostingRegressor`, - :class:`HistGradientBoostingClassifier`, - :class:`HistGradientBoostingRegressor`) + - 'recursion' is only supported for some tree-based estimators (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`) but is more efficient in terms of speed. - With this method, ``X`` is optional and is only used to build the - grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predictor of - the boosting process, which may lead to incorrect values (see - warning below. With this method, the target response of a + With this method, the target response of a classifier is always the decision function, not the predicted probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. - - 'auto': - - 'recursion' is used for estimators that supports it. - - 'brute' is used for all other estimators. + - 'auto': the 'recursion' is used for estimators that support it, + and 'brute' is used otherwise. + + Please see :ref:`this note ` for + differences between the 'brute' and 'recursion' method. n_jobs : int, optional (default=None) The number of CPUs to use to compute the partial dependences. @@ -574,21 +584,6 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, -------- sklearn.inspection.partial_dependence: Return raw partial dependence values - - Warnings - -------- - The 'recursion' method only works for gradient boosting estimators, and - unlike the 'brute' method, it does not account for the ``init`` - predictor of the boosting process. In practice this will produce the - same values as 'brute' up to a constant offset in the target response, - provided that ``init`` is a consant estimator (which is the default). - However, as soon as ``init`` is not a constant estimator, the partial - dependence values are incorrect for 'recursion'. This is not relevant for - :class:`HistGradientBoostingClassifier - ` and - :class:`HistGradientBoostingRegressor - `, which do not have an - ``init`` parameter. """ check_matplotlib_support('plot_partial_dependence') # noqa import matplotlib.pyplot as plt # noqa From c4ea377198f4289af16d33f60b74c6258158bf9f Mon Sep 17 00:00:00 2001 From: alexshacked Date: Wed, 15 Jan 2020 04:06:50 +0200 Subject: [PATCH 222/448] FIX ensure object array are properly casted when dtype=object (#16076) --- doc/whats_new/v0.23.rst | 9 ++++++ sklearn/neighbors/_base.py | 16 ++++------ sklearn/neighbors/tests/test_neighbors.py | 24 ++++++++++++++ sklearn/preprocessing/tests/test_label.py | 4 +-- sklearn/utils/__init__.py | 39 +++++++++++++++++++++++ sklearn/utils/tests/test_utils.py | 12 +++++++ 6 files changed, 92 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6d398ceef3cbd..352f19cfe3a73 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -136,6 +136,15 @@ Changelog differs between `predict` and `fit`. :pr:`16090` by :user:`Madhura Jayaratne `. +:mod:`sklearn.neighbors` +.............................. 
+ +- |Fix| Fix a bug which converted a list of arrays into a 2-D object + array instead of a 1-D array containing NumPy arrays. This bug + was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. + :pr:`16076` by :user:`Guillaume Lemaitre ` and + :user:`Alex Shacked `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 258440d20c836..f927c26868a5f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -24,6 +24,7 @@ from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import check_non_negative @@ -276,8 +277,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object) - neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object) + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) if return_distance: return neigh_dist, neigh_ind @@ -940,17 +941,12 @@ class from an array representing our data set and ask who's neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) neigh_dist_list = sum(neigh_dist_chunks, []) neigh_ind_list = sum(neigh_ind_chunks, []) - # See https://github.com/numpy/numpy/issues/5456 - # to understand why this is initialized this way. - neigh_dist = np.empty(len(neigh_dist_list), dtype='object') - neigh_dist[:] = neigh_dist_list - neigh_ind = np.empty(len(neigh_ind_list), dtype='object') - neigh_ind[:] = neigh_ind_list + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) results = neigh_dist, neigh_ind else: neigh_ind_list = sum(chunked_results, []) - results = np.empty(len(neigh_ind_list), dtype='object') - results[:] = neigh_ind_list + results = _to_object_array(neigh_ind_list) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 03c79086dfedd..88e32669777a1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -649,6 +649,30 @@ def test_radius_neighbors_boundary_handling(): assert_array_equal(results[0], [0, 1]) +def test_radius_neighbors_returns_array_of_objects(): + # check that we can pass precomputed distances to + # NearestNeighbors.radius_neighbors() + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16036 + X = csr_matrix(np.ones((4, 4))) + X.setdiag([0, 0, 0, 0]) + + nbrs = neighbors.NearestNeighbors(radius=0.5, algorithm='auto', + leaf_size=30, + metric='precomputed').fit(X) + neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) + + expected_dist = np.empty(X.shape[0], dtype=object) + expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), + np.array([0])] + expected_ind = np.empty(X.shape[0], dtype=object) + expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), + np.array([3])] + + assert_array_equal(neigh_dist, expected_dist) + assert_array_equal(neigh_ind, expected_ind) + + def test_RadiusNeighborsClassifier_multioutput(): # Test k-NN classifier on 
multioutput data rng = check_random_state(0) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 6cdb198182a20..887fa90c98d61 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -14,6 +14,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings +from sklearn.utils import _to_object_array from sklearn.preprocessing._label import LabelBinarizer from sklearn.preprocessing._label import MultiLabelBinarizer @@ -433,8 +434,7 @@ def test_multilabel_binarizer_same_length_sequence(): def test_multilabel_binarizer_non_integer_labels(): - tuple_classes = np.empty(3, dtype=object) - tuple_classes[:] = [(1,), (2,), (3,)] + tuple_classes = _to_object_array([(1,), (2,), (3,)]) inputs = [ ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']), ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']), diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 82abff2b12183..ee38b9b924ccc 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -819,6 +819,45 @@ def tosequence(x): return list(x) +def _to_object_array(sequence): + """Convert sequence to a 1-D NumPy array of object dtype. + + numpy.array constructor has a similar use but it's output + is ambiguous. It can be 1-D NumPy array of object dtype if + the input is a ragged array, but if the input is a list of + equal length arrays, then the output is a 2D numpy.array. + _to_object_array solves this ambiguity by guarantying that + the output is a 1-D NumPy array of objects for any input. + + Parameters + ---------- + sequence : array-like of shape (n_elements,) + The sequence to be converted. + + Returns + ------- + out : ndarray of shape (n_elements,), dtype=object + The converted sequence into a 1-D NumPy array of object dtype. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.utils import _to_object_array + >>> _to_object_array([np.array([0]), np.array([1])]) + array([array([0]), array([1])], dtype=object) + >>> _to_object_array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) + >>> np.array([np.array([0]), np.array([1])]) + array([[0], + [1]]) + >>> np.array([np.array([0]), np.array([1, 2])]) + array([array([0]), array([1, 2])], dtype=object) + """ + out = np.empty(len(sequence), dtype=object) + out[:] = sequence + return out + + def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. 
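The behaviour addressed here is easiest to see on a tiny, made-up example: with ragged neighbourhoods, ``radius_neighbors`` should return 1-D object arrays whose entries are the per-sample index arrays::

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.array([[0.0], [0.1], [5.0]])
    nn = NearestNeighbors(radius=0.5).fit(X)
    dist, ind = nn.radius_neighbors(X)
    print(ind.dtype, ind.shape)        # object (3,)
    print([sorted(i) for i in ind])    # [[0, 1], [0, 1], [2]]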
diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 2e2711f595d11..c3ae523b32b39 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -28,6 +28,7 @@ from sklearn.utils import _message_with_time, _print_elapsed_time from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan +from sklearn.utils import _to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context @@ -646,3 +647,14 @@ def test_deprecation_joblib_api(tmpdir): from sklearn.utils._joblib import joblib del joblib.parallel.BACKENDS['failing'] + + +@pytest.mark.parametrize( + "sequence", + [[np.array(1), np.array(2)], [[1, 2], [3, 4]]] +) +def test_to_object_array(sequence): + out = _to_object_array(sequence) + assert isinstance(out, np.ndarray) + assert out.dtype.kind == 'O' + assert out.ndim == 1 From 999f288ef486697e307de0f461033118389013ca Mon Sep 17 00:00:00 2001 From: mo <31044045+mghah@users.noreply.github.com> Date: Wed, 15 Jan 2020 05:58:10 -0800 Subject: [PATCH 223/448] DOC improve docstring of covariance module following doc guideline (#16105) --- sklearn/covariance/_elliptic_envelope.py | 55 +++--- sklearn/covariance/_empirical_covariance.py | 59 +++---- sklearn/covariance/_graph_lasso.py | 157 +++++++++-------- sklearn/covariance/_robust_covariance.py | 183 ++++++++++---------- sklearn/covariance/_shrunk_covariance.py | 94 +++++----- 5 files changed, 275 insertions(+), 273 deletions(-) diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 5ee4cdeeef96d..b96831077e68a 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -16,10 +16,10 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): Parameters ---------- - store_precision : boolean, optional (default=True) + store_precision : bool, default=True Specify if the estimated precision is stored. - assume_centered : boolean, optional (default=False) + assume_centered : bool, default=False If True, the support of robust location and covariance estimates is computed, and a covariance estimate is recomputed from it, without centering the data. @@ -28,16 +28,17 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): If False, the robust location and covariance are directly computed with the FastMCD algorithm without additional treatment. - support_fraction : float in (0., 1.), optional (default=None) + support_fraction : float, default=None The proportion of points to be included in the support of the raw MCD estimate. If None, the minimum value of support_fraction will be used within the algorithm: `[n_sample + n_features + 1] / 2`. + Range is (0, 1). - contamination : float in (0., 0.5), optional (default=0.1) + contamination : float, default=0.1 The amount of contamination of the data set, i.e. the proportion - of outliers in the data set. + of outliers in the data set. Range is (0, 0.5). - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. 
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -46,17 +47,17 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated robust location - covariance_ : array-like, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated robust covariance matrix - precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) - support_ : array-like, shape (n_samples,) + support_ : ndarray of shape (n_samples,) A mask of the observations that have been used to compute the robust estimates of location and shape. @@ -102,7 +103,6 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the minimum covariance determinant estimator" Technometrics 41(3), 212 (1999) - """ def __init__(self, store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, @@ -119,12 +119,11 @@ def fit(self, X, y=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features). - Training data + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. y : Ignored - not used, present for API consistency by convention. - + Not used, present for API consistency by convention. """ super().fit(X) self.offset_ = np.percentile(-self.dist_, 100. * self.contamination) @@ -135,17 +134,16 @@ def decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - - decision : array-like, shape (n_samples, ) + decision : ndarray of shape (n_samples, ) Decision function of the samples. It is equal to the shifted Mahalanobis distances. The threshold for being an outlier is 0, which ensures a compatibility with other outlier detection algorithms. - """ check_is_fitted(self) negative_mahal_dist = self.score_samples(X) @@ -156,11 +154,12 @@ def score_samples(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - negative_mahal_distances : array-like, shape (n_samples, ) + negative_mahal_distances : array-like of shape (n_samples,) Opposite of the Mahalanobis distances. """ check_is_fitted(self) @@ -173,11 +172,12 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier : ndarray of shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. """ X = check_array(X) @@ -196,19 +196,18 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Test samples. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : array-like of shape (n_samples,) or (n_samples, n_outputs) True labels for X. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- score : float - Mean accuracy of self.predict(X) wrt. y. 
- + Mean accuracy of self.predict(X) w.r.t. y. """ return accuracy_score(y, self.predict(X), sample_weight=sample_weight) diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 3a76abb326a26..91dfd85740bf8 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -29,15 +29,16 @@ def log_likelihood(emp_cov, precision): Parameters ---------- - emp_cov : 2D ndarray (n_features, n_features) - Maximum Likelihood Estimator of covariance + emp_cov : ndarray of shape (n_features, n_features) + Maximum Likelihood Estimator of covariance. - precision : 2D ndarray (n_features, n_features) - The precision matrix of the covariance model to be tested + precision : ndarray of shape (n_features, n_features) + The precision matrix of the covariance model to be tested. Returns ------- - sample mean of the log-likelihood + log_likelihood_ : float + Sample mean of the log-likelihood. """ p = precision.shape[0] log_likelihood_ = - np.sum(emp_cov * precision) + fast_logdet(precision) @@ -52,10 +53,10 @@ def empirical_covariance(X, assume_centered=False): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Data from which to compute the covariance estimate - assume_centered : boolean + assume_centered : bool, default=False If True, data will not be centered before computation. Useful when working with data whose mean is almost, but not exactly zero. @@ -63,9 +64,8 @@ def empirical_covariance(X, assume_centered=False): Returns ------- - covariance : 2D ndarray, shape (n_features, n_features) + covariance : ndarray of shape (n_features, n_features) Empirical covariance (Maximum Likelihood Estimator). - """ X = np.asarray(X) if X.ndim == 1: @@ -92,10 +92,10 @@ class EmpiricalCovariance(BaseEstimator): Parameters ---------- - store_precision : bool + store_precision : bool, default=True Specifies if the estimated precision is stored. - assume_centered : bool + assume_centered : bool, default=False If True, data are not centered before computation. Useful when working with data whose mean is almost, but not exactly zero. @@ -103,13 +103,13 @@ class EmpiricalCovariance(BaseEstimator): Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated location, i.e. the estimated mean. - covariance_ : 2D ndarray, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix - precision_ : 2D ndarray, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo-inverse matrix. (stored only if store_precision is True) @@ -144,10 +144,9 @@ def _set_covariance(self, covariance): Parameters ---------- - covariance : 2D ndarray, shape (n_features, n_features) + covariance : array-like of shape (n_features, n_features) Estimated covariance matrix to be stored, and from which precision is computed. - """ covariance = check_array(covariance) # set covariance @@ -163,9 +162,8 @@ def get_precision(self): Returns ------- - precision_ : array-like + precision_ : array-like of shape (n_features, n_features) The precision matrix associated to the current covariance object. - """ if self.store_precision: precision = self.precision_ @@ -183,13 +181,12 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. 
- y - not used, present for API consistence purpose. + y : Ignored + Not used, present for API consistence purpose. Returns ------- self : object - """ X = check_array(X) if self.assume_centered: @@ -214,15 +211,14 @@ def score(self, X_test, y=None): X_test is assumed to be drawn from the same distribution than the data used in fit (including centering). - y - not used, present for API consistence purpose. + y : Ignored + Not used, present for API consistence purpose. Returns ------- res : float The likelihood of the data set with `self.covariance_` as an estimator of its covariance matrix. - """ # compute empirical covariance of the test set test_cov = empirical_covariance( @@ -242,26 +238,26 @@ def error_norm(self, comp_cov, norm='frobenius', scaling=True, comp_cov : array-like of shape (n_features, n_features) The covariance to compare with. - norm : str + norm : {"frobenius", "spectral"}, default="frobenius" The type of norm used to compute the error. Available error types: - 'frobenius' (default): sqrt(tr(A^t.A)) - 'spectral': sqrt(max(eigenvalues(A^t.A)) where A is the error ``(comp_cov - self.covariance_)``. - scaling : bool + scaling : bool, default=True If True (default), the squared error norm is divided by n_features. If False, the squared error norm is not rescaled. - squared : bool + squared : bool, default=True Whether to compute the squared error norm or the error norm. If True (default), the squared error norm is returned. If False, the error norm is returned. Returns ------- - The Mean Squared Error (in the sense of the Frobenius norm) between - `self` and `comp_cov` covariance estimators. - + result : float + The Mean Squared Error (in the sense of the Frobenius norm) between + `self` and `comp_cov` covariance estimators. """ # compute the error error = comp_cov - self.covariance_ @@ -296,9 +292,8 @@ def mahalanobis(self, X): Returns ------- - dist : array, shape = [n_samples,] + dist : ndarray of shape (n_samples,) Squared Mahalanobis distances of the observations. - """ precision = self.get_precision() # compute mahalanobis distances diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index c282d40c826bd..9dbf786839585 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -58,16 +58,14 @@ def alpha_max(emp_cov): Parameters ---------- - emp_cov : 2D array, (n_features, n_features) - The sample covariance matrix + emp_cov : ndarray of shape (n_features, n_features) + The sample covariance matrix. Notes ----- - This results from the bound for the all the Lasso that are solved in GraphicalLasso: each time, the row of cov corresponds to Xy. As the bound for alpha is given by `max(abs(Xy))`, the result follows. - """ A = np.copy(emp_cov) A.flat[::A.shape[0] + 1] = 0 @@ -86,56 +84,57 @@ def graphical_lasso(emp_cov, alpha, cov_init=None, mode='cd', tol=1e-4, Parameters ---------- - emp_cov : 2D ndarray, shape (n_features, n_features) + emp_cov : ndarray of shape (n_features, n_features) Empirical covariance from which to compute the covariance estimate. - alpha : positive float + alpha : float The regularization parameter: the higher alpha, the more regularization, the sparser the inverse covariance. + Range is (0, inf]. - cov_init : 2D array (n_features, n_features), optional + cov_init : array of shape (n_features, n_features), default=None The initial guess for the covariance. - mode : {'cd', 'lars'} + mode : {'cd', 'lars'}, default='cd' The Lasso solver to use: coordinate descent or LARS. 
Use LARS for very sparse underlying graphs, where p > n. Elsewhere prefer cd which is more numerically stable. - tol : positive float, optional + tol : float, default=1e-4 The tolerance to declare convergence: if the dual gap goes below - this value, iterations are stopped. + this value, iterations are stopped. Range is (0, inf]. - enet_tol : positive float, optional + enet_tol : float, default=1e-4 The tolerance for the elastic net solver used to calculate the descent direction. This parameter controls the accuracy of the search direction for a given column update, not of the overall parameter estimate. Only - used for mode='cd'. + used for mode='cd'. Range is (0, inf]. - max_iter : integer, optional + max_iter : int, default=100 The maximum number of iterations. - verbose : boolean, optional + verbose : bool, default=False If verbose is True, the objective function and dual gap are printed at each iteration. - return_costs : boolean, optional + return_costs : bool, default=Flase If return_costs is True, the objective function and dual gap at each iteration are returned. - eps : float, optional + eps : float, default=eps The machine-precision regularization in the computation of the Cholesky diagonal factors. Increase this for very ill-conditioned - systems. + systems. Default is `np.finfo(np.float64).eps`. - return_n_iter : bool, optional + return_n_iter : bool, default=False Whether or not to return the number of iterations. Returns ------- - covariance : 2D ndarray, shape (n_features, n_features) + covariance : ndarray of shape (n_features, n_features) The estimated covariance matrix. - precision : 2D ndarray, shape (n_features, n_features) + precision : ndarray of shape (n_features, n_features) The estimated (sparse) precision matrix. costs : list of (objective, dual_gap) pairs @@ -157,7 +156,6 @@ def graphical_lasso(emp_cov, alpha, cov_init=None, mode='cd', tol=1e-4, One possible difference with the `glasso` R package is that the diagonal coefficients are not penalized. - """ _, n_features = emp_cov.shape if alpha == 0: @@ -285,33 +283,34 @@ class GraphicalLasso(EmpiricalCovariance): Parameters ---------- - alpha : positive float, default 0.01 + alpha : float, default=0.01 The regularization parameter: the higher alpha, the more regularization, the sparser the inverse covariance. + Range is (0, inf]. - mode : {'cd', 'lars'}, default 'cd' + mode : {'cd', 'lars'}, default='cd' The Lasso solver to use: coordinate descent or LARS. Use LARS for very sparse underlying graphs, where p > n. Elsewhere prefer cd which is more numerically stable. - tol : positive float, default 1e-4 + tol : float, default=1e-4 The tolerance to declare convergence: if the dual gap goes below - this value, iterations are stopped. + this value, iterations are stopped. Range is (0, inf]. - enet_tol : positive float, optional + enet_tol : float, default=1e-4 The tolerance for the elastic net solver used to calculate the descent direction. This parameter controls the accuracy of the search direction for a given column update, not of the overall parameter estimate. Only - used for mode='cd'. + used for mode='cd'. Range is (0, inf]. - max_iter : integer, default 100 + max_iter : int, default=100 The maximum number of iterations. - verbose : boolean, default False + verbose : bool, default=False If verbose is True, the objective function and dual gap are plotted at each iteration. - assume_centered : boolean, default False + assume_centered : bool, default=False If True, data are not centered before computation. 
Useful when working with data whose mean is almost, but not exactly zero. @@ -319,13 +318,13 @@ class GraphicalLasso(EmpiricalCovariance): Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated location, i.e. the estimated mean. - covariance_ : array-like, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix - precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. n_iter_ : int @@ -372,9 +371,15 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data from which to compute the covariance estimate - y : (ignored) + + y : Ignored + Not used, present for API consistence purpose. + + Returns + ------- + self : object """ # Covariance does not make sense for a single feature X = check_array(X, ensure_min_features=2, ensure_min_samples=2, @@ -402,49 +407,53 @@ def graphical_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd', Parameters ---------- - X : 2D ndarray, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Data from which to compute the covariance estimate. - alphas : list of positive floats + alphas : array-like of shape (n_alphas,) The list of regularization parameters, decreasing order. - cov_init : 2D array (n_features, n_features), optional + cov_init : array of shape (n_features, n_features), default=None The initial guess for the covariance. - X_test : 2D array, shape (n_test_samples, n_features), optional + X_test : array of shape (n_test_samples, n_features), default=None Optional test matrix to measure generalisation error. - mode : {'cd', 'lars'} + mode : {'cd', 'lars'}, default='cd' The Lasso solver to use: coordinate descent or LARS. Use LARS for very sparse underlying graphs, where p > n. Elsewhere prefer cd which is more numerically stable. - tol : positive float, optional + tol : float, default=1e-4 The tolerance to declare convergence: if the dual gap goes below - this value, iterations are stopped. + this value, iterations are stopped. The tolerance must be a positive + number. - enet_tol : positive float, optional + enet_tol : float, default=1e-4 The tolerance for the elastic net solver used to calculate the descent direction. This parameter controls the accuracy of the search direction for a given column update, not of the overall parameter estimate. Only - used for mode='cd'. + used for mode='cd'. The tolerance must be a positive number. - max_iter : integer, optional - The maximum number of iterations. + max_iter : int, default=100 + The maximum number of iterations. This parameter should be a strictly + positive integer. - verbose : integer, optional + verbose : int or bool, default=False The higher the verbosity flag, the more information is printed during the fitting. Returns ------- - covariances_ : List of 2D ndarray, shape (n_features, n_features) + covariances_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) The estimated covariance matrices. - precisions_ : List of 2D ndarray, shape (n_features, n_features) + precisions_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) The estimated (sparse) precision matrices. - scores_ : List of float + scores_ : list of shape (n_alphas,), dtype=float The generalisation error (log-likelihood) on the test data. 
Returned only if test data is passed. """ @@ -500,17 +509,17 @@ class GraphicalLassoCV(GraphicalLasso): Parameters ---------- - alphas : integer, or list positive float, optional + alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 If an integer is given, it fixes the number of points on the grids of alpha to be used. If a list is given, it gives the grid to be used. See the notes in the class docstring for - more details. + more details. Range is (0, inf] when floats given. - n_refinements : strictly positive integer + n_refinements : int, default=4 The number of times the grid is refined. Not used if explicit - values of alphas are passed. + values of alphas are passed. Range is [1, inf). - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -527,36 +536,36 @@ class GraphicalLassoCV(GraphicalLasso): .. versionchanged:: 0.20 ``cv`` default value if None changed from 3-fold to 5-fold. - tol : positive float, optional + tol : float, default=1e-4 The tolerance to declare convergence: if the dual gap goes below - this value, iterations are stopped. + this value, iterations are stopped. Range is (0, inf]. - enet_tol : positive float, optional + enet_tol : float, default=1e-4 The tolerance for the elastic net solver used to calculate the descent direction. This parameter controls the accuracy of the search direction for a given column update, not of the overall parameter estimate. Only - used for mode='cd'. + used for mode='cd'. Range is (0, inf]. - max_iter : integer, optional + max_iter : int, default=100 Maximum number of iterations. - mode : {'cd', 'lars'} + mode : {'cd', 'lars'}, default='cd' The Lasso solver to use: coordinate descent or LARS. Use LARS for very sparse underlying graphs, where number of features is greater than number of samples. Elsewhere prefer cd which is more numerically stable. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : boolean, optional + verbose : bool, default=False If verbose is True, the objective function and duality gap are printed at each iteration. - assume_centered : boolean + assume_centered : bool, default=False If True, data are not centered before computation. Useful when working with data whose mean is almost, but not exactly zero. @@ -564,22 +573,22 @@ class GraphicalLassoCV(GraphicalLasso): Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated location, i.e. the estimated mean. - covariance_ : numpy.ndarray, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix. - precision_ : numpy.ndarray, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated precision matrix (inverse covariance). alpha_ : float Penalization parameter selected. - cv_alphas_ : list of float + cv_alphas_ : list of shape (n_alphas,), dtype=float All penalization parameters explored. - grid_scores_ : 2D numpy.ndarray (n_alphas, n_folds) + grid_scores_ : ndarray of shape (n_alphas, n_folds) Log-likelihood score on left-out data across folds. 
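As a rough usage sketch for the cross-validated estimator documented above (relying only on the public ``sklearn.covariance.GraphicalLassoCV`` API; the toy covariance matrix below is made up)::

    import numpy as np
    from sklearn.covariance import GraphicalLassoCV

    rng = np.random.RandomState(0)
    true_cov = np.array([[1.0, 0.3, 0.0],
                         [0.3, 1.0, 0.2],
                         [0.0, 0.2, 1.0]])
    X = rng.multivariate_normal(mean=np.zeros(3), cov=true_cov, size=200)
    model = GraphicalLassoCV(cv=3).fit(X)
    # alpha_ is the penalty retained by cross-validation; grid_scores_ has
    # one row per alpha explored and one column per fold
    print(model.alpha_, model.grid_scores_.shape)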
n_iter_ : int @@ -639,9 +648,15 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data from which to compute the covariance estimate - y : (ignored) + + y : Ignored + Not used, present for API consistence purpose. + + Returns + ------- + self : object """ # Covariance does not make sense for a single feature X = check_array(X, ensure_min_features=2, estimator=self) diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 9c59f204a7636..8afac2c3c0eee 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -33,34 +33,36 @@ def c_step(X, n_support, remaining_iterations=30, initial_estimates=None, Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data set in which we look for the n_support observations whose scatter matrix has minimum determinant. - n_support : int, > n_samples / 2 + n_support : int, Number of observations to compute the robust estimates of location - and covariance from. + and covariance from. This parameter must be greater than + `n_samples / 2`. - remaining_iterations : int, optional + remaining_iterations : int, default=30 Number of iterations to perform. According to [Rouseeuw1999]_, two iterations are sufficient to get close to the minimum, and we never need more than 30 to reach convergence. - initial_estimates : 2-tuple, optional + initial_estimates : tuple of shape (2,), default=None Initial estimates of location and shape from which to run the c_step procedure: - initial_estimates[0]: an initial location estimate - initial_estimates[1]: an initial covariance estimate - verbose : boolean, optional + verbose : bool, defaut=False Verbose mode. - cov_computation_method : callable, default empirical_covariance + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` The function which will be used to compute the covariance. - Must return shape (n_features, n_features) + Must return array of shape (n_features, n_features). - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -68,13 +70,13 @@ def c_step(X, n_support, remaining_iterations=30, initial_estimates=None, Returns ------- - location : array-like, shape (n_features,) + location : ndarray of shape (n_features,) Robust location estimates. - covariance : array-like, shape (n_features, n_features) + covariance : ndarray of shape (n_features, n_features) Robust covariance estimates. - support : array-like, shape (n_samples,) + support : ndarray of shape (n_samples,) A mask for the `n_support` observations whose scatter matrix has minimum determinant. @@ -83,7 +85,6 @@ def c_step(X, n_support, remaining_iterations=30, initial_estimates=None, .. 
[Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant Estimator, 1999, American Statistical Association and the American Society for Quality, TECHNOMETRICS - """ X = np.asarray(X) random_state = check_random_state(random_state) @@ -199,15 +200,17 @@ def select_candidates(X, n_support, n_trials, select=1, n_iter=30, Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data (sub)set in which we look for the n_support purest observations. - n_support : int, [(n + p + 1)/2] < n_support < n + n_support : int The number of samples the pure data set must contain. + This parameter must be in the range `[(n + p + 1)/2] < n_support < n`. - n_trials : int, nb_trials > 0 or 2-tuple + n_trials : int or tuple of shape (2,) Number of different initial sets of observations from which to - run the algorithm. + run the algorithm. This parameter should be a strictly positive + integer. Instead of giving a number of trials to perform, one can provide a list of initial estimates that will be used to iteratively run c_step procedures. In this case: @@ -216,21 +219,24 @@ def select_candidates(X, n_support, n_trials, select=1, n_iter=30, - n_trials[1]: array-like, shape (n_trials, n_features, n_features) is the list of `n_trials` initial covariances estimates - select : int, int > 0 - Number of best candidates results to return. + select : int, default=1 + Number of best candidates results to return. This parameter must be + a strictly positive integer. - n_iter : int, nb_iter > 0 + n_iter : int, default=30 Maximum number of iterations for the c_step procedure. (2 is enough to be close to the final solution. "Never" exceeds 20). + This parameter must be a strictly positive integer. - verbose : boolean, default False + verbose : bool, default False Control the output verbosity. - cov_computation_method : callable, default empirical_covariance + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` The function which will be used to compute the covariance. - Must return shape (n_features, n_features) + Must return an array of shape (n_features, n_features). - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -242,15 +248,15 @@ def select_candidates(X, n_support, n_trials, select=1, n_iter=30, Returns ------- - best_locations : array-like, shape (select, n_features) + best_locations : ndarray of shape (select, n_features) The `select` location estimates computed from the `select` best supports found in the data set (`X`). - best_covariances : array-like, shape (select, n_features, n_features) + best_covariances : ndarray of shape (select, n_features, n_features) The `select` covariance estimates computed from the `select` best supports found in the data set (`X`). - best_supports : array-like, shape (select, n_samples) + best_supports : ndarray of shape (select, n_samples) The `select` best supports found in the data set (`X`). References @@ -258,7 +264,6 @@ def select_candidates(X, n_support, n_trials, select=1, n_iter=30, .. 
[RV] A Fast Algorithm for the Minimum Covariance Determinant Estimator, 1999, American Statistical Association and the American Society for Quality, TECHNOMETRICS - """ random_state = check_random_state(random_state) @@ -312,25 +317,39 @@ def fast_mcd(X, support_fraction=None, Parameters ---------- - X : array-like, shape (n_samples, n_features) - The data matrix, with p features and n samples. + X : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. - support_fraction : float, 0 < support_fraction < 1 - The proportion of points to be included in the support of the raw - MCD estimate. Default is None, which implies that the minimum - value of support_fraction will be used within the algorithm: - `[n_sample + n_features + 1] / 2`. + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. Default is `None`, which implies that the minimum + value of `support_fraction` will be used within the algorithm: + `(n_sample + n_features + 1) / 2`. This parameter must be in the + range (0, 1). - cov_computation_method : callable, default empirical_covariance + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` The function which will be used to compute the covariance. - Must return shape (n_features, n_features) + Must return an array of shape (n_features, n_features). - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + Returns + ------- + location : ndarray of shape (n_features,) + Robust location of the data. + + covariance : ndarray of shape (n_features, n_features) + Robust covariance of the features. + + support : ndarray of shape (n_samples,), dtype=bool + A mask of the observations that have been used to compute + the robust location and covariance estimates of the data set. + Notes ----- The FastMCD algorithm has been introduced by Rousseuw and Van Driessen @@ -356,19 +375,6 @@ def fast_mcd(X, support_fraction=None, .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun, Asymptotics For The Minimum Covariance Determinant Estimator, The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 - - Returns - ------- - location : array-like, shape (n_features,) - Robust location of the data. - - covariance : array-like, shape (n_features, n_features) - Robust covariance of the features. - - support : array-like, type boolean, shape (n_samples,) - A mask of the observations that have been used to compute - the robust location and covariance estimates of the data set. - """ random_state = check_random_state(random_state) @@ -524,10 +530,10 @@ class MinCovDet(EmpiricalCovariance): Parameters ---------- - store_precision : bool + store_precision : bool, default=True Specify if the estimated precision is stored. - assume_centered : bool + assume_centered : bool, default=False If True, the support of the robust location and the covariance estimates is computed, and a covariance estimate is recomputed from it, without centering the data. @@ -536,13 +542,14 @@ class MinCovDet(EmpiricalCovariance): If False, the robust location and covariance are directly computed with the FastMCD algorithm without additional treatment. 
- support_fraction : float, 0 < support_fraction < 1 + support_fraction : float, default=None The proportion of points to be included in the support of the raw MCD estimate. Default is None, which implies that the minimum value of support_fraction will be used within the algorithm: - [n_sample + n_features + 1] / 2 + `(n_sample + n_features + 1) / 2`. The parameter must be in the range + (0, 1). - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -550,32 +557,32 @@ class MinCovDet(EmpiricalCovariance): Attributes ---------- - raw_location_ : array-like, shape (n_features,) + raw_location_ : ndarray of shape (n_features,) The raw robust estimated location before correction and re-weighting. - raw_covariance_ : array-like, shape (n_features, n_features) + raw_covariance_ : ndarray of shape (n_features, n_features) The raw robust estimated covariance before correction and re-weighting. - raw_support_ : array-like, shape (n_samples,) + raw_support_ : ndarray of shape (n_samples,) A mask of the observations that have been used to compute the raw robust estimates of location and shape, before correction and re-weighting. - location_ : array-like, shape (n_features,) - Estimated robust location + location_ : ndarray of shape (n_features,) + Estimated robust location. - covariance_ : array-like, shape (n_features, n_features) - Estimated robust covariance matrix + covariance_ : ndarray of shape (n_features, n_features) + Estimated robust covariance matrix. - precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) - support_ : array-like, shape (n_samples,) + support_ : ndarray of shape (n_samples,) A mask of the observations that have been used to compute the robust estimates of location and shape. - dist_ : array-like, shape (n_samples,) + dist_ : ndarray of shape (n_samples,) Mahalanobis distances of the training set (on which :meth:`fit` is called) observations. @@ -608,7 +615,6 @@ class MinCovDet(EmpiricalCovariance): .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun, Asymptotics For The Minimum Covariance Determinant Estimator, The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 - """ _nonrobust_covariance = staticmethod(empirical_covariance) @@ -625,16 +631,15 @@ def fit(self, X, y=None): Parameters ---------- X : array-like of shape (n_samples, n_features) - Training data, where n_samples is the number of samples - and n_features is the number of features. + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. - y - not used, present for API consistence purpose. + y: Ignored + Not used, present for API consistence purpose. Returns ------- self : object - """ X = check_array(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) @@ -676,23 +681,22 @@ def correct_covariance(self, data): Parameters ---------- - data : array-like, shape (n_samples, n_features) + data : array-like of shape (n_samples, n_features) The data matrix, with p features and n samples. The data set must be the one which was used to compute the raw estimates. 
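To make the robust-fit attributes above concrete, a small sketch with contaminated Gaussian data (the values are arbitrary; only the public ``MinCovDet`` API is assumed)::

    import numpy as np
    from sklearn.covariance import MinCovDet

    rng = np.random.RandomState(42)
    X = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0.6], [0.6, 1]], size=200)
    X[:10] += 8  # a handful of gross outliers
    mcd = MinCovDet(random_state=0).fit(X)
    # support_ flags the observations kept for the robust location/covariance
    print(mcd.support_.sum(), mcd.location_, mcd.covariance_.shape)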
+ Returns + ------- + covariance_corrected : ndarray of shape (n_features, n_features) + Corrected robust covariance estimate. + References ---------- .. [RVD] A Fast Algorithm for the Minimum Covariance Determinant Estimator, 1999, American Statistical Association and the American Society for Quality, TECHNOMETRICS - - Returns - ------- - covariance_corrected : array-like, shape (n_features, n_features) - Corrected robust covariance estimate. - """ # Check that the covariance of the support data is not equal to 0. @@ -717,30 +721,29 @@ def reweight_covariance(self, data): Parameters ---------- - data : array-like, shape (n_samples, n_features) + data : array-like of shape (n_samples, n_features) The data matrix, with p features and n samples. The data set must be the one which was used to compute the raw estimates. - References - ---------- - - .. [RVDriessen] A Fast Algorithm for the Minimum Covariance - Determinant Estimator, 1999, American Statistical Association - and the American Society for Quality, TECHNOMETRICS - Returns ------- - location_reweighted : array-like, shape (n_features, ) + location_reweighted : ndarray of shape (n_features,) Re-weighted robust location estimate. - covariance_reweighted : array-like, shape (n_features, n_features) + covariance_reweighted : ndarray of shape (n_features, n_features) Re-weighted robust covariance estimate. - support_reweighted : array-like, type boolean, shape (n_samples,) + support_reweighted : ndarray of shape (n_samples,), dtype=bool A mask of the observations that have been used to compute the re-weighted robust location and covariance estimates. + References + ---------- + + .. [RVDriessen] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS """ n_samples, n_features = data.shape mask = self.dist_ < chi2(n_features).isf(0.025) diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 9b01d3e7a9041..61907274ad823 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -29,16 +29,16 @@ def shrunk_covariance(emp_cov, shrinkage=0.1): Parameters ---------- - emp_cov : array-like, shape (n_features, n_features) + emp_cov : array-like of shape (n_features, n_features) Covariance matrix to be shrunk - shrinkage : float, 0 <= shrinkage <= 1 + shrinkage : float, default=0.1 Coefficient in the convex combination used for the computation - of the shrunk estimate. + of the shrunk estimate. Range is [0, 1]. Returns ------- - shrunk_cov : array-like + shrunk_cov : ndarray of shape (n_features, n_features) Shrunk covariance. Notes @@ -48,7 +48,6 @@ def shrunk_covariance(emp_cov, shrinkage=0.1): (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) where mu = trace(cov) / n_features - """ emp_cov = check_array(emp_cov) n_features = emp_cov.shape[0] @@ -67,28 +66,28 @@ class ShrunkCovariance(EmpiricalCovariance): Parameters ---------- - store_precision : boolean, default True + store_precision : bool, default=True Specify if the estimated precision is stored - assume_centered : boolean, default False + assume_centered : bool, default=False If True, data will not be centered before computation. Useful when working with data whose mean is almost, but not exactly zero. If False, data will be centered before computation. 
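The convex-combination formula quoted in the Notes above can be verified numerically in a couple of lines (the 2x2 matrix is arbitrary)::

    import numpy as np
    from sklearn.covariance import shrunk_covariance

    emp_cov = np.array([[1.5, 0.4],
                        [0.4, 0.8]])
    shrinkage = 0.1
    mu = np.trace(emp_cov) / emp_cov.shape[0]
    manual = (1 - shrinkage) * emp_cov + shrinkage * mu * np.identity(2)
    # matches the regularized estimate returned by shrunk_covariance
    print(np.allclose(shrunk_covariance(emp_cov, shrinkage=shrinkage), manual))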
- shrinkage : float, 0 <= shrinkage <= 1, default 0.1 + shrinkage : float, default=0.1 Coefficient in the convex combination used for the computation - of the shrunk estimate. + of the shrunk estimate. Range is [0, 1]. Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated location, i.e. the estimated mean. - covariance_ : array-like, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix - precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) @@ -117,7 +116,6 @@ class ShrunkCovariance(EmpiricalCovariance): (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) where mu = trace(cov) / n_features - """ def __init__(self, store_precision=True, assume_centered=False, shrinkage=0.1): @@ -126,8 +124,8 @@ def __init__(self, store_precision=True, assume_centered=False, self.shrinkage = shrinkage def fit(self, X, y=None): - """ Fits the shrunk covariance model - according to the given training data and parameters. + """Fit the shrunk covariance model according to the given training data + and parameters. Parameters ---------- @@ -135,13 +133,12 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. - y + y: Ignored not used, present for API consistence purpose. Returns ------- self : object - """ X = check_array(X) # Not calling the parent object to fit, to avoid a potential @@ -167,16 +164,16 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage. - assume_centered : bool + assume_centered : bool, default=False If True, data will not be centered before computation. Useful to work with data whose mean is significantly equal to zero but is not exactly zero. If False, data will be centered before computation. - block_size : int + block_size : int, default=1000 Size of the blocks into which the covariance matrix will be split. Returns @@ -192,7 +189,6 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) where mu = trace(cov) / n_features - """ X = np.asarray(X) # for only one feature, the result is the same whatever the shrinkage @@ -262,10 +258,10 @@ def ledoit_wolf(X, assume_centered=False, block_size=1000): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data from which to compute the covariance estimate - assume_centered : boolean, default=False + assume_centered : bool, default=False If True, data will not be centered before computation. Useful to work with data whose mean is significantly equal to zero but is not exactly zero. @@ -277,7 +273,7 @@ def ledoit_wolf(X, assume_centered=False, block_size=1000): Returns ------- - shrunk_cov : array-like, shape (n_features, n_features) + shrunk_cov : ndarray of shape (n_features, n_features) Shrunk covariance. 
shrinkage : float @@ -291,7 +287,6 @@ def ledoit_wolf(X, assume_centered=False, block_size=1000): (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) where mu = trace(cov) / n_features - """ X = np.asarray(X) # for only one feature, the result is the same whatever the shrinkage @@ -347,19 +342,19 @@ class LedoitWolf(EmpiricalCovariance): Attributes ---------- - location_ : array-like, shape (n_features,) + location_ : ndarray of shape (n_features,) Estimated location, i.e. the estimated mean. - covariance_ : array-like, shape (n_features, n_features) - Estimated covariance matrix + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. - precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) - shrinkage_ : float, 0 <= shrinkage <= 1 + shrinkage_ : float Coefficient in the convex combination used for the computation - of the shrunk estimate. + of the shrunk estimate. Range is [0, 1]. Examples -------- @@ -392,7 +387,6 @@ class LedoitWolf(EmpiricalCovariance): "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices", Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. - """ def __init__(self, store_precision=True, assume_centered=False, block_size=1000): @@ -401,21 +395,20 @@ def __init__(self, store_precision=True, assume_centered=False, self.block_size = block_size def fit(self, X, y=None): - """ Fits the Ledoit-Wolf shrunk covariance model - according to the given training data and parameters. + """Fit the Ledoit-Wolf shrunk covariance model according to the given + training data and parameters. Parameters ---------- X : array-like of shape (n_samples, n_features) - Training data, where n_samples is the number of samples - and n_features is the number of features. - y + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored not used, present for API consistence purpose. Returns ------- self : object - """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) @@ -440,10 +433,10 @@ def oas(X, assume_centered=False): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Data from which to compute the covariance estimate. - assume_centered : boolean + assume_centered : bool, default=False If True, data will not be centered before computation. Useful to work with data whose mean is significantly equal to zero but is not exactly zero. @@ -451,7 +444,7 @@ def oas(X, assume_centered=False): Returns ------- - shrunk_cov : array-like, shape (n_features, n_features) + shrunk_cov : array-like of shape (n_features, n_features) Shrunk covariance. shrinkage : float @@ -468,7 +461,6 @@ def oas(X, assume_centered=False): The formula we used to implement the OAS is slightly modified compared to the one given in the article. See :class:`OAS` for more details. - """ X = np.asarray(X) # for only one feature, the result is the same whatever the shrinkage @@ -528,16 +520,16 @@ class OAS(EmpiricalCovariance): Attributes ---------- - covariance_ : array-like, shape (n_features, n_features) + covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix. 
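Both shrinkage estimators discussed here expose the selected coefficient after fitting; a minimal sketch on random data (only the public ``LedoitWolf`` and ``OAS`` classes are assumed)::

    import numpy as np
    from sklearn.covariance import LedoitWolf, OAS

    rng = np.random.RandomState(0)
    X = rng.normal(size=(60, 5))
    lw = LedoitWolf().fit(X)
    oa = OAS().fit(X)
    # each estimator picks its own shrinkage coefficient in [0, 1]
    print(lw.shrinkage_, oa.shrinkage_)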
- precision_ : array-like, shape (n_features, n_features) + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) - shrinkage_ : float, 0 <= shrinkage <= 1 + shrinkage_ : float coefficient in the convex combination used for the computation - of the shrunk estimate. + of the shrunk estimate. Range is [0, 1]. Notes ----- @@ -552,25 +544,23 @@ class OAS(EmpiricalCovariance): ---------- "Shrinkage Algorithms for MMSE Covariance Estimation" Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010. - """ def fit(self, X, y=None): - """ Fits the Oracle Approximating Shrinkage covariance model + """Fit the Oracle Approximating Shrinkage covariance model according to the given training data and parameters. Parameters ---------- X : array-like of shape (n_samples, n_features) - Training data, where n_samples is the number of samples - and n_features is the number of features. - y + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored not used, present for API consistence purpose. Returns ------- self : object - """ X = check_array(X) # Not calling the parent object to fit, to avoid computing the From d7c375869ada53040f035f6fb0eb3b8d2d5dfff2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 15 Jan 2020 13:38:44 -0500 Subject: [PATCH 224/448] [MRG] BUG Clips log_loss calculation in neural_network (#16117) --- doc/whats_new/v0.23.rst | 7 ++++++ sklearn/neural_network/_base.py | 6 +++++- sklearn/neural_network/tests/test_base.py | 26 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 sklearn/neural_network/tests/test_base.py diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 352f19cfe3a73..852cc360a702c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -145,6 +145,13 @@ Changelog :pr:`16076` by :user:`Guillaume Lemaitre ` and :user:`Alex Shacked `. +:mod:`sklearn.neural_network` +............................. + +- |Fix| Increases the numerical stability of the logistic loss function in + :class:`neural_network.MLPClassifier` by clipping the probabilities. + :pr:`16117` by `Thomas Fan`_. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/neural_network/_base.py b/sklearn/neural_network/_base.py index c29f6bbb161cb..466c082ed8745 100644 --- a/sklearn/neural_network/_base.py +++ b/sklearn/neural_network/_base.py @@ -212,6 +212,8 @@ def log_loss(y_true, y_prob): loss : float The degree to which the samples are correctly predicted. """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) if y_prob.shape[1] == 1: y_prob = np.append(1 - y_prob, y_prob, axis=1) @@ -232,7 +234,7 @@ def binary_log_loss(y_true, y_prob): y_true : array-like or label indicator matrix Ground truth (correct) labels. - y_prob : array-like of float, shape = (n_samples, n_classes) + y_prob : array-like of float, shape = (n_samples, 1) Predicted probabilities, as returned by a classifier's predict_proba method. @@ -241,6 +243,8 @@ def binary_log_loss(y_true, y_prob): loss : float The degree to which the samples are correctly predicted. 
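The clipping added by this patch guards against predicted probabilities of exactly 0 or 1; its effect can be reproduced with a toy sketch using only numpy and scipy (the arrays below are made up)::

    import numpy as np
    from scipy.special import xlogy

    y_true = np.array([[0.], [1.]])
    y_prob = np.array([[1.0], [0.5]])  # first prediction is wrong with full confidence

    # without clipping, xlogy(1 - y_true, 1 - y_prob) evaluates log(0) -> -inf
    raw = -(xlogy(y_true, y_prob) +
            xlogy(1 - y_true, 1 - y_prob)).sum() / y_prob.shape[0]

    eps = np.finfo(y_prob.dtype).eps
    y_safe = np.clip(y_prob, eps, 1 - eps)
    safe = -(xlogy(y_true, y_safe) +
             xlogy(1 - y_true, 1 - y_safe)).sum() / y_prob.shape[0]
    print(np.isfinite(raw), np.isfinite(safe))  # False True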
""" + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) return -(xlogy(y_true, y_prob) + xlogy(1 - y_true, 1 - y_prob)).sum() / y_prob.shape[0] diff --git a/sklearn/neural_network/tests/test_base.py b/sklearn/neural_network/tests/test_base.py new file mode 100644 index 0000000000000..c803efe561faa --- /dev/null +++ b/sklearn/neural_network/tests/test_base.py @@ -0,0 +1,26 @@ +import pytest +import numpy as np + +from sklearn.neural_network._base import binary_log_loss +from sklearn.neural_network._base import log_loss + + +def test_binary_log_loss_1_prob_finite(): + # y_proba is equal to one should result in a finite logloss + y_true = np.array([[0, 0, 1]]).T + y_prob = np.array([[0.9, 1.0, 1.0]]).T + + loss = binary_log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +@pytest.mark.parametrize("y_true, y_prob", [ + (np.array([[1, 0, 0], [0, 1, 0]]), + np.array([[0., 1., 0.], [0.9, 0.05, 0.05]])), + (np.array([[0, 0, 1]]).T, + np.array([[0.9, 1.0, 1.0]]).T), +]) +def test_log_loss_1_prob_finite(y_true, y_prob): + # y_proba is equal to 1 should result in a finite logloss + loss = log_loss(y_true, y_prob) + assert np.isfinite(loss) From da6065b088e03a09964a51f14ff8b069a3873cc7 Mon Sep 17 00:00:00 2001 From: Madhura Jayaratne Date: Fri, 17 Jan 2020 20:12:24 +1100 Subject: [PATCH 225/448] [MRG] Fix documentation of the tol parameter of kmeans (#16125) --- sklearn/cluster/_kmeans.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 92dae7a4d7726..7ca1db87e0035 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -236,7 +236,9 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', Verbosity mode. tol : float, optional - The relative increment in the results before declaring convergence. + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use @@ -681,7 +683,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): single run. tol : float, default=1e-4 - Relative tolerance with regards to inertia to declare convergence. + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. precompute_distances : 'auto' or bool, default='auto' Precompute distances (faster but takes more memory). 
From 856d273b9dd100cf478f86579333a2b2d78c2d76 Mon Sep 17 00:00:00 2001 From: trimeta Date: Fri, 17 Jan 2020 08:37:26 -0600 Subject: [PATCH 226/448] FIX Expose SelectorMixin through sklearn/feature_selection/ (#16132) --- doc/modules/classes.rst | 1 + sklearn/feature_selection/__init__.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f8e5195cc9174..f81941402b2dc 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -33,6 +33,7 @@ Base classes base.DensityMixin base.RegressorMixin base.TransformerMixin + feature_selection.SelectorMixin Functions --------- diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index f8bda21a5813d..e9fa9ada1a5e4 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -24,6 +24,8 @@ from ._mutual_info import mutual_info_regression, mutual_info_classif +from ._base import SelectorMixin + __all__ = ['GenericUnivariateSelect', 'RFE', @@ -40,4 +42,5 @@ 'f_oneway', 'f_regression', 'mutual_info_classif', - 'mutual_info_regression'] + 'mutual_info_regression', + 'SelectorMixin'] From 882a675b89f2bd5b267db34a8fef0406186d0f5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 17 Jan 2020 18:28:33 +0100 Subject: [PATCH 227/448] MNT remove tag help wanted in doc template (#16122) --- .github/ISSUE_TEMPLATE/doc_improvement.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md index 0cbf3fdb4963c..4c2906bb18418 100644 --- a/.github/ISSUE_TEMPLATE/doc_improvement.md +++ b/.github/ISSUE_TEMPLATE/doc_improvement.md @@ -2,7 +2,7 @@ name: Documentation improvement about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. 
title: '' -labels: Documentation, help wanted +labels: Documentation assignees: '' --- From 00fe3d6944f91d52b24d0f59cc9a4dd83be99bcf Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Jan 2020 02:47:54 -0800 Subject: [PATCH 228/448] Validate sample weight with check_sample weight in kernel_ridge (#16154) --- sklearn/kernel_ridge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d2ae51f466f0b..dd7cc73124235 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -9,8 +9,8 @@ from .base import BaseEstimator, RegressorMixin, MultiOutputMixin from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel -from .utils import check_array, check_X_y -from .utils.validation import check_is_fitted +from .utils import check_X_y +from .utils.validation import check_is_fitted, _check_sample_weight class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): @@ -151,7 +151,7 @@ def fit(self, X, y=None, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = _check_sample_weight(sample_weight, X) K = self._get_kernel(X) alpha = np.atleast_1d(self.alpha) From 2f992722fd4bda6d31d7dabfa1f5f55261b241e5 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Tue, 21 Jan 2020 22:26:22 +0530 Subject: [PATCH 229/448] DOC Documentation Improvement in _stochastic_optimizers.py (#16166) * doc improv * Update param doc for SGDOptimizer in _stochastic_optimizers.py Co-Authored-By: Adrin Jalali * rmv linting error Co-authored-by: Adrin Jalali --- sklearn/neural_network/_stochastic_optimizers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 02fc53a7aecc2..2da9c0b278e71 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -106,6 +106,10 @@ class SGDOptimizer(BaseOptimizer): nesterov : bool, default=True Whether to use nesterov's momentum or not. Use nesterov's if True + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. + Attributes ---------- learning_rate : float @@ -192,7 +196,7 @@ class AdamOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, default=0.1 + learning_rate_init : float, default=0.001 The initial learning rate used. It controls the step-size in updating the weights From a89d4a5457bae1de722fe42caa286f52e1faac2f Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 21 Jan 2020 20:19:06 +0100 Subject: [PATCH 230/448] DOC add clear default values to estimator tags (#16168) --- doc/developers/develop.rst | 78 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 2211ab153197a..a14d880921a03 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -468,50 +468,52 @@ Estimator Tags The estimator tags are experimental and the API is subject to change. -Scikit-learn introduced estimator tags in version 0.21. 
These are annotations +Scikit-learn introduced estimator tags in version 0.21. These are annotations of estimators that allow programmatic inspection of their capabilities, such as -sparse matrix support, supported output types and supported methods. The -estimator tags are a dictionary returned by the method ``_get_tags()``. These -tags are used by the common tests and the :func:`sklearn.utils.estimator_checks.check_estimator` function to -decide what tests to run and what input data is appropriate. Tags can depend on -estimator parameters or even system architecture and can in general only be -determined at runtime. - -The default value of all tags except for ``X_types`` and ``requires_fit`` is -``False``. These are defined in the ``BaseEstimator`` class. +sparse matrix support, supported output types and supported methods. The +estimator tags are a dictionary returned by the method ``_get_tags()``. These +tags are used by the common tests and the +:func:`sklearn.utils.estimator_checks.check_estimator` function to decide what +tests to run and what input data is appropriate. Tags can depend on estimator +parameters or even system architecture and can in general only be determined at +runtime. The default values for the estimator tags are defined in the +``BaseEstimator`` class. The current set of estimator tags are: -non_deterministic +non_deterministic (default=``False``) whether the estimator is not deterministic given a fixed ``random_state`` -requires_positive_X +requires_positive_X (default=``False``) whether the estimator requires positive X. -requires_positive_y +requires_positive_y (default=``False``) whether the estimator requires a positive y (only applicable for regression). -no_validation - whether the estimator skips input-validation. This is only meant for stateless and dummy transformers! +no_validation (default=``False``) + whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! -multioutput - unused for now - whether a regressor supports multi-target outputs or a classifier supports multi-class multi-output. +multioutput - unused for now (default=``False``) + whether a regressor supports multi-target outputs or a classifier supports + multi-class multi-output. -multilabel +multilabel (default=``False``) whether the estimator supports multilabel output -stateless - whether the estimator needs access to data for fitting. Even though - an estimator is stateless, it might still need a call to ``fit`` for initialization. +stateless (default=``False``) + whether the estimator needs access to data for fitting. Even though an + estimator is stateless, it might still need a call to ``fit`` for + initialization. -requires_fit +requires_fit (default=``True``) whether the estimator requires to be fitted before calling one of `transform`, `predict`, `predict_proba`, or `decision_function`. -allow_nan +allow_nan (default=``False``) whether the estimator supports data with missing values encoded as np.NaN -poor_score +poor_score (default=``False``) whether the estimator fails to provide a "reasonable" test-set score, which currently for regression is an R2 of 0.5 on a subset of the boston housing dataset, and for classification an accuracy of 0.83 on @@ -519,24 +521,26 @@ poor_score are based on current estimators in sklearn and might be replaced by something more systematic. -multioutput_only +multioutput_only (default=``False``) whether estimator supports only multi-output classification or regression. 
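For a concrete feel of how these defaults interact with an estimator, a short sketch using the ``_more_tags`` hook on ``BaseEstimator`` (the estimator below is hypothetical)::

    from sklearn.base import BaseEstimator

    class NaNTolerantEstimator(BaseEstimator):
        def _more_tags(self):
            # override only the defaults that differ for this estimator
            return {'allow_nan': True}

    tags = NaNTolerantEstimator()._get_tags()
    print(tags['allow_nan'], tags['requires_fit'])  # True True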
-binary_only +binary_only (default=``False``) whether estimator supports binary classification but lacks multi-class classification support. -_skip_test - whether to skip common tests entirely. Don't use this unless you have a *very good* reason. - -X_types - Supported input types for X as list of strings. Tests are currently only run if '2darray' is contained - in the list, signifying that the estimator takes continuous 2d numpy arrays as input. The default - value is ['2darray']. Other possible types are ``'string'``, ``'sparse'``, - ``'categorical'``, ``dict``, ``'1dlabels'`` and ``'2dlabels'``. - The goal is that in the future the supported input type will determine the - data used during testing, in particular for ``'string'``, ``'sparse'`` and - ``'categorical'`` data. For now, the test for sparse data do not make use +_skip_test (default=``False``) + whether to skip common tests entirely. Don't use this unless you have a + *very good* reason. + +X_types (default=``['2darray']``) + Supported input types for X as list of strings. Tests are currently only + run if '2darray' is contained in the list, signifying that the estimator + takes continuous 2d numpy arrays as input. The default value is + ['2darray']. Other possible types are ``'string'``, ``'sparse'``, + ``'categorical'``, ``dict``, ``'1dlabels'`` and ``'2dlabels'``. The goal is + that in the future the supported input type will determine the data used + during testing, in particular for ``'string'``, ``'sparse'`` and + ``'categorical'`` data. For now, the test for sparse data do not make use of the ``'sparse'`` tag. From 4f32360737c87eb53941f47a1695c6c809a41931 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Jan 2020 22:19:52 -0500 Subject: [PATCH 231/448] DOC better doc for calibration_curve (#16172) --- doc/modules/calibration.rst | 3 ++- sklearn/calibration.py | 41 ++++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index 6fe30c93ff142..70286521e09c1 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -20,7 +20,8 @@ of the predict_proba method can be directly interpreted as a confidence level. For instance, a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approximately 80% actually belong to the positive class. The following plot compares -how well the probabilistic predictions of different classifiers are calibrated: +how well the probabilistic predictions of different classifiers are calibrated, +using :func:`calibration_curve`: .. figure:: ../auto_examples/calibration/images/sphx_glr_plot_compare_calibration_001.png :target: ../auto_examples/calibration/plot_compare_calibration.html diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 4c8a81a2137ec..ac802a39c9058 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -523,7 +523,8 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, strategy='uniform'): """Compute true and predicted probabilities for a calibration curve. - The method assumes the inputs come from a binary classifier. + The method assumes the inputs come from a binary classifier, and + discretize the [0, 1] interval into bins. Calibration curves may also be referred to as reliability diagrams. 
@@ -531,36 +532,38 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, Parameters ---------- - y_true : array, shape (n_samples,) + y_true : array-like of shape (n_samples,) True targets. - y_prob : array, shape (n_samples,) + y_prob : array-like of shape (n_samples,) Probabilities of the positive class. - normalize : bool, optional, default=False - Whether y_prob needs to be normalized into the bin [0, 1], i.e. is not - a proper probability. If True, the smallest value in y_prob is mapped - onto 0 and the largest one onto 1. + normalize : bool, default=False + Whether y_prob needs to be normalized into the [0, 1] interval, i.e. + is not a proper probability. If True, the smallest value in y_prob + is linearly mapped onto 0 and the largest one onto 1. - n_bins : int - Number of bins. A bigger number requires more data. Bins with no data - points (i.e. without corresponding values in y_prob) will not be - returned, thus there may be fewer than n_bins in the return value. + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval. A bigger number + requires more data. Bins with no samples (i.e. without + corresponding values in `y_prob`) will not be returned, thus the + returned arrays may have less than `n_bins` values. - strategy : {'uniform', 'quantile'}, (default='uniform') + strategy : {'uniform', 'quantile'}, default='uniform' Strategy used to define the widths of the bins. uniform - All bins have identical widths. + The bins have identical widths. quantile - All bins have the same number of points. + The bins have the same number of samples and depend on `y_prob`. Returns ------- - prob_true : array, shape (n_bins,) or smaller - The true probability in each bin (fraction of positives). + prob_true : ndarray of shape (n_bins,) or smaller + The proportion of samples whose class is the positive class, in each + bin (fraction of positives). - prob_pred : array, shape (n_bins,) or smaller + prob_pred : ndarray of shape (n_bins,) or smaller The mean predicted probability in each bin. 
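A minimal illustration of the binning described above (the labels and probabilities are toy values; only the public ``calibration_curve`` signature is assumed)::

    import numpy as np
    from sklearn.calibration import calibration_curve

    y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    y_prob = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9])
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=3)
    # empty bins are dropped, so both arrays can be shorter than n_bins
    print(prob_true, prob_pred)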
References @@ -603,7 +606,7 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, bin_total = np.bincount(binids, minlength=len(bins)) nonzero = bin_total != 0 - prob_true = (bin_true[nonzero] / bin_total[nonzero]) - prob_pred = (bin_sums[nonzero] / bin_total[nonzero]) + prob_true = bin_true[nonzero] / bin_total[nonzero] + prob_pred = bin_sums[nonzero] / bin_total[nonzero] return prob_true, prob_pred From 76314fbe47fc47d841e22ebf2f9e8b26431b2ed2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 Jan 2020 02:52:53 -0500 Subject: [PATCH 232/448] MNT Small refactoing of CalibratedClassifier (#16174) --- sklearn/calibration.py | 46 +++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index ac802a39c9058..d4f6fea8c93b5 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -159,43 +159,35 @@ def fit(self, X, y, sample_weight=None): if self.cv == "prefit": calibrated_classifier = _CalibratedClassifier( base_estimator, method=self.method) - if sample_weight is not None: - calibrated_classifier.fit(X, y, sample_weight) - else: - calibrated_classifier.fit(X, y) + calibrated_classifier.fit(X, y, sample_weight) self.calibrated_classifiers_.append(calibrated_classifier) else: cv = check_cv(self.cv, y, classifier=True) fit_parameters = signature(base_estimator.fit).parameters - estimator_name = type(base_estimator).__name__ - if (sample_weight is not None - and "sample_weight" not in fit_parameters): - warnings.warn("%s does not support sample_weight. Samples" - " weights are only used for the calibration" - " itself." % estimator_name) - sample_weight = check_array(sample_weight, ensure_2d=False) - base_estimator_sample_weight = None - else: - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - base_estimator_sample_weight = sample_weight + base_estimator_supports_sw = "sample_weight" in fit_parameters + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if not base_estimator_supports_sw: + estimator_name = type(base_estimator).__name__ + warnings.warn("Since %s does not support sample_weights, " + "sample weights will only be used for the " + "calibration itself." 
% estimator_name) + for train, test in cv.split(X, y): this_estimator = clone(base_estimator) - if base_estimator_sample_weight is not None: - this_estimator.fit( - X[train], y[train], - sample_weight=base_estimator_sample_weight[train]) + + if sample_weight is not None and base_estimator_supports_sw: + this_estimator.fit(X[train], y[train], + sample_weight=sample_weight[train]) else: this_estimator.fit(X[train], y[train]) calibrated_classifier = _CalibratedClassifier( - this_estimator, method=self.method, - classes=self.classes_) - if sample_weight is not None: - calibrated_classifier.fit(X[test], y[test], - sample_weight[test]) - else: - calibrated_classifier.fit(X[test], y[test]) + this_estimator, method=self.method, classes=self.classes_) + sw = None if sample_weight is None else sample_weight[test] + calibrated_classifier.fit(X[test], y[test], sample_weight=sw) self.calibrated_classifiers_.append(calibrated_classifier) return self From 1e8ea9f22b0a72ee417df60cf187ea00e204eb73 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 22 Jan 2020 10:13:15 +0100 Subject: [PATCH 233/448] DOC adding CoC to the repo (#16173) --- CODE_OF_CONDUCT.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..f99ec64342af9 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,16 @@ +# Code of Conduct + +We are a community based on openness and friendly, didactic, discussions. + +We aspire to treat everybody equally, and value their contributions. + +Decisions are made based on technical merit and consensus. + +Code is not the only way to help the project. Reviewing pull requests, +answering questions to help others on mailing lists or issues, organizing and +teaching tutorials, working on the website, improving the documentation, are +all priceless contributions. + +We abide by the principles of openness, respect, and consideration of others of +the Python Software Foundation: https://www.python.org/psf/codeofconduct/ + From f6a19a7e8d31305d55d0e9ac449c5158635bf08f Mon Sep 17 00:00:00 2001 From: indecisiveuser <47398675+indecisiveuser@users.noreply.github.com> Date: Wed, 22 Jan 2020 03:22:57 -0800 Subject: [PATCH 234/448] DOC Format docstrings in validation module (#16134) --- sklearn/model_selection/_validation.py | 241 +++++++++++++------------ 1 file changed, 122 insertions(+), 119 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 6bbe6e0c5ce95..9e27ee28005dc 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -24,7 +24,7 @@ from ..utils import (indexable, check_random_state, _safe_indexing, _message_with_time) from ..utils.validation import _check_fit_params -from ..utils.validation import _is_arraylike, _num_samples +from ..utils.validation import _num_samples from ..utils.metaestimators import _safe_split from ..metrics import check_scoring from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer @@ -50,20 +50,21 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, estimator : estimator object implementing 'fit' The object to use to fit the data. - X : array-like + X : array-like of shape (n_samples, n_features) The data to fit. Can be for example a list, or an array. 
- y : array-like, optional, default: None + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None The target variable to try to predict in the case of supervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). - scoring : string, callable, list/tuple, dict or None, default: None - A single string (see :ref:`scoring_parameter`) or a callable + scoring : str, callable, list/tuple, or dict, default=None + A single str (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. For evaluating multiple metrics, either give a list of (unique) strings @@ -77,16 +78,16 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, If None, the estimator's score method is used. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -96,19 +97,19 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : integer, optional + verbose : int, default=0 The verbosity level. - fit_params : dict, optional + fit_params : dict, default=None Parameters to pass to the fit method of the estimator. - pre_dispatch : int, or string, optional + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched @@ -122,10 +123,10 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, - An int, giving the exact number of total jobs that are spawned - - A string, giving an expression as a function of n_jobs, + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' - return_train_score : boolean, default=False + return_train_score : bool, default=False Whether to include train scores. Computing training scores is used to get insights on how different parameter settings impact the overfitting/underfitting trade-off. @@ -133,7 +134,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, expensive and is not strictly required to select the parameters that yield the best generalization performance. 
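For reference, a short sketch of multi-metric usage with the parameters documented above (the dataset and estimator are chosen arbitrarily)::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    X, y = load_iris(return_X_y=True)
    cv_results = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5,
                                scoring=('accuracy', 'f1_macro'),
                                return_train_score=True)
    # one train/test entry per requested metric, plus fit_time and score_time
    print(sorted(cv_results))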
- return_estimator : boolean, default False + return_estimator : bool, default=False Whether to return the estimators fitted on each split. error_score : 'raise' or numeric @@ -272,20 +273,21 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, estimator : estimator object implementing 'fit' The object to use to fit the data. - X : array-like + X : array-like of shape (n_samples, n_features) The data to fit. Can be for example a list, or an array. - y : array-like, optional, default: None + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None The target variable to try to predict in the case of supervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or + scoring : str or callable, default=None + A str (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)`` which should return only a single value. @@ -295,16 +297,16 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, If None, the estimator's default scorer (if available) is used. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -314,19 +316,19 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : integer, optional + verbose : int, default=0 The verbosity level. - fit_params : dict, optional + fit_params : dict, default=None Parameters to pass to the fit method of the estimator. - pre_dispatch : int, or string, optional + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. 
Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched @@ -340,10 +342,10 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, - An int, giving the exact number of total jobs that are spawned - - A string, giving an expression as a function of n_jobs, + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter @@ -403,10 +405,10 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, estimator : estimator object implementing 'fit' The object to use to fit the data. - X : array-like of shape at least 2D + X : array-like of shape (n_samples, n_features) The data to fit. - y : array-like, optional, default: None + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. @@ -420,16 +422,16 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, The callable object / fn should have signature ``scorer(estimator, X, y)``. - train : array-like, shape (n_train_samples,) + train : array-like of shape (n_train_samples,) Indices of training samples. - test : array-like, shape (n_test_samples,) + test : array-like of shape (n_test_samples,) Indices of test samples. - verbose : integer + verbose : int The verbosity level. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter @@ -441,28 +443,28 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, fit_params : dict or None Parameters that will be passed to ``estimator.fit``. - return_train_score : boolean, optional, default: False + return_train_score : bool, default=False Compute and return score on training set. - return_parameters : boolean, optional, default: False + return_parameters : bool, default=False Return parameters that has been used for the estimator. - return_n_test_samples : boolean, optional, default: False + return_n_test_samples : bool, default=False Whether to return the ``n_test_samples`` - return_times : boolean, optional, default: False + return_times : bool, default=False Whether to return the fit/score times. - return_estimator : boolean, optional, default: False + return_estimator : bool, default=False Whether to return the fitted estimator. Returns ------- - train_scores : dict of scorer name -> float, optional + train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. - test_scores : dict of scorer name -> float, optional + test_scores : dict of scorer name -> float Score on testing set (for all the scorers). n_test_samples : int @@ -474,7 +476,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, score_time : float Time spent for scoring in seconds. - parameters : dict or None, optional + parameters : dict or None The parameters that have been evaluated. 
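A minimal ``cross_val_score`` sketch along the lines documented above; the SVC
estimator and iris data are placeholders, and the returned array holds one score
per fold::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    scores = cross_val_score(SVC(kernel='linear'), X, y, scoring='accuracy', cv=5)
    print(scores.shape)                 # (5,), one score per fold
    print(scores.mean(), scores.std())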
estimator : estimator object @@ -632,28 +634,29 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, estimator : estimator object implementing 'fit' and 'predict' The object to use to fit the data. - X : array-like + X : array-like of shape (n_samples, n_features) The data to fit. Can be, for example a list, or an array at least 2d. - y : array-like, optional, default: None + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None The target variable to try to predict in the case of supervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -663,19 +666,19 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : integer, optional + verbose : int, default=0 The verbosity level. - fit_params : dict, optional + fit_params : dict, defualt=None Parameters to pass to the fit method of the estimator. - pre_dispatch : int, or string, optional + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched @@ -689,10 +692,10 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, - An int, giving the exact number of total jobs that are spawned - - A string, giving an expression as a function of n_jobs, + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' - method : string, optional, default: 'predict' + method : str, default='predict' Invokes the passed method name of the passed estimator. For method='predict_proba', the columns correspond to the classes in sorted order. @@ -798,26 +801,26 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator : estimator object implementing 'fit' and 'predict' The object to use to fit the data. - X : array-like of shape at least 2D + X : array-like of shape (n_samples, n_features) The data to fit. - y : array-like, optional, default: None + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. 
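A minimal ``cross_val_predict`` sketch exercising the ``method`` parameter
documented above; the estimator and data are placeholders::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    X, y = load_iris(return_X_y=True)
    proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                              cv=5, method='predict_proba')
    print(proba.shape)  # (150, 3); columns follow the sorted class order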
- train : array-like, shape (n_train_samples,) + train : array-like of shape (n_train_samples,) Indices of training samples. - test : array-like, shape (n_test_samples,) + test : array-like of shape (n_test_samples,) Indices of test samples. - verbose : integer + verbose : int The verbosity level. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. - method : string + method : str Invokes the passed method name of the passed estimator. Returns @@ -866,7 +869,7 @@ def _enforce_prediction_order(classes, predictions, n_classes, method): not present in the subset of data used for training, then the output prediction array might not have the same columns as other folds. Use the list of class names - (assumed to be integers) to enforce the correct column order. + (assumed to be ints) to enforce the correct column order. Note that `classes` is the list of classes in this fold (a subset of the classes in the full training set) @@ -922,7 +925,7 @@ def _check_is_permutation(indices, n_samples): Parameters ---------- indices : ndarray - integer array to test + int array to test n_samples : int number of expected elements @@ -955,11 +958,11 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, X : array-like of shape at least 2D The data to fit. - y : array-like + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Labels to constrain permutation within groups, i.e. ``y`` values are permuted among samples with the same group identifier. When not specified, ``y`` values are permuted among all samples. @@ -969,22 +972,22 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, cross-validator uses them for grouping the samples while splitting the dataset into train/test set. - scoring : string, callable or None, optional, default: None - A single string (see :ref:`scoring_parameter`) or a callable + scoring : str or callable, default=None + A single str (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. If None the estimator's score method is used. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -994,21 +997,21 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - n_permutations : integer, optional + n_permutations : int, default=100 Number of times to permute ``y``. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, default=None + random_state : int, RandomState instance or None, default=0 Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. - verbose : integer, optional + verbose : int, default=0 The verbosity level. Returns @@ -1016,7 +1019,7 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, score : float The true score without permuting targets. - permutation_scores : array, shape (n_permutations,) + permutation_scores : array of shape (n_permutations,) The scores obtained for each permutations. pvalue : float @@ -1103,20 +1106,21 @@ def learning_curve(estimator, X, y, groups=None, estimator : object type that implements the "fit" and "predict" methods An object of that type which is cloned for each validation. - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples) or (n_samples, n_features), optional + y : array-like of shape (n_samples,) or (n_samples, n_outputs) Target relative to X for classification or regression; None for unsupervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). - train_sizes : array-like, shape (n_ticks,), dtype float or int + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of the maximum size of the training set (that is determined @@ -1124,18 +1128,17 @@ def learning_curve(estimator, X, y, groups=None, Otherwise it is interpreted as absolute sizes of the training sets. Note that for classification the number of samples usually have to be big enough to contain at least one sample from each class. - (default: np.linspace(0.1, 1.0, 5)) - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -1145,66 +1148,66 @@ def learning_curve(estimator, X, y, groups=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or + scoring : str or callable, default=None + A str (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. 
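A minimal sketch of ``permutation_test_score`` showing its three return values;
the estimator, data and number of permutations are arbitrary placeholders::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import permutation_test_score
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    score, perm_scores, pvalue = permutation_test_score(
        SVC(kernel='linear'), X, y, cv=5, n_permutations=100, random_state=0)
    print(perm_scores.shape)  # (100,), one score per permutation of y
    print(score, pvalue)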
- exploit_incremental_learning : boolean, optional, default: False + exploit_incremental_learning : bool, default=False If the estimator supports incremental learning, this will be used to speed up fitting for different training set sizes. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - pre_dispatch : integer or string, optional + pre_dispatch : int or str, default='all' Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can + all). The option can reduce the allocated memory. The str can be an expression like '2*n_jobs'. - verbose : integer, optional + verbose : int, default=0 Controls the verbosity: the higher, the more messages. - shuffle : boolean, optional + shuffle : bool, default=False Whether to shuffle training data before taking prefixes of it based on``train_sizes``. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Used when ``shuffle`` is True. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. - return_times : boolean, optional (default: False) + return_times : bool, default=False Whether to return the fit and score times. Returns ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int + train_sizes_abs : array of shape (n_unique_ticks,) Numbers of training examples that has been used to generate the learning curve. Note that the number of ticks might be less than n_ticks because duplicate entries will be removed. - train_scores : array, shape (n_ticks, n_cv_folds) + train_scores : array of shape (n_ticks, n_cv_folds) Scores on training sets. - test_scores : array, shape (n_ticks, n_cv_folds) + test_scores : array of shape (n_ticks, n_cv_folds) Scores on test set. - fit_times : array, shape (n_ticks, n_cv_folds) + fit_times : array of shape (n_ticks, n_cv_folds) Times spent for fitting in seconds. Only present if ``return_times`` is True. - score_times : array, shape (n_ticks, n_cv_folds) + score_times : array of shape (n_ticks, n_cv_folds) Times spent for scoring in seconds. Only present if ``return_times`` is True. @@ -1281,7 +1284,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): Parameters ---------- - train_sizes : array-like, shape (n_ticks,), dtype float or int + train_sizes : array-like of shape (n_ticks,) Numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. @@ -1291,7 +1294,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): Returns ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int + train_sizes_abs : array of shape (n_unique_ticks,) Numbers of training examples that will be used to generate the learning curve. Note that the number of ticks might be less than n_ticks because duplicate entries will be removed. 
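A minimal ``learning_curve`` sketch with explicit ``train_sizes`` and shuffling;
the naive Bayes estimator and digits data are placeholders::

    import numpy as np
    from sklearn.datasets import load_digits
    from sklearn.model_selection import learning_curve
    from sklearn.naive_bayes import GaussianNB

    X, y = load_digits(return_X_y=True)
    sizes, train_scores, test_scores = learning_curve(
        GaussianNB(), X, y,
        train_sizes=np.linspace(0.1, 1.0, 5),  # fractions of the training set
        cv=5, shuffle=True, random_state=0)
    print(sizes)              # absolute numbers of training samples per tick
    print(test_scores.shape)  # (5, 5): n_ticks x n_cv_folds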
@@ -1384,35 +1387,35 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, estimator : object type that implements the "fit" and "predict" methods An object of that type which is cloned for each validation. - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples) or (n_samples, n_features), optional + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None Target relative to X for classification or regression; None for unsupervised learning. - param_name : string + param_name : str Name of the parameter that will be varied. - param_range : array-like, shape (n_values,) + param_range : array-like of shape (n_values,) The values of the parameter that will be evaluated. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - - integer, to specify the number of folds in a `(Stratified)KFold`, + - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. - For integer/None inputs, if the estimator is a classifier and ``y`` is + For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. @@ -1422,26 +1425,26 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or + scoring : str or callable, default=None + A str (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - pre_dispatch : integer or string, optional + pre_dispatch : int or str, default='all' Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can + all). The option can reduce the allocated memory. The str can be an expression like '2*n_jobs'. - verbose : integer, optional + verbose : int, default=0 Controls the verbosity: the higher, the more messages. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. 
This parameter @@ -1449,10 +1452,10 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, Returns ------- - train_scores : array, shape (n_ticks, n_cv_folds) + train_scores : array of shape (n_ticks, n_cv_folds) Scores on training sets. - test_scores : array, shape (n_ticks, n_cv_folds) + test_scores : array of shape (n_ticks, n_cv_folds) Scores on test set. Notes From ac9c3e1d33ebd771d4598eeb6cda61ce9d9a70de Mon Sep 17 00:00:00 2001 From: Venkatachalam N Date: Wed, 22 Jan 2020 19:40:39 +0530 Subject: [PATCH 235/448] ENH adding return_centers param in make_blobs (#15709) --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/datasets/_samples_generator.py | 17 +++++++++++++++-- .../datasets/tests/test_samples_generator.py | 9 +++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 852cc360a702c..e7e6371d45524 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -54,6 +54,12 @@ Changelog :mod:`sklearn.datasets` ....................... +- |Enhancement| Added ``return_centers`` parameter in + :func:`datasets.make_blobs`, which can be used to return + centers for each cluster. + :pr:`15709` by :user:`` and + :user:`Venkatachalam N `. + - |Enhancement| Functions :func:`datasets.make_circles` and :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski `. diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index d57dfc9bda999..62ef492f42f5e 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -731,7 +731,8 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, - center_box=(-10.0, 10.0), shuffle=True, random_state=None): + center_box=(-10.0, 10.0), shuffle=True, random_state=None, + return_centers=False): """Generate isotropic Gaussian blobs for clustering. Read more in the :ref:`User Guide `. @@ -769,6 +770,11 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, for reproducible output across multiple function calls. See :term:`Glossary `. + return_centers : bool, optional (default=False) + If True, then return the centers of each cluster + + .. versionadded:: 0.23 + Returns ------- X : array of shape [n_samples, n_features] @@ -777,6 +783,10 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, y : array of shape [n_samples] The integer labels for cluster membership of each sample. + centers : array, shape [n_centers, n_features] + The centers of each cluster. Only returned if + ``return_centers=True``. 
+ Examples -------- >>> from sklearn.datasets import make_blobs @@ -869,7 +879,10 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, X = X[indices] y = y[indices] - return X, y + if return_centers: + return X, y, centers + else: + return X, y def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index c683e277c705a..ab712d8c235a6 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -324,6 +324,15 @@ def test_make_blobs_n_samples_centers_none(n_samples): "Incorrect number of samples per blob" +def test_make_blobs_return_centers(): + n_samples = [10, 20] + n_features = 3 + X, y, centers = make_blobs(n_samples=n_samples, n_features=n_features, + return_centers=True, random_state=0) + + assert centers.shape == (len(n_samples), n_features) + + def test_make_blobs_error(): n_samples = [20, 20, 20] centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) From 3f89a4143405141c550efd7c11f3745e191259f3 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 23 Jan 2020 00:17:28 +0800 Subject: [PATCH 236/448] DOC Mention in FAQ that we will not accept new deep learning features (#16176) --- doc/faq.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/faq.rst b/doc/faq.rst index 6972d79fd5513..f9594740c137d 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -328,6 +328,14 @@ scikit-learn seeks to achieve. You can find more information about addition of gpu support at `Will you add GPU support?`_. +Note that scikit-learn currently implements a simple multilayer perceptron +in `sklearn.neural_network`. We will only accept bug fixes for this module. +If you want to implement more complex deep learning models, please turn to +popular deep learning frameworks such as +`tensorflow `_, +`keras `_ +and `pytorch `_. + Why is my pull request not getting any attention? ------------------------------------------------- From fd12d5684ad224ad7760374b1dcca2821c644feb Mon Sep 17 00:00:00 2001 From: krishnachaitanya9 Date: Wed, 22 Jan 2020 11:49:45 -0700 Subject: [PATCH 237/448] =?UTF-8?q?BUG=20take=20n=5Fcomponents=20strictly?= =?UTF-8?q?=20greater=20than=20fraction=20of=20explai=E2=80=A6=20(#15669)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/whats_new/v0.23.rst | 9 ++++++++- sklearn/decomposition/_pca.py | 7 +++++-- sklearn/decomposition/tests/test_pca.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e7e6371d45524..08cd80a7ae24f 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -73,8 +73,15 @@ Changelog `ValueError` for arguments `n_classes < 1` OR `length < 1`. :pr:`16006` by :user:`Rushabh Vasani `. +:mod:`sklearn.decomposition` +............................ + +- |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will + exclusively choose the components that explain the variance greater than + `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` + :mod:`sklearn.ensemble` -................................. +....................... - |API| Added boolean `verbose` flag to classes: :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. 
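A usage sketch for the ``return_centers`` flag added by the ``make_blobs`` patch
above (it requires a version with that patch applied); the sample counts and
feature count are arbitrary::

    from sklearn.datasets import make_blobs

    X, y, centers = make_blobs(n_samples=[10, 20], n_features=3,
                               return_centers=True, random_state=0)
    print(X.shape, y.shape, centers.shape)  # (30, 3) (30,) (2, 3)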
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index faa083e099b5e..41b5bd68fecce 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -462,9 +462,12 @@ def _fit_full(self, X, n_components): elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold + # side='right' ensures that number of features selected + # their variance is always greater than n_components float + # passed. More discussion in issue: #15669 ratio_cumsum = stable_cumsum(explained_variance_ratio_) - n_components = np.searchsorted(ratio_cumsum, n_components) + 1 - + n_components = np.searchsorted(ratio_cumsum, n_components, + side='right') + 1 # Compute noise covariance using Probabilistic PCA model # The sigma2 maximum likelihood (cf. eq. 12.46) if n_components < min(n_features, n_samples): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 65624215b1158..b94d2d5be7e0f 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -7,6 +7,7 @@ from sklearn import datasets from sklearn.decomposition import PCA +from sklearn.datasets import load_iris from sklearn.decomposition._pca import _assess_dimension_ from sklearn.decomposition._pca import _infer_dimension_ @@ -555,3 +556,15 @@ def check_pca_int_dtype_upcast_to_double(svd_solver): assert pca_32.transform(X_i32).dtype == np.float64 assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4) + + +def test_pca_n_components_mostly_explained_variance_ratio(): + # when n_components is the second highest cumulative sum of the + # explained_variance_ratio_, then n_components_ should equal the + # number of features in the dataset #15669 + X, y = load_iris(return_X_y=True) + pca1 = PCA().fit(X, y) + + n_components = pca1.explained_variance_ratio_.cumsum()[-2] + pca2 = PCA(n_components=n_components).fit(X, y) + assert pca2.n_components_ == X.shape[1] From 5c36df6098d4a6325b621030163897d19853e698 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 24 Jan 2020 14:45:47 +0100 Subject: [PATCH 238/448] MNT remove check for deprecated behavior in test.py (#16109) --- sklearn/feature_extraction/tests/test_text.py | 7 +---- sklearn/feature_extraction/text.py | 26 +------------------ 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index f8f741a862594..3ccda942b65e1 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -30,7 +30,6 @@ from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.utils._testing import (assert_almost_equal, assert_warns_message, assert_raise_message, clean_warning_registry, @@ -1294,12 +1293,8 @@ def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg): @pytest.mark.parametrize('input_type', ['file', 'filename']) def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type): data = ['this is text, not file or filename'] - warn_msg = 'Since v0.21, vectorizer' with pytest.raises((FileNotFoundError, AttributeError)): - with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records: - Estimator(analyzer=analyzer, input=input_type).fit_transform(data) - assert len(records) == 1 - assert 
warn_msg in str(records[0]) + Estimator(analyzer=analyzer, input=input_type).fit_transform(data) @pytest.mark.parametrize( diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 5b127a10962bc..4954329728d5e 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -32,7 +32,7 @@ from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils import _IS_32BIT, deprecated from ..utils.fixes import _astype_copy_false -from ..exceptions import ChangedBehaviorWarning, NotFittedError +from ..exceptions import NotFittedError __all__ = ['HashingVectorizer', @@ -390,28 +390,6 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): self._stop_words_id = id(self.stop_words) return 'error' - def _validate_custom_analyzer(self): - # This is to check if the given custom analyzer expects file or a - # filename instead of data. - # Behavior changed in v0.21, function could be removed in v0.23 - import tempfile - with tempfile.NamedTemporaryFile() as f: - fname = f.name - # now we're sure fname doesn't exist - - msg = ("Since v0.21, vectorizers pass the data to the custom analyzer " - "and not the file names or the file objects. This warning " - "will be removed in v0.23.") - try: - self.analyzer(fname) - except FileNotFoundError: - warnings.warn(msg, ChangedBehaviorWarning) - except AttributeError as e: - if str(e) == "'str' object has no attribute 'read'": - warnings.warn(msg, ChangedBehaviorWarning) - except Exception: - pass - def build_analyzer(self): """Return a callable that handles preprocessing, tokenization and n-grams generation. @@ -424,8 +402,6 @@ def build_analyzer(self): """ if callable(self.analyzer): - if self.input in ['file', 'filename']: - self._validate_custom_analyzer() return partial( _analyze, analyzer=self.analyzer, decoder=self.decode ) From 1ad8a369569de57355100174e670d1173c35a712 Mon Sep 17 00:00:00 2001 From: Aleksandra Kocot Date: Sat, 25 Jan 2020 19:14:00 +0100 Subject: [PATCH 239/448] DOC Updates random_state descriptions for sklearn/preprocessing/_data.py (#16213) --- sklearn/preprocessing/_data.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index b047908842b38..9ff3723b25550 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2176,11 +2176,11 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): differ for value-identical sparse and dense matrices. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. Note that this is used by subsampling and smoothing + Determines random number generation for subsampling and smoothing noise. + Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary ` copy : boolean, optional, (default=True) Set to False to perform inplace transformation and avoid a copy (if the @@ -2605,11 +2605,11 @@ def quantile_transform(X, axis=0, n_quantiles=1000, differ for value-identical sparse and dense matrices. 
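A minimal sketch of ``QuantileTransformer`` with ``subsample`` and a fixed
``random_state`` as described above; the synthetic data and parameter values are
arbitrary::

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    rng = np.random.RandomState(0)
    X = rng.normal(loc=0.5, scale=0.25, size=(1000, 1))
    # quantiles are estimated from a random subsample of 500 points,
    # controlled by random_state
    qt = QuantileTransformer(n_quantiles=100, subsample=500, random_state=0)
    X_trans = qt.fit_transform(X)
    print(X_trans.min(), X_trans.max())  # mapped into [0, 1] (uniform output)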
random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. Note that this is used by subsampling and smoothing + Determines random number generation for subsampling and smoothing noise. + Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary ` copy : boolean, optional, (default=True) Set to False to perform inplace transformation and avoid a copy (if the From 3de415a7e740a1fa414e931098eb15f305ed03ca Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 26 Jan 2020 03:36:36 -0500 Subject: [PATCH 240/448] DOC Fix links in KernelRidge user guide (#16235) --- doc/modules/kernel_ridge.rst | 56 +++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index 3d032b52bb309..a193efaaf2d67 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -13,38 +13,42 @@ the data. For non-linear kernels, this corresponds to a non-linear function in the original space. The form of the model learned by :class:`KernelRidge` is identical to support -vector regression (:class:`SVR`). However, different loss functions are used: -KRR uses squared error loss while support vector regression uses -:math:`\epsilon`-insensitive loss, both combined with l2 regularization. In -contrast to :class:`SVR`, fitting :class:`KernelRidge` can be done in -closed-form and is typically faster for medium-sized datasets. On the other -hand, the learned model is non-sparse and thus slower than SVR, which learns -a sparse model for :math:`\epsilon > 0`, at prediction-time. - -The following figure compares :class:`KernelRidge` and :class:`SVR` on -an artificial dataset, which consists of a sinusoidal target function and -strong noise added to every fifth datapoint. The learned model of -:class:`KernelRidge` and :class:`SVR` is plotted, where both -complexity/regularization and bandwidth of the RBF kernel have been optimized -using grid-search. The learned functions are very similar; however, fitting -:class:`KernelRidge` is approx. seven times faster than fitting :class:`SVR` -(both with grid-search). However, prediction of 100000 target values is more -than three times faster with SVR since it has learned a sparse model using only -approx. 1/3 of the 100 training datapoints as support vectors. +vector regression (:class:`~sklearn.svm.SVR`). However, different loss +functions are used: KRR uses squared error loss while support vector +regression uses :math:`\epsilon`-insensitive loss, both combined with l2 +regularization. In contrast to :class:`~sklearn.svm.SVR`, fitting +:class:`KernelRidge` can be done in closed-form and is typically faster for +medium-sized datasets. On the other hand, the learned model is non-sparse and +thus slower than :class:`~sklearn.svm.SVR`, which learns a sparse model for +:math:`\epsilon > 0`, at prediction-time. + +The following figure compares :class:`KernelRidge` and +:class:`~sklearn.svm.SVR` on an artificial dataset, which consists of a +sinusoidal target function and strong noise added to every fifth datapoint. 
+The learned model of :class:`KernelRidge` and :class:`~sklearn.svm.SVR` is +plotted, where both complexity/regularization and bandwidth of the RBF kernel +have been optimized using grid-search. The learned functions are very +similar; however, fitting :class:`KernelRidge` is approximately seven times +faster than fitting :class:`~sklearn.svm.SVR` (both with grid-search). +However, prediction of 100000 target values is more than three times faster +with :class:`~sklearn.svm.SVR` since it has learned a sparse model using only +approximately 1/3 of the 100 training datapoints as support vectors. .. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_001.png :target: ../auto_examples/plot_kernel_ridge_regression.html :align: center The next figure compares the time for fitting and prediction of -:class:`KernelRidge` and :class:`SVR` for different sizes of the training set. -Fitting :class:`KernelRidge` is faster than :class:`SVR` for medium-sized -training sets (less than 1000 samples); however, for larger training sets -:class:`SVR` scales better. With regard to prediction time, :class:`SVR` is -faster than :class:`KernelRidge` for all sizes of the training set because of -the learned sparse solution. Note that the degree of sparsity and thus the -prediction time depends on the parameters :math:`\epsilon` and :math:`C` of the -:class:`SVR`; :math:`\epsilon = 0` would correspond to a dense model. +:class:`KernelRidge` and :class:`~sklearn.svm.SVR` for different sizes of the +training set. Fitting :class:`KernelRidge` is faster than +:class:`~sklearn.svm.SVR` for medium-sized training sets (less than 1000 +samples); however, for larger training sets :class:`~sklearn.svm.SVR` scales +better. With regard to prediction time, :class:`~sklearn.svm.SVR` is faster +than :class:`KernelRidge` for all sizes of the training set because of the +learned sparse solution. Note that the degree of sparsity and thus the +prediction time depends on the parameters :math:`\epsilon` and :math:`C` of +the :class:`~sklearn.svm.SVR`; :math:`\epsilon = 0` would correspond to a +dense model. .. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_002.png :target: ../auto_examples/plot_kernel_ridge_regression.html From c4592adb82af4059806a991230505cbdcaddc1fc Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 26 Jan 2020 03:37:02 -0500 Subject: [PATCH 241/448] MNT Skips failing SpectralCoclustering doctest (#16232) --- sklearn/cluster/_bicluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index cced1674e167b..8ac6ce3e27eca 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -260,9 +260,9 @@ class SpectralCoclustering(BaseSpectral): >>> X = np.array([[1, 1], [2, 1], [1, 0], ... 
[4, 7], [3, 5], [3, 6]]) >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X) - >>> clustering.row_labels_ + >>> clustering.row_labels_ #doctest: +SKIP array([0, 1, 1, 0, 0, 0], dtype=int32) - >>> clustering.column_labels_ + >>> clustering.column_labels_ #doctest: +SKIP array([0, 0], dtype=int32) >>> clustering SpectralCoclustering(n_clusters=2, random_state=0) From ede7905d3b0d055db70484275ece8e211c19d56c Mon Sep 17 00:00:00 2001 From: Daphne <50707529+daphn3k@users.noreply.github.com> Date: Sun, 26 Jan 2020 11:53:35 +0100 Subject: [PATCH 242/448] [MRG] DOC improve random state docstring in manifold module (#16204) --- sklearn/manifold/_locally_linear.py | 28 +++++++++++-------------- sklearn/manifold/_mds.py | 27 +++++++++++------------- sklearn/manifold/_spectral_embedding.py | 24 +++++++++------------ sklearn/manifold/_t_sne.py | 11 +++++----- 4 files changed, 39 insertions(+), 51 deletions(-) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index b891c152e1a57..4854cf228f0ca 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -143,12 +143,10 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, Maximum number of iterations for 'arpack' method. Not used if eigen_solver=='dense' - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``solver`` == 'arpack'. - + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. """ if eigen_solver == 'auto': if M.shape[0] > 200 and k + k_skip < 10: @@ -249,11 +247,10 @@ def locally_linear_embedding( Tolerance for modified LLE method. Only used if method == 'modified' - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``solver`` == 'arpack'. + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. @@ -581,11 +578,10 @@ class LocallyLinearEmbedding(TransformerMixin, algorithm to use for nearest neighbors search, passed to neighbors.NearestNeighbors instance - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``eigen_solver`` == 'arpack'. + random_state : int, RandomState instance, default=None + Determines the random number generator when + ``eigen_solver`` == 'arpack'. Pass an int for reproducible results + across multiple function calls. See :term: `Glossary `. 
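A minimal sketch of ``locally_linear_embedding`` passing an explicit
``random_state`` (it is only consulted when the 'arpack' solver is selected); the
S-curve data and neighborhood size are arbitrary::

    from sklearn.datasets import make_s_curve
    from sklearn.manifold import locally_linear_embedding

    X, _ = make_s_curve(n_samples=200, random_state=0)
    embedding, err = locally_linear_embedding(
        X, n_neighbors=10, n_components=2,
        random_state=0)  # only takes effect with eigen_solver='arpack'
    print(embedding.shape)  # (200, 2)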
n_jobs : int or None, optional (default=None) The number of parallel jobs to run. diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 5238c67e93dfd..b7b52344eb21b 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -48,11 +48,10 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, Relative tolerance with respect to stress at which to declare convergence. - random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the centers. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Returns ------- @@ -195,11 +194,10 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, Relative tolerance with respect to stress at which to declare convergence. - random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the centers. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. return_n_iter : bool, optional, default: False Whether or not to return the number of iterations. @@ -311,11 +309,10 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the centers. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. dissimilarity : 'euclidean' | 'precomputed', optional, default: 'euclidean' Dissimilarity measure to use: diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 9d52a9787425c..e885a94eaaded 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -168,13 +168,11 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. - random_state : int, RandomState instance or None, optional, default: None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors decomposition. If int, random_state is the seed - used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. 
Used when - ``solver`` == 'amg'. + random_state : int, RandomState instance, default=None + Determines the random number generator used for the initialization of + the lobpcg eigenvectors decomposition when ``solver`` == 'amg'. Pass + an int for reproducible results across multiple function calls. + See :term: `Glossary `. eigen_tol : float, optional, default=0.0 Stopping criterion for eigendecomposition of the Laplacian matrix @@ -384,13 +382,11 @@ class SpectralEmbedding(BaseEstimator): gamma : float, optional, default : 1/n_features Kernel coefficient for rbf kernel. - random_state : int, RandomState instance or None, optional, default: None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is the - random number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``solver`` == - 'amg'. + random_state : int, RandomState instance, default=None + Determines the random number generator used for the initialization of + the lobpcg eigenvectors when ``solver`` == 'amg'. Pass an int for + reproducible results across multiple function calls. + See :term: `Glossary `. eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} The eigenvalue decomposition strategy to use. AMG requires pyamg diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index f2d15f54e8f40..81972dac33d07 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -555,12 +555,11 @@ class TSNE(BaseEstimator): verbose : int, optional (default: 0) Verbosity level. - random_state : int, RandomState instance or None, optional (default: None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Note that different initializations might result in - different local minima of the cost function. + random_state : int, RandomState instance, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. Note that different + initializations might result in different local minima of the cost + function. See :term: `Glossary `. method : string (default: 'barnes_hut') By default the gradient calculation algorithm uses Barnes-Hut From c23ed6060dc1d7e8a31e6e1e10fd82d7cf1e0da4 Mon Sep 17 00:00:00 2001 From: Maria <32207690+Malesche@users.noreply.github.com> Date: Sun, 26 Jan 2020 11:56:00 +0100 Subject: [PATCH 243/448] DOC improve random state docstring in HistGBDT (#16205) --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a4dec15763940..83c338d89633e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -34,9 +34,11 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly chosen to compute the quantiles. If ``None``, the whole data is used. - random_state: int or numpy.random.RandomState or None + random_state: int, RandomState instance or None Pseudo-random number generator to control the random sub-sampling. 
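A minimal sketch of ``TSNE`` with a fixed ``random_state`` as documented above;
the digits subset is an arbitrary placeholder, and different seeds may still
reach different local minima of the cost::

    from sklearn.datasets import load_digits
    from sklearn.manifold import TSNE

    X, _ = load_digits(return_X_y=True)
    emb = TSNE(n_components=2, random_state=0).fit_transform(X[:100])
    print(emb.shape)  # (100, 2)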
- See :term:`random_state`. + Pass an int for reproducible output across multiple + function calls. + See :term: `Glossary `. Return ------ @@ -109,10 +111,11 @@ class _BinMapper(TransformerMixin, BaseEstimator): If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly chosen to compute the quantiles. If ``None``, the whole data is used. - random_state: int or numpy.random.RandomState or None, \ - optional (default=None) + random_state: int, RandomState instance or None Pseudo-random number generator to control the random sub-sampling. - See :term:`random_state`. + Pass an int for reproducible output across multiple + function calls. + See :term: `Glossary `. Attributes ---------- From 1640d8cf6daad671b00febc5ed13521f5af5f590 Mon Sep 17 00:00:00 2001 From: Ana Casado Date: Sun, 26 Jan 2020 12:14:42 +0100 Subject: [PATCH 244/448] DOC improve random state description in gaussian_proccess module (#16211) --- sklearn/gaussian_process/_gpc.py | 19 ++++++++----------- sklearn/gaussian_process/_gpr.py | 19 +++++++++---------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index dc4eb6520c0b8..30e357b3986e7 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -107,11 +107,10 @@ def optimizer(obj_func, initial_theta, bounds): which might cause predictions to change if the data is modified externally. - random_state : int, RandomState instance or None, optional (default: None) - The generator used to initialize the centers. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Attributes ---------- @@ -534,12 +533,10 @@ def optimizer(obj_func, initial_theta, bounds): which might cause predictions to change if the data is modified externally. - random_state : int, RandomState instance or None, optional (default: None) - The generator used to initialize the centers. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. multi_class : string, default : "one_vs_rest" Specifies how multi-class classification problems are handled. diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index db850b3e442f8..bf51d4dfda299 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -106,11 +106,10 @@ def optimizer(obj_func, initial_theta, bounds): which might cause predictions to change if the data is modified externally. - random_state : int, RandomState instance or None, optional (default: None) - The generator used to initialize the centers. 
If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Attributes ---------- @@ -379,11 +378,11 @@ def sample_y(self, X, n_samples=1, random_state=0): n_samples : int, default: 1 The number of samples drawn from the Gaussian process - random_state : int, RandomState instance or None, optional (default=0) - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the - random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=0 + Determines random number generation to randomly draw samples. + Pass an int for reproducible results across multiple function + calls. + See :term: `Glossary `. Returns ------- From 4f3878e542548704731ff7c335b96b37fb1e6b06 Mon Sep 17 00:00:00 2001 From: Ana Casado Date: Sun, 26 Jan 2020 12:22:58 +0100 Subject: [PATCH 245/448] DOC improve random state in neighbors module (#16217) --- sklearn/neighbors/_kde.py | 10 +++++----- sklearn/neighbors/_nca.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 6b1e2660c2014..5b44f6f6b2b75 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -230,11 +230,11 @@ def sample(self, n_samples=1, random_state=None): n_samples : int, optional Number of samples to generate. Defaults to 1. - random_state : int, RandomState instance or None. default to None - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random - number generator; If None, the random number generator is the - RandomState instance used by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation used to generate + random samples. Pass an int for reproducible results + across multiple function calls. + See :term: `Glossary `. Returns ------- diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 3c9ddbbd411d0..131a1bf0b04c1 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -113,7 +113,9 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): A pseudo random number generator object or a seed for it if int. If ``init='random'``, ``random_state`` is used to initialize the random transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + argument to PCA when initializing the transformation. Pass an int + for reproducible results across multiple function calls. + See :term: `Glossary `. 
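A minimal sketch of ``KernelDensity.sample`` with a fixed ``random_state`` for
reproducible draws; the toy one-dimensional data is arbitrary::

    import numpy as np
    from sklearn.neighbors import KernelDensity

    X = np.array([[-1.0], [-0.5], [0.0], [0.5], [1.0]])
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
    draws = kde.sample(n_samples=3, random_state=0)
    print(draws.shape)  # (3, 1)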
Attributes ---------- From 617206d7b3d8efd22255bb67aa095a59f6a43b23 Mon Sep 17 00:00:00 2001 From: Daphne <50707529+daphn3k@users.noreply.github.com> Date: Sun, 26 Jan 2020 12:24:45 +0100 Subject: [PATCH 246/448] DOC improve random state docsting in covariance module (#16218) --- sklearn/covariance/_elliptic_envelope.py | 8 +++--- sklearn/covariance/_robust_covariance.py | 34 +++++++++++------------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index b96831077e68a..4f4624c995ba7 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -39,11 +39,9 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): of outliers in the data set. Range is (0, 0.5). random_state : int or RandomState instance, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + Determines the pseudo random number generator for shuffling + the data. Pass an int for reproducible results across multiple function + calls. See :term: `Glossary `. Attributes ---------- diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 8afac2c3c0eee..93624931c4303 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -62,11 +62,10 @@ def c_step(X, n_support, remaining_iterations=30, initial_estimates=None, The function which will be used to compute the covariance. Must return array of shape (n_features, n_features). - random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int or RandomState instance, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Returns ------- @@ -236,11 +235,10 @@ def select_candidates(X, n_support, n_trials, select=1, n_iter=30, The function which will be used to compute the covariance. Must return an array of shape (n_features, n_features). - random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int or RandomState instance, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. See Also --------- @@ -332,11 +330,10 @@ def fast_mcd(X, support_fraction=None, The function which will be used to compute the covariance. Must return an array of shape (n_features, n_features). - random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
+ random_state : int or RandomState instance, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Returns ------- @@ -550,10 +547,9 @@ class MinCovDet(EmpiricalCovariance): (0, 1). random_state : int or RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term: `Glossary `. Attributes ---------- From ea5e4d0f6b3ad5ba166cb3df498918bf67d45c86 Mon Sep 17 00:00:00 2001 From: Marielle Date: Sun, 26 Jan 2020 12:33:07 +0100 Subject: [PATCH 247/448] DOC added feature_importances_ to ExtraTreeRegressor (#16225) --- sklearn/tree/_classes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index eade7c9e56ad5..b0e65b6348241 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1598,6 +1598,10 @@ class ExtraTreeRegressor(DecisionTreeRegressor): n_features_ : int The number of features when ``fit`` is performed. + feature_importances_ : ndarray of shape (n_features,) + Return the feature importances (the higher, the more important the + feature). + n_outputs_ : int The number of outputs when ``fit`` is performed. From 4f28ba534442d88c6d875b4b672b517ea0edc259 Mon Sep 17 00:00:00 2001 From: Emily Taylor Date: Sun, 26 Jan 2020 15:17:23 +0100 Subject: [PATCH 248/448] DOC improve docsting following the user guide in mean-shift module (#16216) --- sklearn/cluster/_mean_shift.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index b5c69f6b92f16..83f655acdd7dd 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -38,11 +38,11 @@ def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0, X : array-like of shape (n_samples, n_features) Input points. - quantile : float, default 0.3 + quantile : float, default=0.3 should be between [0, 1] 0.5 means that the median of all pairwise distances is used. - n_samples : int, optional + n_samples : int, default=None The number of samples to use. If not given, all samples are used. random_state : int, RandomState instance, default=None @@ -51,7 +51,7 @@ def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0, deterministic. See :term:`Glossary `. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -119,7 +119,7 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, X : array-like of shape (n_samples, n_features) Input data. - bandwidth : float, optional + bandwidth : float, default=None Kernel bandwidth. If bandwidth is not given, it is determined using a heuristic based on @@ -144,16 +144,16 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds. 
- cluster_all : boolean, default True + cluster_all : bool, default=True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1. - max_iter : int, default 300 + max_iter : int, default=300 Maximum number of iterations, per seed point before the clustering operation terminates (for that seed point), if has not converged yet. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. @@ -206,7 +206,7 @@ def get_bin_seeds(X, bin_size, min_bin_freq=1): not sure how to set this, set it to the value of the bandwidth used in clustering.mean_shift. - min_bin_freq : integer, optional + min_bin_freq : int, default=1 Only bins with at least min_bin_freq will be selected as seeds. Raising this value decreases the number of seeds found, which makes mean_shift computationally cheaper. @@ -249,38 +249,38 @@ class MeanShift(ClusterMixin, BaseEstimator): Parameters ---------- - bandwidth : float, optional + bandwidth : float, default=None Bandwidth used in the RBF kernel. If not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth; see the documentation for that function for hints on scalability (see also the Notes, below). - seeds : array, shape=[n_samples, n_features], optional + seeds : array-like of shape (n_samples, n_features), default=None Seeds used to initialize kernels. If not set, the seeds are calculated by clustering.get_bin_seeds with bandwidth as the grid size and default values for other parameters. - bin_seeding : boolean, optional + bin_seeding : bool, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. - default value: False + The default value is False. Ignored if seeds argument is not None. - min_bin_freq : int, optional + min_bin_freq : int, default=1 To speed up the algorithm, accept only those bins with at least - min_bin_freq points as seeds. If not defined, set to 1. + min_bin_freq points as seeds. - cluster_all : boolean, default True + cluster_all : bool, default=True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. From 5c9957764ac3837967f6769167effed39365c5e7 Mon Sep 17 00:00:00 2001 From: rachelcjordan Date: Sun, 26 Jan 2020 20:29:11 +0100 Subject: [PATCH 249/448] DOC improve random state docstring in the _validation module (#16231) --- sklearn/model_selection/_validation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9e27ee28005dc..2f02b84986b18 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1007,9 +1007,8 @@ def permutation_test_score(estimator, X, y, groups=None, cv=None, for more details. 
random_state : int, RandomState instance or None, default=0 - Pass an int for reproducible output across multiple - function calls. - See :term:`Glossary `. + Pass an int for reproducible output for permutation of + ``y`` values among samples. See :term:`Glossary `. verbose : int, default=0 The verbosity level. @@ -1176,9 +1175,8 @@ def learning_curve(estimator, X, y, groups=None, based on``train_sizes``. random_state : int or RandomState instance, default=None - Used when ``shuffle`` is True. - Pass an int for reproducible output across multiple - function calls. + Used when ``shuffle`` is True. Pass an int for reproducible + output across multiple function calls. See :term:`Glossary `. error_score : 'raise' or numeric, default=np.nan From 3ae671e07be4ee6ed7a60e288915e52950b5bd19 Mon Sep 17 00:00:00 2001 From: olicairns <57708953+olicairns@users.noreply.github.com> Date: Mon, 27 Jan 2020 08:21:10 +0000 Subject: [PATCH 250/448] ENH Improve error message in clone when passed class instances (#16189) --- sklearn/base.py | 15 +++++++++++---- sklearn/tests/test_base.py | 8 ++++++++ sklearn/utils/tests/test_estimator_checks.py | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 4732c7ba165a9..4ca561ac2fff0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -61,10 +61,17 @@ def clone(estimator, safe=True): if not safe: return copy.deepcopy(estimator) else: - raise TypeError("Cannot clone object '%s' (type %s): " - "it does not seem to be a scikit-learn estimator " - "as it does not implement a 'get_params' methods." - % (repr(estimator), type(estimator))) + if isinstance(estimator, type): + raise TypeError("Cannot clone object. " + + "You should provide an instance of " + + "scikit-learn estimator instead of a class.") + else: + raise TypeError("Cannot clone object '%s' (type %s): " + "it does not seem to be a scikit-learn " + "estimator as it does not implement a " + "'get_params' method." + % (repr(estimator), type(estimator))) + klass = estimator.__class__ new_object_params = estimator.get_params(deep=False) for name, param in new_object_params.items(): diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index f480fffda1571..95f7b01f27058 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -196,6 +196,14 @@ def test_clone_estimator_types(): assert clf.empty is clf2.empty +def test_clone_class_rather_than_instance(): + # Check that clone raises expected error message when + # cloning class rather than instance + msg = "You should provide an instance of scikit-learn estimator" + with pytest.raises(TypeError, match=msg): + clone(MyEstimator) + + def test_repr(): # Smoke test the repr of the base estimator. my_estimator = MyEstimator() diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 15b423d6e0ce8..90a3a3b9604ef 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -336,7 +336,7 @@ def test_check_estimator(): # not a complete test of all checks, which are very extensive. 
# check that we have a set_params and can clone - msg = "it does not implement a 'get_params' methods" + msg = "it does not implement a 'get_params' method" assert_raises_regex(TypeError, msg, check_estimator, object) assert_raises_regex(TypeError, msg, check_estimator, object()) # check that values returned by get_params match set_params From c45721d538d36ab4c322d18e33d9c10b55f5fe27 Mon Sep 17 00:00:00 2001 From: bernie gray Date: Mon, 27 Jan 2020 03:24:50 -0500 Subject: [PATCH 251/448] ENH check for invalid batch_size in gen_batches (#16181) --- sklearn/utils/__init__.py | 6 ++++++ sklearn/utils/tests/test_utils.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ee38b9b924ccc..5e7605e97f949 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -748,6 +748,12 @@ def gen_batches(n, batch_size, min_batch_size=0): >>> list(gen_batches(7, 3, min_batch_size=2)) [slice(0, 3, None), slice(3, 7, None)] """ + if not isinstance(batch_size, numbers.Integral): + raise TypeError("gen_batches got batch_size=%s, must be an" + " integer" % batch_size) + if batch_size <= 0: + raise ValueError("gen_batches got batch_size=%s, must be" + " positive" % batch_size) start = 0 for _ in range(int(n // batch_size)): end = start + batch_size diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index c3ae523b32b39..5536711a451e7 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -18,6 +18,7 @@ from sklearn.utils import check_random_state from sklearn.utils import _determine_key_type from sklearn.utils import deprecated +from sklearn.utils import gen_batches from sklearn.utils import _get_column_indices from sklearn.utils import resample from sklearn.utils import safe_mask @@ -53,6 +54,22 @@ def test_make_rng(): assert_raises(ValueError, check_random_state, "some invalid seed") +def test_gen_batches(): + # Make sure gen_batches errors on invalid batch_size + + assert_array_equal( + list(gen_batches(4, 2)), + [slice(0, 2, None), slice(2, 4, None)] + ) + msg_zero = "gen_batches got batch_size=0, must be positive" + with pytest.raises(ValueError, match=msg_zero): + next(gen_batches(4, 0)) + + msg_float = "gen_batches got batch_size=0.5, must be an integer" + with pytest.raises(TypeError, match=msg_float): + next(gen_batches(4, 0.5)) + + def test_deprecated(): # Test whether the deprecated decorator issues appropriate warnings # Copied almost verbatim from https://docs.python.org/library/warnings.html From b7c4f4f085afc0cfc6021ed25cc48be70ceddfcd Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Mon, 27 Jan 2020 03:09:59 -0700 Subject: [PATCH 252/448] MNT Use np.asarray to get numpy data type descriptors for C structs in trees (#16141) --- sklearn/neighbors/_binary_tree.pxi | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index e633cdb0d1ee6..ef6a2a2d5d330 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -180,18 +180,8 @@ cdef struct NodeHeapData_t: ITYPE_t i2 # build the corresponding numpy dtype for NodeHeapData -# There is no offsetof() function in cython, so we hack it. 
-# If we can ensure numpy 1.5 or greater, a cleaner way is to do
-# cdef NodeHeapData_t nhd_tmp
-# NodeHeapData = np.asarray((&nhd_tmp)).dtype
 cdef NodeHeapData_t nhd_tmp
-offsets = [&(nhd_tmp.val) - &nhd_tmp,
-           &(nhd_tmp.i1) - &nhd_tmp,
-           &(nhd_tmp.i2) - &nhd_tmp]
-NodeHeapData = np.dtype({'names': ['val', 'i1', 'i2'],
-                         'formats': [DTYPE, ITYPE, ITYPE],
-                         'offsets': offsets,
-                         'itemsize': sizeof(NodeHeapData_t)})
+NodeHeapData = np.asarray((&nhd_tmp)).dtype

 cdef struct NodeData_t:
     ITYPE_t idx_start
@@ -200,19 +190,8 @@ cdef struct NodeData_t:
     DTYPE_t radius

 # build the corresponding numpy dtype for NodeData
-# There is no offsetof() function in cython, so we hack it.
-# If we can ensure numpy 1.5 or greater, a cleaner way is to do
-# cdef NodeData_t nd_tmp
-# NodeData = np.asarray((&nd_tmp)).dtype
 cdef NodeData_t nd_tmp
-offsets = [&(nd_tmp.idx_start) - &nd_tmp,
-           &(nd_tmp.idx_end) - &nd_tmp,
-           &(nd_tmp.is_leaf) - &nd_tmp,
-           &(nd_tmp.radius) - &nd_tmp]
-NodeData = np.dtype({'names': ['idx_start', 'idx_end', 'is_leaf', 'radius'],
-                     'formats': [ITYPE, ITYPE, ITYPE, DTYPE],
-                     'offsets': offsets,
-                     'itemsize': sizeof(NodeData_t)})
+NodeData = np.asarray((&nd_tmp)).dtype


 ######################################################################

From c2ede74db053d4821c51e1de53f9f8522562e9b5 Mon Sep 17 00:00:00 2001
From: Rushabh Vasani
Date: Mon, 27 Jan 2020 17:00:55 +0530
Subject: [PATCH 253/448] ENH add warning for pandas sparse Dataframe in
 check_array (#16021)

---
 doc/whats_new/v0.23.rst                 |  3 +++
 sklearn/linear_model/tests/test_base.py | 18 ++++++++++++++++++
 sklearn/utils/validation.py             | 11 +++++++++++
 3 files changed, 32 insertions(+)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 08cd80a7ae24f..21bf8a1bd3329 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -183,3 +183,6 @@ Changelog

 - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`.
   :pr:`15926` by :user:`Loïc Estève `.
+- |Enhancement| add warning in :func:`utils.validation.check_array` for
+  pandas sparse DataFrame.
+  :pr:`16021` by :user:`Rushabh Vasani `.
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index a932d5ed33fe1..de7d8dd72b0e3 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -5,6 +5,8 @@

 import pytest

+from distutils.version import LooseVersion
+
 import numpy as np
 from scipy import sparse
 from scipy import linalg
@@ -205,6 +207,22 @@ def test_linear_regression_sparse_multiple_outcome(random_state=0):
     assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


+def test_linear_regression_pd_sparse_dataframe_warning():
+    pd = pytest.importorskip('pandas')
+    # restrict the pd versions < '0.24.0' as they have a bug in is_sparse func
+    if LooseVersion(pd.__version__) < '0.24.0':
+        pytest.skip("pandas 0.24+ required.")
+    df = pd.DataFrame()
+    for col in range(4):
+        arr = np.random.randn(10)
+        arr[:8] = 0
+        df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0)
+    msg = "pandas.DataFrame with sparse columns found."
+ with pytest.warns(UserWarning, match=msg): + reg = LinearRegression() + reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + + def test_preprocess_data(): n_samples = 200 n_features = 2 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9929ff2f35502..1bb6ce73b0ddc 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -21,6 +21,8 @@ from numpy.core.numeric import ComplexWarning import joblib +from contextlib import suppress + from .fixes import _object_dtype_isnan from .. import get_config as _get_config from ..exceptions import NonBLASDotWarning, PositiveSpectrumWarning @@ -449,6 +451,15 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): + # throw warning if pandas dataframe is sparse + with suppress(ImportError): + from pandas.api.types import is_sparse + if array.dtypes.apply(is_sparse).any(): + warnings.warn( + "pandas.DataFrame with sparse columns found." + "It will be converted to a dense numpy array." + ) + dtypes_orig = list(array.dtypes) # pandas boolean dtype __array__ interface coerces bools to objects for i, dtype_iter in enumerate(dtypes_orig): From 872468f0acc47808b3696ba4b42a87cae591864f Mon Sep 17 00:00:00 2001 From: Fabiana <50251203+fabi-cast@users.noreply.github.com> Date: Mon, 27 Jan 2020 22:14:24 +0100 Subject: [PATCH 254/448] DOC improve random state docsting for extmath module (#16239) --- sklearn/utils/extmath.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b0c28897a8ef1..d11b307e7500e 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -184,10 +184,9 @@ def randomized_range_finder(A, size, n_iter, random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + the data, i.e. getting the random vectors to initialize the algorithm. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. Returns ------- @@ -296,10 +295,9 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + the data, i.e. getting the random vectors to initialize the algorithm. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. 
Notes ----- From 3cff0f95bf6065307fda334c47992c3a6d762ff0 Mon Sep 17 00:00:00 2001 From: Mojca Bertoncelj <38767382+mojc@users.noreply.github.com> Date: Mon, 27 Jan 2020 22:25:28 +0100 Subject: [PATCH 255/448] DOC improve random state docsting in permutation_importance (#16215) --- sklearn/inspection/_permutation_importance.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 80bf4d2e2a62c..b95e5d11fad9d 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -78,9 +78,11 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, `-1` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance, or None, default=None + random_state : int, RandomState instance, default=None Pseudo-random number generator to control the permutations of each - feature. See :term:`random_state`. + feature. + Pass an int to get reproducible results across function calls. + See :term: `Glossary `. Returns ------- From d67647fcac9f8a3289e336420c440e99cef3591a Mon Sep 17 00:00:00 2001 From: Marielle Date: Mon, 27 Jan 2020 22:27:54 +0100 Subject: [PATCH 256/448] DOC improve docstring following doc guideline in model_selection module (#16207) --- sklearn/model_selection/_search.py | 81 ++++++++--------- sklearn/model_selection/_split.py | 134 ++++++++++++++--------------- 2 files changed, 107 insertions(+), 108 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index ede292c2b6261..059988bab22a2 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -53,7 +53,7 @@ class ParameterGrid: Parameters ---------- - param_grid : dict of string to sequence, or sequence of such + param_grid : dict of str to sequence, or sequence of such The parameter grid to explore, as a dictionary mapping estimator parameters to sequences of allowed values. @@ -115,7 +115,7 @@ def __iter__(self): Returns ------- - params : iterator over dict of string to any + params : iterator over dict of str to any Yields dictionaries mapping each estimator parameter to one of its allowed values. """ @@ -147,7 +147,7 @@ def __getitem__(self, ind): Returns ------- - params : dict of string to any + params : dict of str to any Equal to list(self)[ind] """ # This is used to make discrete sampling without replacement memory @@ -194,7 +194,7 @@ class ParameterSampler: Parameters ---------- param_distributions : dict - Dictionary with parameters names (string) as keys and distributions + Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. @@ -204,7 +204,7 @@ class ParameterSampler: n_iter : integer Number of parameter settings that are produced. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. 
Pass an int for reproducible output across multiple @@ -213,7 +213,7 @@ class ParameterSampler: Returns ------- - params : dict of string to any + params : dict of str to any **Yields** dictionaries mapping each estimator parameter to as sampled value. @@ -340,11 +340,11 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, **fit_params : kwargs Additional parameter passed to the fit function of the estimator. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. Default is ``np.nan``. + step, which will always raise the error. Returns ------- @@ -430,7 +430,8 @@ def score(self, X, y=None): Input data, where n_samples is the number of samples and n_features is the number of features. - y : array-like of shape (n_samples, n_output) or (n_samples,), optional + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None Target relative to X for classification or regression; None for unsupervised learning. @@ -609,16 +610,17 @@ def fit(self, X, y=None, groups=None, **fit_params): Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like of shape (n_samples, n_output) or (n_samples,), optional + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None Target relative to X for classification or regression; None for unsupervised learning. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). - **fit_params : dict of string -> object + **fit_params : dict of str -> object Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator @@ -860,14 +862,14 @@ class GridSearchCV(BaseSearchCV): or ``scoring`` must be passed. param_grid : dict or list of dictionaries - Dictionary with parameters names (string) as keys and lists of + Dictionary with parameters names (`str`) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings. - scoring : string, callable, list/tuple, dict or None, default: None - A single string (see :ref:`scoring_parameter`) or a callable + scoring : str, callable, list/tuple or dict, default=None + A single str (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. For evaluating multiple metrics, either give a list of (unique) strings @@ -881,13 +883,13 @@ class GridSearchCV(BaseSearchCV): If None, the estimator's score method is used. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - pre_dispatch : int, or string, optional + pre_dispatch : int, or str, default=n_jobs Controls the number of jobs that get dispatched during parallel execution. 
Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched @@ -901,10 +903,10 @@ class GridSearchCV(BaseSearchCV): - An int, giving the exact number of total jobs that are spawned - - A string, giving an expression as a function of n_jobs, + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' - iid : boolean, default=False + iid : bool, default=False If True, return the average score across folds, weighted by the number of samples in each test set. In this case, the data is assumed to be identically distributed across the folds, and the loss minimized is @@ -913,7 +915,7 @@ class GridSearchCV(BaseSearchCV): .. deprecated:: 0.22 Parameter ``iid`` is deprecated in 0.22 and will be removed in 0.24 - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -932,11 +934,11 @@ class GridSearchCV(BaseSearchCV): .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - refit : boolean, string, or callable, default=True + refit : bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. - For multiple metric evaluation, this needs to be a string denoting the + For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. @@ -965,13 +967,13 @@ class GridSearchCV(BaseSearchCV): verbose : integer Controls the verbosity: the higher, the more messages. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. Default is ``np.nan``. + step, which will always raise the error. - return_train_score : boolean, default=False + return_train_score : bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. Computing training scores is used to get insights on how different @@ -1185,7 +1187,7 @@ class RandomizedSearchCV(BaseSearchCV): or ``scoring`` must be passed. param_distributions : dict or list of dicts - Dictionary with parameters names (string) as keys and distributions + Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. @@ -1196,8 +1198,8 @@ class RandomizedSearchCV(BaseSearchCV): Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution. - scoring : string, callable, list/tuple, dict or None, default: None - A single string (see :ref:`scoring_parameter`) or a callable + scoring : str, callable, list/tuple or dict, default=None + A single str (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. For evaluating multiple metrics, either give a list of (unique) strings @@ -1211,13 +1213,13 @@ class RandomizedSearchCV(BaseSearchCV): If None, the estimator's score method is used. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - pre_dispatch : int, or string, optional + pre_dispatch : int, or str, default=None Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched @@ -1231,10 +1233,10 @@ class RandomizedSearchCV(BaseSearchCV): - An int, giving the exact number of total jobs that are spawned - - A string, giving an expression as a function of n_jobs, + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' - iid : boolean, default=False + iid : bool, default=False If True, return the average score across folds, weighted by the number of samples in each test set. In this case, the data is assumed to be identically distributed across the folds, and the loss minimized is @@ -1243,10 +1245,9 @@ class RandomizedSearchCV(BaseSearchCV): .. deprecated:: 0.22 Parameter ``iid`` is deprecated in 0.22 and will be removed in 0.24 - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, @@ -1262,11 +1263,11 @@ class RandomizedSearchCV(BaseSearchCV): .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - refit : boolean, string, or callable, default=True + refit : bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. - For multiple metric evaluation, this needs to be a string denoting the + For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. @@ -1295,20 +1296,20 @@ class RandomizedSearchCV(BaseSearchCV): verbose : integer Controls the verbosity: the higher, the more messages. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. - error_score : 'raise' or numeric + error_score : 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. Default is ``np.nan``. + step, which will always raise the error. - return_train_score : boolean, default=False + return_train_score : bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. 
Computing training scores is used to get insights on how different diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 6bab796fb54a7..0ee1edd1766ad 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -56,14 +56,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, of length n_samples + y : array-like of shape (n_samples,) The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. @@ -167,7 +167,7 @@ def get_n_splits(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. @@ -251,7 +251,7 @@ def get_n_splits(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. @@ -305,14 +305,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. @@ -376,11 +376,11 @@ class KFold(_BaseKFold): .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. - shuffle : boolean, optional + shuffle : bool, default=False Whether to shuffle the data before splitting into batches. Note that the samples within each split will not be shuffled. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Only used when ``shuffle`` is True. This should be left to None if ``shuffle`` is False. Pass an int for reproducible output across multiple @@ -542,14 +542,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,), optional + y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. @@ -583,11 +583,11 @@ class StratifiedKFold(_BaseKFold): .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. - shuffle : boolean, optional + shuffle : bool, default=False Whether to shuffle each class's samples before splitting into batches. 
Note that the samples within each split will not be shuffled. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Only used when ``shuffle`` is True. This should be left to None if ``shuffle`` is False. Pass an int for reproducible output across multiple @@ -700,7 +700,7 @@ def split(self, X, y, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. @@ -708,7 +708,7 @@ def split(self, X, y, groups=None): hence ``np.zeros(n_samples)`` may be used as a placeholder for ``X`` instead of actual training data. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) The target variable for supervised learning problems. Stratification is done based on the y labels. @@ -758,7 +758,7 @@ class TimeSeriesSplit(_BaseKFold): .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. - max_train_size : int, optional + max_train_size : int, default=None Maximum size for a single training set. Examples @@ -796,14 +796,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Always ignored, exists for compatibility. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. Yields @@ -902,7 +902,7 @@ def get_n_splits(self, X=None, y=None, groups=None): y : object Always ignored, exists for compatibility. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. This 'groups' parameter must always be specified to calculate the number of splits, though the other parameters can be @@ -923,14 +923,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, of length n_samples, optional + y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. @@ -1033,7 +1033,7 @@ def get_n_splits(self, X=None, y=None, groups=None): y : object Always ignored, exists for compatibility. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. This 'groups' parameter must always be specified to calculate the number of splits, though the other parameters can be @@ -1054,14 +1054,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. 
- y : array-like, of length n_samples, optional + y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. @@ -1090,7 +1090,7 @@ class _RepeatedSplits(metaclass=ABCMeta): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1124,10 +1124,10 @@ def split(self, X, y=None, groups=None): Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, of length n_samples + y : array-like of length n_samples The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. @@ -1161,7 +1161,7 @@ def get_n_splits(self, X=None, y=None, groups=None): Always ignored, exists for compatibility. ``np.zeros(n_samples)`` may be used as a placeholder. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. @@ -1194,7 +1194,7 @@ class RepeatedKFold(_RepeatedSplits): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls.See :term:`Glossary `. @@ -1246,7 +1246,7 @@ class RepeatedStratifiedKFold(_RepeatedSplits): n_repeats : int, default=10 Number of times cross-validator needs to be repeated. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1300,14 +1300,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,), optional + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. @@ -1371,23 +1371,23 @@ class ShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_splits : int, default 10 + n_splits : int, default=10 Number of re-shuffling & splitting iterations. - test_size : float, int, None, default=None + test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.1. 
- train_size : float, int, or None, default=None + train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1469,26 +1469,25 @@ class GroupShuffleSplit(ShuffleSplit): Parameters ---------- - n_splits : int (default 5) + n_splits : int, default=5 Number of re-shuffling & splitting iterations. - test_size : float, int, None, optional (default=None) + test_size : float, int, default=0.2 If float, should be between 0.0 and 1.0 and represent the proportion of groups to include in the test split (rounded up). If int, represents the absolute number of test groups. If None, the value is - set to the complement of the train size. By default, the value is set - to 0.2. + set to the complement of the train size. The default will change in version 0.21. It will remain 0.2 only if ``train_size`` is unspecified, otherwise it will complement the specified ``train_size``. - train_size : float, int, or None, default is None + train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the groups to include in the train split. If int, represents the absolute number of train groups. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1539,14 +1538,14 @@ def split(self, X, y=None, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,), optional + y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. - groups : array-like, with shape (n_samples,) + groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. @@ -1584,23 +1583,23 @@ class StratifiedShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_splits : int, default 10 + n_splits : int, default=10 Number of re-shuffling & splitting iterations. - test_size : float, int, None, optional (default=None) + test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.1. - train_size : float, int, or None, default is None + train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. 
- random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -1702,7 +1701,7 @@ def split(self, X, y, groups=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. @@ -1710,7 +1709,7 @@ def split(self, X, y, groups=None): hence ``np.zeros(n_samples)`` may be used as a placeholder for ``X`` instead of actual training data. - y : array-like, shape (n_samples,) or (n_samples, n_labels) + y : array-like of shape (n_samples,) or (n_samples, n_labels) The target variable for supervised learning problems. Stratification is done based on the y labels. @@ -1817,7 +1816,7 @@ class PredefinedSplit(BaseCrossValidator): Parameters ---------- - test_fold : array-like, shape (n_samples,) + test_fold : array-like of shape (n_samples,) The entry ``test_fold[i]`` represents the index of the test set that sample ``i`` belongs to. It is possible to exclude sample ``i`` from any test set (i.e. include sample ``i`` in every training set) by @@ -1964,11 +1963,10 @@ def check_cv(cv=5, y=None, classifier=False): Parameters ---------- - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - - - None, to use the default 5-fold cross-validation, + - None, to use the default 5-fold cross validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. @@ -1983,10 +1981,10 @@ def check_cv(cv=5, y=None, classifier=False): .. versionchanged:: 0.22 ``cv`` default value changed from 3-fold to 5-fold. - y : array-like, optional + y : array-like, default=None The target variable for supervised learning problems. - classifier : boolean, optional, default False + classifier : bool, default=False Whether the task is a classification task, in which case stratified KFold will be used. @@ -2030,29 +2028,29 @@ def train_test_split(*arrays, **options): Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. - test_size : float, int or None, optional (default=None) + test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.25. - train_size : float, int, or None, (default=None) + train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, default=None + random_state : int or RandomState instance, default=None Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. - shuffle : boolean, optional (default=True) + shuffle : bool, default=True Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None. 
- stratify : array-like or None (default=None) + stratify : array-like, default=None If not None, data is split in a stratified fashion, using this as the class labels. From a429bd5728bc88e0ab10917b12aa786c8bb20f2a Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Jan 2020 22:35:09 +0100 Subject: [PATCH 257/448] DOC fix default values in isotonic module (#16198) --- sklearn/isotonic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 173a747b927c2..dbaa21b05f40d 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -91,10 +91,10 @@ def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None, Parameters ---------- - y : iterable of floats + y : array-like of shape (n_samples,) The data. - sample_weight : iterable of floats, optional, default: None + sample_weight : array-like of shape (n_samples,), default=None Weights on each point of the regression. If None, weight is set to 1 (equal weights). @@ -160,13 +160,13 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): Parameters ---------- - y_min : optional, default: None + y_min : float, default=None If not None, set the lowest value of the fit to y_min. - y_max : optional, default: None + y_max : float, default=None If not None, set the highest value of the fit to y_max. - increasing : boolean or string, optional, default: True + increasing : bool or string, default=True If boolean, whether or not to fit the isotonic regression with y increasing or decreasing. @@ -174,7 +174,7 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): increase or decrease based on the Spearman correlation estimate's sign. - out_of_bounds : string, optional, default: "nan" + out_of_bounds : str, default="nan" The ``out_of_bounds`` parameter handles how x-values outside of the training domain are handled. When set to "nan", predicted y-values will be NaN. When set to "clip", predicted y-values will be From 3424f72469c0ece8cb71011c61860c1945904a2a Mon Sep 17 00:00:00 2001 From: Hye Sung Jung Date: Mon, 27 Jan 2020 19:52:17 -0600 Subject: [PATCH 258/448] DOC spelling fixes (#16247) --- doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py | 2 +- doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py | 2 +- examples/mixture/plot_gmm_sin.py | 2 +- sklearn/compose/_column_transformer.py | 2 +- sklearn/svm/_base.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py index 11b1ff07acf7e..23299f5f01b3d 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py @@ -2,7 +2,7 @@ Sentiment analysis can be casted as a binary text classification problem, that is fitting a linear classifier on features extracted from the text -of the user messages so as to guess wether the opinion of the author is +of the user messages so as to guess whether the opinion of the author is positive or negative. In this examples we will use a movie review dataset. 
diff --git a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py index 9f747694064ac..434bece341975 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py +++ b/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py @@ -2,7 +2,7 @@ Sentiment analysis can be casted as a binary text classification problem, that is fitting a linear classifier on features extracted from the text -of the user messages so as to guess wether the opinion of the author is +of the user messages so as to guess whether the opinion of the author is positive or negative. In this examples we will use a movie review dataset. diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py index f5fb2ded45120..1d436b93d15cc 100644 --- a/examples/mixture/plot_gmm_sin.py +++ b/examples/mixture/plot_gmm_sin.py @@ -26,7 +26,7 @@ similar to the first model where we arbitrarily decided to fix the number of components to 10. -Which model is the best is a matter of subjective judgement: do we want to +Which model is the best is a matter of subjective judgment: do we want to favor models that only capture the big picture to summarize and explain most of the structure of the data while ignoring the details or do we prefer models that closely follow the high density regions of the signal? diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 0b6a7363686a9..e39c859f20fd1 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -130,7 +130,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): objects. sparse_output_ : boolean - Boolean flag indicating wether the output of ``transform`` is a + Boolean flag indicating whether the output of ``transform`` is a sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. 
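To make the ``sparse_output_`` description above concrete, a small illustrative sketch (not part of the patch; the toy column is made up) of how ``sparse_threshold`` decides whether the stacked output stays sparse::

    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([["a"], ["b"], ["a"], ["c"]], dtype=object)

    for threshold in (0.3, 0.5):
        ct = ColumnTransformer([("onehot", OneHotEncoder(), [0])],
                               sparse_threshold=threshold)
        ct.fit_transform(X)
        # The one-hot block has density ~0.33: the result is kept sparse
        # only when the overall density is below sparse_threshold.
        print(threshold, ct.sparse_output_)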
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index e83ee8ada9d57..43dc8b428e4b3 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -749,7 +749,7 @@ def _get_liblinear_solver_type(multi_class, penalty, loss, dual): """ # nested dicts containing level 1: available loss functions, # level2: available penalties for the given loss function, - # level3: wether the dual solver is available for the specified + # level3: whether the dual solver is available for the specified # combination of loss function and penalty _solver_type_dict = { 'logistic_regression': { From a5b689429d519038ed9de07c13cdbde7b83c767e Mon Sep 17 00:00:00 2001 From: waelbenamara <34647652+waelbenamara@users.noreply.github.com> Date: Tue, 28 Jan 2020 07:30:19 +0100 Subject: [PATCH 259/448] MAINT/EXA replaced deprecated NavigationToolbar2TkAgg (#16187) --- examples/applications/svm_gui.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/applications/svm_gui.py b/examples/applications/svm_gui.py index 46b7f7369a0fe..d085851422e18 100644 --- a/examples/applications/svm_gui.py +++ b/examples/applications/svm_gui.py @@ -22,9 +22,14 @@ import matplotlib matplotlib.use('TkAgg') - from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg -from matplotlib.backends.backend_tkagg import NavigationToolbar2TkAgg +try: + from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk +except ImportError: + # NavigationToolbar2TkAgg was deprecated in matplotlib 2.2 + from matplotlib.backends.backend_tkagg import ( + NavigationToolbar2TkAgg as NavigationToolbar2Tk + ) from matplotlib.figure import Figure from matplotlib.contour import ContourSet @@ -144,11 +149,15 @@ def __init__(self, root, controller): ax.set_xlim((x_min, x_max)) ax.set_ylim((y_min, y_max)) canvas = FigureCanvasTkAgg(f, master=root) - canvas.show() + try: + canvas.draw() + except AttributeError: + # support for matplotlib (1.*) + canvas.show() canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) canvas.mpl_connect('button_press_event', self.onclick) - toolbar = NavigationToolbar2TkAgg(canvas, root) + toolbar = NavigationToolbar2Tk(canvas, root) toolbar.update() self.controllbar = ControllBar(root, controller) self.f = f From 002f891a33b612be389d9c488699db5689753ef4 Mon Sep 17 00:00:00 2001 From: lopusz Date: Tue, 28 Jan 2020 11:55:52 +0100 Subject: [PATCH 260/448] DOC Removing word optional from parameter docstring in pipeline module (#16208) --- sklearn/pipeline.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index af2feed1a861e..eb42c43a98905 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -56,7 +56,7 @@ class Pipeline(_BaseComposition): chained, in the order in which they are chained, with the last object an estimator. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of @@ -140,7 +140,7 @@ def get_params(self, deep=True): Parameters ---------- - deep : boolean, optional + deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. 
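Relating to the ``memory`` and ``verbose`` parameters documented in the pipeline diff above, a hedged usage sketch (not part of the patch; the step names and temporary cache directory are arbitrary)::

    from tempfile import mkdtemp
    from sklearn.datasets import load_iris
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    cachedir = mkdtemp()
    # A path (or an object with the joblib.Memory interface) passed as
    # `memory` caches the fitted transformers, which helps when fitting
    # them is expensive; verbose=True prints the time spent on each step.
    pipe = Pipeline([("scale", StandardScaler()), ("svc", SVC())],
                    memory=cachedir, verbose=True)
    X, y = load_iris(return_X_y=True)
    pipe.fit(X, y)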
@@ -502,7 +502,7 @@ def score_samples(self, X): Returns ------- - y_score : ndarray, shape (n_samples,) + y_score : ndarray of shape (n_samples,) """ Xt = X for _, _, transformer in self._iter(with_final=False): @@ -664,7 +664,7 @@ def make_pipeline(*steps, **kwargs): ---------- *steps : list of estimators. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of @@ -674,7 +674,7 @@ def make_pipeline(*steps, **kwargs): inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. - verbose : boolean, default=False + verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. @@ -773,17 +773,17 @@ class FeatureUnion(TransformerMixin, _BaseComposition): .. versionchanged:: 0.22 Deprecated `None` as a transformer in favor of 'drop'. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - transformer_weights : dict, optional + transformer_weights : dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. - verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. @@ -818,7 +818,7 @@ def get_params(self, deep=True): Parameters ---------- - deep : boolean, optional + deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -900,7 +900,7 @@ def fit(self, X, y=None, **fit_params): X : iterable or array-like, depending on transformers Input data, used to fit transformers. - y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples, n_outputs), default=None Targets for supervised learning. Returns @@ -924,12 +924,13 @@ def fit_transform(self, X, y=None, **fit_params): X : iterable or array-like, depending on transformers Input data to be transformed. - y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples, n_outputs), default=None Targets for supervised learning. Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : array-like or sparse matrix of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. """ @@ -975,7 +976,8 @@ def transform(self, X): Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : array-like or sparse matrix of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. """ @@ -1010,13 +1012,13 @@ def make_union(*transformers, **kwargs): ---------- *transformers : list of estimators - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` for more details. - verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. From 1382831f4158f358701a00c6ad216d7814c5716f Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 28 Jan 2020 17:13:58 +0100 Subject: [PATCH 261/448] DOC clarifications on the release process (#15759) * some extra comments * reorganize steps * more info * more clarification * further instructions * be cautious with tagging * add pypi note * git log copy * upstream * more explanation * fix typo * clarify * typo * further clarifications, address comments * add link to pep101 * explain branch names * apply comments * cover more comments * clarify branching, remove Dash ref * Update maintainer.rst * address some of Roman's comments * address some of Nocolas's suggestions * trying to address comments * CLN Adds reference to conda-forge recipe/meta.yml * DOC Explicitly push the tag by name * DOC Explicitly git add folders before commiting * fixed ref and removed redundant sentence (I think) * Added note about making sure derecations are handled * fixed title headers, this file is not just about releasing * removed redundant paragraph * major changes * minor comments Co-authored-by: Hanmin Qin Co-authored-by: Thomas J Fan Co-authored-by: Roman Yurchak Co-authored-by: Nicolas Hug --- doc/developers/maintainer.rst | 164 +++++++++++++++++++++++++--------- 1 file changed, 121 insertions(+), 43 deletions(-) diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index 66d5250af1644..f400989a7d877 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -1,14 +1,27 @@ Maintainer / core-developer information ======================================== + +Releasing +--------- + +This section is about preparing a major release, incrementing the minor +version, or a bug fix release incrementing the patch version. Our convention is +that we release one or more release candidates (0.RRrcN) before releasing the +final distributions. We follow the `PEP101 +`_ to indicate release candidates, +post, and minor releases. + Before a release ----------------- +................ 1. Update authors table:: $ cd build_tools; make authors; cd .. - and commit. + and commit. This is only needed if the authors have changed since the last + release. This step is sometimes done independent of the release. This + updates the maintainer list and is not the contributor list for the release. 2. Confirm any blockers tagged for the milestone are resolved, and that other issues tagged for the milestone can be postponed. @@ -17,61 +30,96 @@ Before a release change log is reasonably well curated. Some tools for these tasks include: - ``maint_tools/sort_whats_new.py`` can put what's new entries into - sections. + sections. It's not perfect, and requires manual checking of the changes. + If the whats new list is well curated, it may not be necessary. - The ``maint_tools/whats_missing.sh`` script may be used to identify pull requests that were merged but likely missing from What's New. -Preparing a bug-fix-release -........................... +4. Make sure the deprecations, FIXME and TODOs tagged for the release have + been taken care of. + +**Permissions** + +The release manager requires a set of permissions on top of the usual +permissions given to maintainers, which includes: + +- *maintainer* role on ``scikit-learn`` projects on ``pypi.org`` and + ``test.pypi.org``, separately. 
+- become a member of the *scikit-learn* team on conda-forge by editing the + ``recipe/meta.yaml`` file on + ``https://github.com/conda-forge/scikit-learn-feedstock`` +- *maintainer* on ``https://github.com/MacPython/scikit-learn-wheels`` + -Since any commits to a released branch (e.g. 0.999.X) will automatically update -the web site documentation, it is best to develop a bug-fix release with a pull -request in which 0.999.X is the base. It also allows you to keep track of any -tasks towards release with a TO DO list. +.. _preparing_a_release_pr: -Most development of the bug fix release, and its documentation, should -happen in master to avoid asynchrony. To select commits from master for use in -the bug fix (version 0.999.3), you can use:: +Preparing a release PR +...................... +Releasing the first RC of e.g. version `0.99` involves creating the release +branch `0.99.X` directly on the main repo, where `X` really is the letter X, +**not a placeholder**. This is considered the *feature freeze*. The +development for the major and minor releases of 0.99 should +**also** happen under `0.99.X`. Each release (rc, major, or minor) is a tag +under that branch. + +In terms of including changes, the first RC ideally counts as a *feature +freeze*. Each coming release candidate and the final release afterwards will +include minor documentation changes and bug fixes. Any major enhancement or +feature should be excluded. + +The minor releases should include bug fixes and some relevant documentation +changes only. Any PR resulting in a behavior change which is not a bug fix +should be excluded. + +First, create a branch, **on your own fork** (to release e.g. `0.999.3`):: + + $ # assuming master and upstream/master are the same $ git checkout -b release-0.999.3 master - $ git rebase -i 0.999.X -Then pick the commits for release and resolve any issues, and create a pull -request with 0.999.X as base. Add a commit updating ``sklearn.__version__``. -Additional commits can be cherry-picked into the ``release-0.999.3`` branch -while preparing the release. +Then, create a PR **to the** `scikit-learn/0.999.X` **branch** (not to +master!) with all the desired changes:: + + $ git rebase -i upstream/0.999.2 + +It's nice to have a copy of the ``git rebase -i`` log in the PR to help others +understand what's included. Making a release ----------------- +................ + +0. Create the release branch on the main repo, if it does not exist. This is + done only once, as the major and minor releases happen on the same branch:: -1. Update docs: + $ git checkout -b 0.99.X + + Again, `X` is literal here, and `99` is replaced by the release number. + The branches are called ``0.19.X``, ``0.20.X``, etc. + +1. Update docs. Note that this is for the final release, not necessarily for + the RC releases. These changes should be made in master and cherry-picked + into the release branch, only before the final release. - Edit the doc/whats_new.rst file to add release title and commit statistics. You can retrieve commit statistics with:: $ git shortlog -s 0.99.33.. | cut -f2- | sort --ignore-case | tr '\n' ';' | sed 's/;/, /g;s/, $//' - - Update the release date in whats_new.rst + - Update the release date in ``whats_new.rst`` - - Edit the doc/index.rst to change the 'News' entry of the front page. - - - Note that these changes should be made in master and cherry-picked into - the release branch. + - Edit the doc/templates/index.html to change the 'News' entry of the front + page. 2. 
On the branch for releasing, update the version number in - sklearn/__init__.py, the ``__version__`` variable by removing ``dev*`` only - when ready to release. - On master, increment the version in the same place (when branching for - release). - -3. Create the tag and push it:: + `sklearn/__init__.py`, the ``__version__`` variable by removing ``dev*`` + only when ready to release. On master, increment the version in the same + place (when branching for release). This means while we're in the release + candidate period, the latest stable is two versions behind the master + branch, instead of one. - $ git tag -a 0.999 - - $ git push git@github.com:scikit-learn/scikit-learn.git --tags - -4. Create the source tarball: +3. At this point all relevant PRs should have been merged into the `0.99.X` + branch. Create the source tarball: - Wipe clean your repo:: @@ -81,10 +129,32 @@ Making a release $ python setup.py sdist + - You can also test a binary dist build using:: + + $ python setup.py bdist_wheel + + - You can test if PyPi is going to accept the package using:: + + $ twine check dist/* + + You can run ``twine check`` after step 5 (fetching artifacts) as well. + The result should be in the `dist/` folder. We will upload it later with the wheels. Check that you can install it in a new virtualenv and that the tests pass. +4. Proceed with caution. Ideally, tags should be created when you're almost + certain that the release is ready, since adding a tag to the main repo can + trigger certain automated processes. You can test upload the ``sdist`` to + ``test.pypi.org``, and test the next step by setting ``BUILD_COMMIT`` to the + branch name (``0.99.X`` for instance) in a PR to the wheel building repo. + Once all works, you can proceed with tagging. Create the tag and push it (if + it's an RC, it can be ``0.xxrc1`` for instance):: + + $ git tag -a 0.99 # in the 0.99.X branch + + $ git push git@github.com:scikit-learn/scikit-learn.git 0.99 + 5. Update the dependency versions and set ``BUILD_COMMIT`` variable to the release tag at: @@ -94,16 +164,20 @@ Making a release packages and upload them to PyPI by running the following commands in the scikit-learn source folder (checked out at the release tag):: - $ rm -r dist + $ rm -r dist # only if there's anything other than the sdist tar.gz there $ pip install -U wheelhouse_uploader twine $ python setup.py fetch_artifacts 6. Check the content of the `dist/` folder: it should contain all the wheels - along with the source tarball ("scikit-learn-XXX.tar.gz"). + along with the source tarball ("scikit-learn-RRR.tar.gz"). Make sure that you do not have developer versions or older versions of the scikit-learn package in that folder. 
+ Before uploading to pypi, you can test upload to test.pypi.org:: + + $ twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/* + Upload everything at once to https://pypi.org:: $ twine upload dist/* @@ -119,21 +193,25 @@ Making a release $ git checkout master $ rm stable $ ln -s 0.999 stable - $ sed -i "s/latestStable = '.*/latestStable = '0.999';" versionwarning.js - $ git commit -m "Update stable to point to 0.999" stable + $ sed -i "s/latestStable = '.*/latestStable = '0.999';/" versionwarning.js + $ git add stable/ versionwarning.js + $ git commit -m "Update stable to point to 0.999" $ git push origin master The following GitHub checklist might be helpful in a release PR:: * [ ] update news and what's new date in master and release branch * [ ] create tag - * [ ] update dependencies and release tag at https://github.com/MacPython/scikit-learn-wheels + * [ ] update dependencies and release tag at + https://github.com/MacPython/scikit-learn-wheels * [ ] twine the wheels to PyPI when that's green * [ ] https://github.com/scikit-learn/scikit-learn/releases draft - * [ ] confirm bot detected at https://github.com/conda-forge/scikit-learn-feedstock and wait for merge + * [ ] confirm bot detected at + https://github.com/conda-forge/scikit-learn-feedstock and wait for merge * [ ] https://github.com/scikit-learn/scikit-learn/releases publish - * [ ] announce on mailing list - * [ ] (regenerate Dash docs: https://github.com/Kapeli/Dash-User-Contributions/tree/master/docsets/Scikit) + * [ ] fix the binder release version in ``.binder/requirement.txt`` (see + #15847) + * [ ] announce on mailing list and on twitter The scikit-learn.org web site ----------------------------- From cc2fbeddfa7f462cad230edcbffaf14e0fa5f965 Mon Sep 17 00:00:00 2001 From: SanthoshBala18 Date: Tue, 28 Jan 2020 23:37:43 +0530 Subject: [PATCH 262/448] =?UTF-8?q?BUG=20max=5Fdepth=3D1=20should=20be=20d?= =?UTF-8?q?ecision=20stump=20in=20HistGradientBoosti=E2=80=A6=20(#16182)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/whats_new/v0.23.rst | 7 +++++++ .../_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- sklearn/ensemble/_hist_gradient_boosting/grower.py | 7 ++++--- .../tests/test_gradient_boosting.py | 3 +-- .../_hist_gradient_boosting/tests/test_grower.py | 12 +++++++++++- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 21bf8a1bd3329..75085088ab1a5 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -90,6 +90,13 @@ Changelog :user:`Reshama Shaikh `, and :user:`Chiara Marmo `. +- |Fix| Changed the convention for `max_depth` parameter of + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. The depth now corresponds to + the number of edges to go from the root to the deepest leaf. + Stumps (trees with one split) are now allowed. + :pr: `16182` by :user:`Santhosh B ` + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index cb708ecc576e7..f3efd3c897a4c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -689,8 +689,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): than 1. If None, there is no maximum limit. 
max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Must be strictly greater - than 1. Depth isn't constrained by default. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value @@ -872,8 +872,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, than 1. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Must be strictly greater - than 1. Depth isn't constrained by default. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c7d303b8f6201..cd0a4ed1cb34c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -135,7 +135,8 @@ class TreeGrower: maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. min_gain_to_split : float, optional (default=0.) 
@@ -230,9 +231,9 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, if max_leaf_nodes is not None and max_leaf_nodes <= 1: raise ValueError('max_leaf_nodes={} should not be' ' smaller than 2'.format(max_leaf_nodes)) - if max_depth is not None and max_depth <= 1: + if max_depth is not None and max_depth < 1: raise ValueError('max_depth={} should not be' - ' smaller than 2'.format(max_depth)) + ' smaller than 1'.format(max_depth)) if min_samples_leaf < 1: raise ValueError('min_samples_leaf={} should ' 'not be smaller than 1'.format(min_samples_leaf)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 87950eab38a97..b607cdd23b6c9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -31,8 +31,7 @@ ({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'), ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'), ({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'), - ({'max_depth': 0}, 'max_depth=0 should not be smaller than 2'), - ({'max_depth': 1}, 'max_depth=1 should not be smaller than 2'), + ({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'), ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 0cc301b7b1b36..d770b50e7aa30 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -257,7 +257,14 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): assert len(grower.finalized_leaves) == 1 -@pytest.mark.parametrize('max_depth', [2, 3]) +def assert_is_stump(grower): + # To assert that stumps are created when max_depth=1 + for leaf in (grower.root.left_child, grower.root.right_child): + assert leaf.left_child is None + assert leaf.right_child is None + + +@pytest.mark.parametrize('max_depth', [1, 2, 3]) def test_max_depth(max_depth): # Make sure max_depth parameter works as expected rng = np.random.RandomState(seed=0) @@ -279,6 +286,9 @@ def test_max_depth(max_depth): depth = max(leaf.depth for leaf in grower.finalized_leaves) assert depth == max_depth + if max_depth == 1: + assert_is_stump(grower) + def test_input_validation(): From 8ea05cbd4190c2a6fbc65a8026b40301ba62db3f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Jan 2020 13:47:07 -0500 Subject: [PATCH 263/448] [MRG] API Deprecates probA_ and probB_ in OneClassSVM and svm.SVR (#15558) --- doc/whats_new/v0.23.rst | 7 +++++++ sklearn/svm/_base.py | 26 +++++++++++++++++--------- sklearn/svm/_classes.py | 29 +++++++++++++++++++++++++++++ sklearn/svm/tests/test_svm.py | 15 +++++++++++++++ 4 files changed, 68 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 75085088ab1a5..67f76446cbf80 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -178,6 +178,13 @@ Changelog - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. +:mod:`sklearn.svm` +.................. 
+ +- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and + `probB_`, are now deprecated as they were not useful. :pr:`15558` by + `Thomas Fan`_. + :mod:`sklearn.tree` ................... diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 43dc8b428e4b3..ea9b52ba4ef89 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -246,8 +246,8 @@ def _dense_fit(self, X, y, sample_weight, solver_type, kernel, # we don't pass **self.get_params() to allow subclasses to # add other parameters to __init__ self.support_, self.support_vectors_, self._n_support, \ - self.dual_coef_, self.intercept_, self.probA_, \ - self.probB_, self.fit_status_ = libsvm.fit( + self.dual_coef_, self.intercept_, self._probA, \ + self._probB, self.fit_status_ = libsvm.fit( X, y, svm_type=solver_type, sample_weight=sample_weight, class_weight=self.class_weight_, kernel=kernel, C=self.C, @@ -270,7 +270,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, self.support_, self.support_vectors_, dual_coef_data, \ self.intercept_, self._n_support, \ - self.probA_, self.probB_, self.fit_status_ = \ + self._probA, self._probB, self.fit_status_ = \ libsvm_sparse.libsvm_sparse_train( X.shape[1], X.data, X.indices, X.indptr, y, solver_type, kernel_type, self.degree, self._gamma, self.coef0, self.tol, @@ -334,7 +334,7 @@ def _dense_predict(self, X): return libsvm.predict( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, svm_type=svm_type, kernel=kernel, + self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, coef0=self.coef0, gamma=self._gamma, cache_size=self.cache_size) @@ -359,7 +359,7 @@ def _sparse_predict(self, X): C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _compute_kernel(self, X): """Return the data transformed by a callable kernel""" @@ -413,7 +413,7 @@ def _dense_decision_function(self, X): return libsvm.decision_function( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, + self._probA, self._probB, svm_type=LIBSVM_IMPL.index(self._impl), kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma) @@ -438,7 +438,7 @@ def _sparse_decision_function(self, X): self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _validate_for_predict(self, X): check_is_fitted(self) @@ -691,7 +691,7 @@ def _dense_predict_proba(self, X): pprob = libsvm.predict_proba( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, + self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma) @@ -717,7 +717,7 @@ def _sparse_predict_proba(self, X): self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _get_coef(self): if self.dual_coef_.shape[0] == 1: @@ -734,6 +734,14 @@ def _get_coef(self): return coef + @property + def probA_(self): + return self._probA + + @property + def probB_(self): + return self._probB + def _get_liblinear_solver_type(multi_class, penalty, loss, dual): """Find the liblinear magic number for the 
solver. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 698acb6ae68b3..d21e8523cac2c 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -8,6 +8,7 @@ from ..utils import check_X_y from ..utils.validation import _num_samples from ..utils.multiclass import check_classification_targets +from ..utils.deprecation import deprecated class LinearSVC(BaseEstimator, LinearClassifierMixin, @@ -968,6 +969,20 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) + @deprecated( + "The probA_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probA_(self): + return self._probA + + @deprecated( + "The probB_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probB_(self): + return self._probB + class NuSVR(RegressorMixin, BaseLibSVM): """Nu Support Vector Regression. @@ -1287,3 +1302,17 @@ def predict(self, X): """ y = super().predict(X) return np.asarray(y, dtype=np.intp) + + @deprecated( + "The probA_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probA_(self): + return self._probA + + @deprecated( + "The probB_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probB_(self): + return self._probB diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index d789be7f26383..4360c818e0bd7 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1233,3 +1233,18 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == reg.support_vectors_.shape[0] assert reg.n_support_.size == 1 assert reg.n_support_ == 4 + + +# TODO: Remove in 0.25 when probA_ and probB_ are deprecated +@pytest.mark.parametrize("SVMClass, data", [ + (svm.OneClassSVM, (X, )), + (svm.SVR, (X, Y)) +]) +@pytest.mark.parametrize("deprecated_prob", ["probA_", "probB_"]) +def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): + clf = SVMClass().fit(*data) + + msg = ("The {} attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.").format(deprecated_prob) + with pytest.warns(FutureWarning, match=msg): + getattr(clf, deprecated_prob) From b92393ab7892d6995489aa831293a7486dfbe926 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 28 Jan 2020 13:26:23 -0800 Subject: [PATCH 264/448] CI Action to auto assign issues (#16197) --- .github/workflows/assign.yml | 16 ++++++++++++++++ .github/workflows/unassign.yml | 14 ++++++++++++++ doc/developers/contributing.rst | 7 ++++++- 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/assign.yml create mode 100644 .github/workflows/unassign.yml diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml new file mode 100644 index 0000000000000..72643d1bf8ae8 --- /dev/null +++ b/.github/workflows/assign.yml @@ -0,0 +1,16 @@ + +name: Assign +on: + issue_comment: + types: created + +jobs: + one: + runs-on: ubuntu-latest + steps: + - if: github.event.comment.body == 'take' + name: + run: | + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository 
}}/issues/${{ github.event.issue.number }}/assignees + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted diff --git a/.github/workflows/unassign.yml b/.github/workflows/unassign.yml new file mode 100644 index 0000000000000..96f1360ba3144 --- /dev/null +++ b/.github/workflows/unassign.yml @@ -0,0 +1,14 @@ +name: Unassign +#Runs when a contributor has unassigned themselves from the issue and adds 'help wanted' and 'stalled' tags +on: + issues: + types: unassigned + +jobs: + one: + runs-on: ubuntu-latest + steps: + - name: + run: | + echo "Marking issue ${{ github.event.issue.number }} as stalled" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted","Stalled"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 16adf4a607d90..d098a80ae8eec 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -181,7 +181,12 @@ Contributing code If in doubt about duplicated work, or if you want to work on a non-trivial feature, it's recommended to first open an issue in the `issue tracker `_ - to get some feedbacks from core developers. + to get some feedbacks from core developers. + + One easy way to find an issue to work on is by applying the "help wanted" + label in your search. This lists all the issues that have been unclaimed + so far. In order to claim an issue for yourself, please comment exactly + ``take`` on it for the CI to automatically assign the issue to you. How to contribute ----------------- From c6fe262bc69ca27e3ba2d93af5193f2be2803997 Mon Sep 17 00:00:00 2001 From: Rick Mackenbach Date: Wed, 29 Jan 2020 10:33:42 +0100 Subject: [PATCH 265/448] ENH Improve error message for not fitted trees in plot_tree (#16253) --- sklearn/tree/_export.py | 4 ++++ sklearn/tree/tests/test_export.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 212ae4e309749..3197995818f81 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -26,6 +26,7 @@ import warnings + def _color_brew(n): """Generate n colors with equally spaced hues. @@ -174,6 +175,8 @@ def plot_tree(decision_tree, max_depth=None, feature_names=None, """ + check_is_fitted(decision_tree) + if rotate != 'deprecated': warnings.warn(("'rotate' has no effect and is deprecated in 0.23. 
" "It will be removed in 0.25."), @@ -571,6 +574,7 @@ def _make_tree(self, node_id, et, criterion, depth=0): def export(self, decision_tree, ax=None): import matplotlib.pyplot as plt from matplotlib.text import Annotation + if ax is None: ax = plt.gca() ax.clear() diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index f1c080dea4d2a..ad49f81fcf9ac 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -303,11 +303,11 @@ def test_precision(): # check impurity for finding in finditer(pattern, dot_data): assert (len(search(r"\.\d+", finding.group()).group()) == - precision + 1) + precision + 1) # check threshold for finding in finditer(r"<= \d+\.\d+", dot_data): assert (len(search(r"\.\d+", finding.group()).group()) == - precision + 1) + precision + 1) def test_export_text_errors(): @@ -459,3 +459,12 @@ def test_plot_tree_rotate_deprecation(pyplot): "It will be removed in 0.25.") with pytest.warns(FutureWarning, match=match): plot_tree(tree, rotate=True) + + +def test_not_fitted_tree(pyplot): + + # Testing if not fitted tree throws the correct error + clf = DecisionTreeRegressor() + out = StringIO() + with pytest.raises(NotFittedError): + plot_tree(clf, out) From 1fa689c78289296148ff3b281a6aedff950086e7 Mon Sep 17 00:00:00 2001 From: castor Date: Wed, 29 Jan 2020 11:08:11 +0100 Subject: [PATCH 266/448] [MRG] more informative random_state doc in multilayer_perceptron (#16258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modified random_state doc in MLP * Update sklearn/neural_network/_multilayer_perceptron.py Co-Authored-By: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> * Update sklearn/neural_network/_multilayer_perceptron.py Co-Authored-By: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- .../neural_network/_multilayer_perceptron.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 9cc66bedb46ce..90f9210db5d6b 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -793,11 +793,12 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving @@ -1185,11 +1186,12 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. 
- random_state : int, RandomState instance or None, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving From c33ce762d7ef4a429ee899b0569cde99fe2c3e62 Mon Sep 17 00:00:00 2001 From: Manimaran Date: Wed, 29 Jan 2020 15:47:43 +0530 Subject: [PATCH 267/448] DOC fix docstring in dbscan referencing glossary (#16267) --- sklearn/cluster/_dbscan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index a464e3951673a..dd1de3043d444 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -52,7 +52,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. + X may be a :term:`sparse graph `, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional From dfe444935a2a238b4e92dff11bc62b5949b537e3 Mon Sep 17 00:00:00 2001 From: Alonso Silva Allende Date: Wed, 29 Jan 2020 11:26:36 +0100 Subject: [PATCH 268/448] DOC fix conda command to check if compilers and llvm are installed (#16268) --- doc/developers/advanced_installation.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 8fd0f9ecf0273..6b4b0b1141755 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -252,7 +252,9 @@ scikit-learn from source:: You can check that the custom compilers are properly installed from conda forge using the following command:: - conda list compilers llvm-openmp + conda list + +which should include ``compilers`` and ``llvm-openmp``. The compilers meta-package will automatically set custom environment variables:: From 20a431fc41580a44bca6b65e2c6cbabe69ef8fe3 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 29 Jan 2020 11:34:50 +0100 Subject: [PATCH 269/448] DOC add a coc reference to the main page (#16262) --- doc/templates/index.html | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/templates/index.html b/doc/templates/index.html index 4f69829f413e1..f897ae5f7031c 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -183,6 +183,7 @@

             Community

               Questions? See FAQ and stackoverflow
               Mailing list: scikit-learn@python.org
               Gitter: gitter.im/scikit-learn
+              Communication on all channels should respect PSF's code of conduct.
  • From 84628b0f45938f25e67244676bd5926c79e642a5 Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Wed, 29 Jan 2020 11:48:21 +0100 Subject: [PATCH 270/448] ENH use utility _check_sample_weight in _BaseDiscreteNB (#16263) --- sklearn/naive_bayes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index dd1d9586db6e1..6238aa294530a 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -569,8 +569,9 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): # We convert it to np.float64 to support sample_weight consistently Y = Y.astype(np.float64, copy=False) if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) sample_weight = np.atleast_2d(sample_weight) - Y *= check_array(sample_weight).T + Y *= sample_weight.T class_prior = self.class_prior @@ -621,9 +622,9 @@ def fit(self, X, y, sample_weight=None): # this means we also don't have to cast X to floating point if sample_weight is not None: Y = Y.astype(np.float64, copy=False) - sample_weight = np.asarray(sample_weight) + sample_weight = _check_sample_weight(sample_weight, X) sample_weight = np.atleast_2d(sample_weight) - Y *= check_array(sample_weight).T + Y *= sample_weight.T class_prior = self.class_prior From a3ca523b2551455d71c9a701db0fca5064e054e9 Mon Sep 17 00:00:00 2001 From: SergioDSR Date: Wed, 29 Jan 2020 11:57:27 +0100 Subject: [PATCH 271/448] random_state for iforest update (#16259) * random_state for iforest update * improve wording --- sklearn/ensemble/_iforest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index df393b628bb02..d91052b27759f 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -107,10 +107,11 @@ class IsolationForest(OutlierMixin, BaseBagging): 0.24. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity of the tree building process. 
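To illustrate the reworded ``random_state`` description for :class:`IsolationForest` above, a minimal reproducibility sketch (illustrative only, using random toy data)::

    import numpy as np
    from sklearn.ensemble import IsolationForest

    X = np.random.RandomState(0).randn(200, 2)

    # With an integer random_state, the selection of features and split
    # values is reproducible across calls, so two fits give identical scores.
    scores_1 = IsolationForest(random_state=42).fit(X).score_samples(X)
    scores_2 = IsolationForest(random_state=42).fit(X).score_samples(X)
    assert np.allclose(scores_1, scores_2)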
From a58c6c8bf237041337977cc15e1f2990ae776318 Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Wed, 29 Jan 2020 11:57:49 +0100 Subject: [PATCH 272/448] DOC remove caching in wikipedia_principal_eigenvector example (#16271) --- examples/applications/wikipedia_principal_eigenvector.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py index da4234936a911..097bab6c7d4d5 100644 --- a/examples/applications/wikipedia_principal_eigenvector.py +++ b/examples/applications/wikipedia_principal_eigenvector.py @@ -42,8 +42,6 @@ from scipy import sparse -from joblib import Memory - from sklearn.decomposition import randomized_svd from urllib.request import urlopen @@ -74,8 +72,6 @@ # ############################################################################# # Loading the redirect files -memory = Memory(cachedir=".") - def index(redirects, index_map, k): """Find the index of an article name after redirect resolution""" @@ -124,8 +120,6 @@ def get_redirects(redirects_filename): return redirects -# disabling joblib as the pickling of large dicts seems much too slow -#@memory.cache def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None): """Extract the adjacency graph as a scipy sparse matrix From 10e7b2b0d8ca850b5e7367d08322bbaa73b774d7 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Wed, 29 Jan 2020 14:28:38 +0100 Subject: [PATCH 273/448] DOC Improve random_state descriptions for BaggingClassifier (#16264) and BaggingRegressor --- sklearn/ensemble/_bagging.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index ea4e5eedb6079..b1ae443e78bf1 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -464,13 +464,16 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): The number of base estimators in the ensemble. max_samples : int or float, default=1.0 - The number of samples to draw from X to train each base estimator. + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. max_features : int or float, default=1.0 - The number of features to draw from X to train each base estimator. + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. @@ -501,10 +504,12 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity when fitting and predicting. 
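A usage sketch (not part of the patch; the synthetic dataset is arbitrary) of the resampling options described in the ``BaggingClassifier`` docstring above: ``max_samples`` draws with replacement because ``bootstrap`` defaults to True, while ``max_features`` draws without replacement by default::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=200, random_state=0)

    # Each tree sees 50% of the samples (with replacement, bootstrap=True by
    # default) and 50% of the features (without replacement by default).
    clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                            n_estimators=10, max_samples=0.5,
                            max_features=0.5, random_state=0).fit(X, y)
    print(clf.score(X, y))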
@@ -866,13 +871,16 @@ class BaggingRegressor(RegressorMixin, BaseBagging): The number of base estimators in the ensemble. max_samples : int or float, default=1.0 - The number of samples to draw from X to train each base estimator. + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. max_features : int or float, default=1.0 - The number of features to draw from X to train each base estimator. + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. @@ -900,10 +908,12 @@ class BaggingRegressor(RegressorMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity when fitting and predicting. From 6a27d4dc6fe0e6bf44a797c99b9e17c34eed0e4a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jan 2020 08:54:02 -0500 Subject: [PATCH 274/448] DOC Better UG for calibration (#16175) --- doc/modules/calibration.rst | 267 ++++++++++++----------------- sklearn/calibration.py | 36 ++-- sklearn/metrics/_classification.py | 1 + 3 files changed, 122 insertions(+), 182 deletions(-) diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index 70286521e09c1..19df08ea3b1fe 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -19,9 +19,16 @@ Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance, a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, -approximately 80% actually belong to the positive class. The following plot compares -how well the probabilistic predictions of different classifiers are calibrated, -using :func:`calibration_curve`: +approximately 80% actually belong to the positive class. + +Calibration curves +------------------ + +The following plot compares how well the probabilistic predictions of +different classifiers are calibrated, using :func:`calibration_curve`. +The x axis represents the average predicted probability in each bin. The +y axis is the *fraction of positives*, i.e. the proportion of samples whose +class is the positive class (in each bin). .. figure:: ../auto_examples/calibration/images/sphx_glr_plot_compare_calibration_001.png :target: ../auto_examples/calibration/plot_compare_calibration.html @@ -35,177 +42,117 @@ with different biases per method: .. currentmodule:: sklearn.naive_bayes -* :class:`GaussianNB` tends to push probabilities to 0 or 1 (note the - counts in the histograms). 
This is mainly because it makes the assumption - that features are conditionally independent given the class, which is not - the case in this dataset which contains 2 redundant features. +:class:`GaussianNB` tends to push probabilities to 0 or 1 (note the counts +in the histograms). This is mainly because it makes the assumption that +features are conditionally independent given the class, which is not the +case in this dataset which contains 2 redundant features. .. currentmodule:: sklearn.ensemble -* :class:`RandomForestClassifier` shows the opposite behavior: the histograms - show peaks at approximately 0.2 and 0.9 probability, while probabilities close to - 0 or 1 are very rare. An explanation for this is given by Niculescu-Mizil - and Caruana [4]_: "Methods such as bagging and random forests that average - predictions from a base set of models can have difficulty making predictions - near 0 and 1 because variance in the underlying base models will bias - predictions that should be near zero or one away from these values. Because - predictions are restricted to the interval [0,1], errors caused by variance - tend to be one-sided near zero and one. For example, if a model should - predict p = 0 for a case, the only way bagging can achieve this is if all - bagged trees predict zero. If we add noise to the trees that bagging is - averaging over, this noise will cause some trees to predict values larger - than 0 for this case, thus moving the average prediction of the bagged - ensemble away from 0. We observe this effect most strongly with random - forests because the base-level trees trained with random forests have - relatively high variance due to feature subsetting." As a result, the - calibration curve also referred to as the reliability diagram (Wilks 1995 [5]_) shows a - characteristic sigmoid shape, indicating that the classifier could trust its - "intuition" more and return probabilities closer to 0 or 1 typically. +:class:`RandomForestClassifier` shows the opposite behavior: the histograms +show peaks at approximately 0.2 and 0.9 probability, while probabilities +close to 0 or 1 are very rare. An explanation for this is given by +Niculescu-Mizil and Caruana [1]_: "Methods such as bagging and random +forests that average predictions from a base set of models can have +difficulty making predictions near 0 and 1 because variance in the +underlying base models will bias predictions that should be near zero or one +away from these values. Because predictions are restricted to the interval +[0,1], errors caused by variance tend to be one-sided near zero and one. For +example, if a model should predict p = 0 for a case, the only way bagging +can achieve this is if all bagged trees predict zero. If we add noise to the +trees that bagging is averaging over, this noise will cause some trees to +predict values larger than 0 for this case, thus moving the average +prediction of the bagged ensemble away from 0. We observe this effect most +strongly with random forests because the base-level trees trained with +random forests have relatively high variance due to feature subsetting." As +a result, the calibration curve also referred to as the reliability diagram +(Wilks 1995 [2]_) shows a characteristic sigmoid shape, indicating that the +classifier could trust its "intuition" more and return probabilities closer +to 0 or 1 typically. .. 
currentmodule:: sklearn.svm -* Linear Support Vector Classification (:class:`LinearSVC`) shows an even more sigmoid curve - as the RandomForestClassifier, which is typical for maximum-margin methods - (compare Niculescu-Mizil and Caruana [4]_), which focus on hard samples - that are close to the decision boundary (the support vectors). - -.. currentmodule:: sklearn.calibration - -Two approaches for performing calibration of probabilistic predictions are -provided: a parametric approach based on Platt's sigmoid model and a -non-parametric approach based on isotonic regression (:mod:`sklearn.isotonic`). -Probability calibration should be done on new data not used for model fitting. -The class :class:`CalibratedClassifierCV` uses a cross-validation generator and -estimates for each split the model parameter on the train samples and the -calibration of the test samples. The probabilities predicted for the -folds are then averaged. Already fitted classifiers can be calibrated by -:class:`CalibratedClassifierCV` via the parameter cv="prefit". In this case, -the user has to take care manually that data for model fitting and calibration -are disjoint. - -The following images demonstrate the benefit of probability calibration. -The first image present a dataset with 2 classes and 3 blobs of -data. The blob in the middle contains random samples of each class. -The probability for the samples in this blob should be 0.5. - -.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_001.png - :target: ../auto_examples/calibration/plot_calibration.html - :align: center - -The following image shows on the data above the estimated probability -using a Gaussian naive Bayes classifier without calibration, -with a sigmoid calibration and with a non-parametric isotonic -calibration. One can observe that the non-parametric model -provides the most accurate probability estimates for samples -in the middle, i.e., 0.5. - -.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_002.png - :target: ../auto_examples/calibration/plot_calibration.html - :align: center - -.. currentmodule:: sklearn.metrics - -The following experiment is performed on an artificial dataset for binary -classification with 100,000 samples (1,000 of them are used for model fitting) -with 20 features. Of the 20 features, only 2 are informative and 10 are -redundant. The figure shows the estimated probabilities obtained with -logistic regression, a linear support-vector classifier (SVC), and linear SVC with -both isotonic calibration and sigmoid calibration. -The Brier score is a metric which is a combination of calibration loss and refinement loss, -:func:`brier_score_loss`, reported in the legend (the smaller the better). -Calibration loss is defined as the mean squared deviation from empirical probabilities -derived from the slope of ROC segments. Refinement loss can be defined as the expected -optimal loss as measured by the area under the optimal cost curve. - -.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_curve_002.png - :target: ../auto_examples/calibration/plot_calibration_curve.html - :align: center +Linear Support Vector Classification (:class:`LinearSVC`) shows an even more +sigmoid curve as the RandomForestClassifier, which is typical for +maximum-margin methods (compare Niculescu-Mizil and Caruana [1]_), which +focus on hard samples that are close to the decision boundary (the support +vectors). 
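+
+As a minimal sketch of how the points of such a calibration curve can be
+computed (the labels and probability estimates below are made-up placeholder
+values, not the output of any of the classifiers above)::
+
+    from sklearn.calibration import calibration_curve
+
+    y_true = [0, 0, 0, 0, 1, 1, 1, 1, 1]
+    y_prob = [0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.0]
+    # fraction of positives and mean predicted probability in each bin
+    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=3)
+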
-One can observe here that logistic regression is well calibrated as its curve is -nearly diagonal. Linear SVC's calibration curve or reliability diagram has a -sigmoid curve, which is typical for an under-confident classifier. In the case of -LinearSVC, this is caused by the margin property of the hinge loss, which lets -the model focus on hard samples that are close to the decision boundary -(the support vectors). Both kinds of calibration can fix this issue and yield -nearly identical results. The next figure shows the calibration curve of -Gaussian naive Bayes on the same data, with both kinds of calibration and also -without calibration. - -.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_curve_001.png - :target: ../auto_examples/calibration/plot_calibration_curve.html - :align: center - -One can see that Gaussian naive Bayes performs very badly but does so in an -other way than linear SVC: While linear SVC exhibited a sigmoid calibration -curve, Gaussian naive Bayes' calibration curve has a transposed-sigmoid shape. -This is typical for an over-confident classifier. In this case, the classifier's -overconfidence is caused by the redundant features which violate the naive Bayes -assumption of feature-independence. - -Calibration of the probabilities of Gaussian naive Bayes with isotonic -regression can fix this issue as can be seen from the nearly diagonal -calibration curve. Sigmoid calibration also improves the brier score slightly, -albeit not as strongly as the non-parametric isotonic calibration. This is an -intrinsic limitation of sigmoid calibration, whose parametric form assumes a -sigmoid rather than a transposed-sigmoid curve. The non-parametric isotonic -calibration model, however, makes no such strong assumptions and can deal with -either shape, provided that there is sufficient calibration data. In general, -sigmoid calibration is preferable in cases where the calibration curve is sigmoid -and where there is limited calibration data, while isotonic calibration is -preferable for non-sigmoid calibration curves and in situations where large -amounts of data are available for calibration. +Calibrating a classifier +------------------------ .. currentmodule:: sklearn.calibration -:class:`CalibratedClassifierCV` can also deal with classification tasks that -involve more than two classes if the base estimator can do so. In this case, -the classifier is calibrated first for each class separately in an one-vs-rest -fashion. When predicting probabilities for unseen data, the calibrated -probabilities for each class are predicted separately. As those probabilities -do not necessarily sum to one, a postprocessing is performed to normalize them. - -The next image illustrates how sigmoid calibration changes predicted -probabilities for a 3-class classification problem. Illustrated is the standard -2-simplex, where the three corners correspond to the three classes. Arrows point -from the probability vectors predicted by an uncalibrated classifier to the -probability vectors predicted by the same classifier after sigmoid calibration -on a hold-out validation set. Colors indicate the true class of an instance -(red: class 1, green: class 2, blue: class 3). - -.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_multiclass_001.png - :target: ../auto_examples/calibration/plot_calibration_multiclass.html - :align: center - -The base classifier is a random forest classifier with 25 base estimators -(trees). 
If this classifier is trained on all 800 training datapoints, it is
-overly confident in its predictions and thus incurs a large log-loss.
-Calibrating an identical classifier, which was trained on 600 datapoints, with
-method='sigmoid' on the remaining 200 datapoints reduces the confidence of the
-predictions, i.e., moves the probability vectors from the edges of the simplex
-towards the center:
-
-.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_multiclass_002.png
-   :target: ../auto_examples/calibration/plot_calibration_multiclass.html
-   :align: center
-
-This calibration results in a lower log-loss. Note that an alternative would
-have been to increase the number of base estimators which would have resulted in
-a similar decrease in log-loss.
+Calibrating a classifier consists in fitting a regressor (called a
+*calibrator*) that maps the output of the classifier (as given by
+:term:`predict` or :term:`predict_proba`) to a calibrated probability in [0,
+1]. Denoting the output of the classifier for a given sample by :math:`f_i`,
+the calibrator tries to predict :math:`p(y_i = 1 | f_i)`.
+
+The samples that are used to train the calibrator should not be used to
+train the target classifier.
+
+Usage
+-----
+
+The :class:`CalibratedClassifierCV` class is used to calibrate a classifier.
+
+:class:`CalibratedClassifierCV` uses a cross-validation approach to fit both
+the classifier and the regressor. For each of the k `(trainset, testset)`
+couples, a classifier is trained on the train set, and its predictions on the
+test set are used to fit a regressor. We end up with k
+`(classifier, regressor)` couples where each regressor maps the output of
+its corresponding classifier into [0, 1]. Each couple is exposed in the
+`calibrated_classifiers_` attribute, where each entry is a calibrated
+classifier with a :term:`predict_proba` method that outputs calibrated
+probabilities. The output of :term:`predict_proba` for the main
+:class:`CalibratedClassifierCV` instance corresponds to the average of the
+predicted probabilities of the `k` estimators in the
+`calibrated_classifiers_` list. The output of :term:`predict` is the class
+that has the highest probability.
+
+The regressor that is used for calibration depends on the `method`
+parameter. `'sigmoid'` corresponds to a parametric approach based on Platt's
+logistic model [3]_, i.e. :math:`p(y_i = 1 | f_i)` is modeled as
+:math:`\sigma(A f_i + B)` where :math:`\sigma` is the logistic function, and
+:math:`A` and :math:`B` are real numbers to be determined when fitting the
+regressor via maximum likelihood. `'isotonic'` will instead fit a
+non-parametric isotonic regressor, which outputs a step-wise non-decreasing
+function (see :mod:`sklearn.isotonic`).
+
+An already fitted classifier can be calibrated by setting `cv="prefit"`. In
+this case, the data is only used to fit the regressor. It is up to the user
+to make sure that the data used for fitting the classifier is disjoint from
+the data used for fitting the regressor.
+
+:class:`CalibratedClassifierCV` can calibrate probabilities in a multiclass
+setting if the base estimator supports multiclass predictions. The classifier
+is calibrated first for each class separately in a one-vs-rest fashion [4]_.
+When predicting probabilities, the calibrated probabilities for each class
+are predicted separately. As those probabilities do not necessarily sum to
+one, a postprocessing is performed to normalize them. 
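+
+As a minimal usage sketch (the choice of :class:`~sklearn.svm.LinearSVC` as
+base estimator and the synthetic data below are arbitrary, for illustration
+only)::
+
+    from sklearn.calibration import CalibratedClassifierCV
+    from sklearn.datasets import make_classification
+    from sklearn.svm import LinearSVC
+
+    X, y = make_classification(random_state=0)
+    base_clf = LinearSVC(random_state=0)
+    calibrated_clf = CalibratedClassifierCV(base_clf, method='sigmoid', cv=3)
+    calibrated_clf.fit(X, y)
+    # calibrated probabilities for the first two samples (placeholder data)
+    proba = calibrated_clf.predict_proba(X[:2])
+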
+ +The :func:`sklearn.metrics.brier_score_loss` may be used to evaluate how +well a classifier is calibrated. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_calibration_plot_calibration_curve.py` + * :ref:`sphx_glr_auto_examples_calibration_plot_calibration_multiclass.py` + * :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py` + * :ref:`sphx_glr_auto_examples_calibration_plot_compare_calibration.py` .. topic:: References: - * Obtaining calibrated probability estimates from decision trees - and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 - - * Transforming Classifier Scores into Accurate Multiclass - Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) - - * Probabilistic Outputs for Support Vector Machines and Comparisons to - Regularized Likelihood Methods, J. Platt, (1999) - - .. [4] Predicting Good Probabilities with Supervised Learning, + .. [1] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 - .. [5] On the combination of forecast probabilities for + .. [2] On the combination of forecast probabilities for consecutive precipitation periods. Wea. Forecasting, 5, 640–650., Wilks, D. S., 1990a + + .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons + to Regularized Likelihood Methods, J. Platt, (1999) + + .. [4] Transforming Classifier Scores into Accurate Multiclass + Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index d4f6fea8c93b5..e90207bb5eca7 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -31,34 +31,25 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): - """Probability calibration with isotonic regression or sigmoid. - - See glossary entry for :term:`cross-validation estimator`. + """Probability calibration with isotonic regression or logistic regression. - With this class, the base_estimator is fit on the train set of the - cross-validation generator and the test set is used for calibration. - The probabilities for each of the folds are then averaged - for prediction. In case that cv="prefit" is passed to __init__, - it is assumed that base_estimator has been fitted already and all - data is used for calibration. Note that data for fitting the - classifier and for calibrating it must be disjoint. + The calibration is based on the :term:`decision_function` method of the + `base_estimator` if it exists, else on :term:`predict_proba`. Read more in the :ref:`User Guide `. Parameters ---------- base_estimator : instance BaseEstimator - The classifier whose output decision function needs to be calibrated - to offer more accurate predict_proba outputs. If cv=prefit, the - classifier must have been fit already on data. + The classifier whose output need to be calibrated to provide more + accurate `predict_proba` outputs. method : 'sigmoid' or 'isotonic' The method to use for calibration. Can be 'sigmoid' which - corresponds to Platt's method or 'isotonic' which is a - non-parametric approach. It is not advised to use isotonic calibration - with too few calibration samples ``(<<1000)`` since it tends to - overfit. - Use sigmoids (Platt's calibration) in this case. + corresponds to Platt's method (i.e. a logistic regression model) or + 'isotonic' which is a non-parametric approach. It is not advised to + use isotonic calibration with too few calibration samples + ``(<<1000)`` since it tends to overfit. 
cv : integer, cross-validation generator, iterable or "prefit", optional Determines the cross-validation splitting strategy. @@ -77,7 +68,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin, Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - If "prefit" is passed, it is assumed that base_estimator has been + If "prefit" is passed, it is assumed that `base_estimator` has been fitted already and all data is used for calibration. .. versionchanged:: 0.22 @@ -89,7 +80,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin, The class labels. calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit") - The list of calibrated classifiers, one for each crossvalidation fold, + The list of calibrated classifiers, one for each cross-validation fold, which has been fitted on all but the validation fold and calibrated on the validation fold. @@ -223,8 +214,9 @@ def predict_proba(self, X): return mean_proba def predict(self, X): - """Predict the target of new samples. Can be different from the - prediction of the uncalibrated classifier. + """Predict the target of new samples. The predicted class is the + class that has the highest probability, and can thus be different + from the prediction of the uncalibrated classifier. Parameters ---------- diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 15d5aa6687131..2ee31e060e1ee 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2333,6 +2333,7 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): """Compute the Brier score. + The smaller the Brier score, the better, hence the naming with "loss". Across all items in a set N predictions, the Brier score measures the mean squared difference between (1) the predicted probability assigned From df64d1e5a588138e3108118f2185c1f04b239529 Mon Sep 17 00:00:00 2001 From: judithabk6 Date: Wed, 29 Jan 2020 15:06:59 +0100 Subject: [PATCH 275/448] DOC ensure all attributes are documented for PassiveAggressiveClassifier (#16272) --- sklearn/linear_model/_passive_aggressive.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index ed0b1c68e1d25..3b8354f5a7352 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -130,6 +130,9 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): Number of weight updates performed during training. Same as ``(n_iter_ * n_samples)``. + loss_function_ : callable + Loss function used by the algorithm. + Examples -------- >>> from sklearn.linear_model import PassiveAggressiveClassifier From 46bd39d70465d328136d9415c32abee0e5d5ceff Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Wed, 29 Jan 2020 17:27:27 +0100 Subject: [PATCH 276/448] MNT avoid running doctests locally on Windows (#16269) --- conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conftest.py b/conftest.py index b98bb4b271aca..17c3f4b144346 100644 --- a/conftest.py +++ b/conftest.py @@ -6,6 +6,7 @@ # the one from site-packages. 
import platform +import sys from distutils.version import LooseVersion import os @@ -61,6 +62,10 @@ def pytest_collection_modifyitems(config, items): reason = ('doctest are only run when the default numpy int is ' '64 bits.') skip_doctests = True + elif sys.platform.startswith("win32"): + reason = ("doctests are not run for Windows because numpy arrays " + "repr is inconsistent across platforms.") + skip_doctests = True except ImportError: pass From fe7edc3660a174a9390cc610c700086fca29a87c Mon Sep 17 00:00:00 2001 From: gholdman1 <48828869+gholdman1@users.noreply.github.com> Date: Wed, 29 Jan 2020 16:34:42 +0000 Subject: [PATCH 277/448] Fix typos in doc/developers/develop.rst (#16162) --- doc/developers/develop.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index a14d880921a03..85f1515d77d97 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -74,7 +74,7 @@ multiple interfaces): Estimators ---------- -The API has one predominant object: the estimator. A estimator is an +The API has one predominant object: the estimator. An estimator is an object that fits a model based on some training data and is capable of inferring some properties on new data. It can be, for instance, a classifier or a regressor. All estimators implement the fit method:: @@ -220,7 +220,7 @@ an integer called ``n_iter``. Pairwise Attributes ^^^^^^^^^^^^^^^^^^^ -An estimator that accept ``X`` of shape ``(n_samples, n_samples)`` and defines +An estimator that accepts ``X`` of shape ``(n_samples, n_samples)`` and defines a :term:`_pairwise` property equal to ``True`` allows for cross-validation of the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically, the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split`` From 605c6fee8e6a530624593d53eca1b1a58fd684c1 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Wed, 29 Jan 2020 18:02:36 +0100 Subject: [PATCH 278/448] [DOC] Make random_state descriptions for AdaBoost (#16278) --- sklearn/ensemble/_weight_boosting.py | 31 +++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 77337c1b662a1..a654ed9de2ef5 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -131,6 +131,8 @@ def fit(self, X, y, sample_weight=None): self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) + # Initializion of the random number instance that will be used to + # generate a seed at each iteration random_state = check_random_state(self.random_state) for iboost in range(self.n_estimators): @@ -322,10 +324,11 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): achieving a lower test error with fewer boosting iterations. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random seed given at each `base_estimator` at each + boosting iteration. + Thus, it is only used when `base_estimator` exposes a `random_state`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
Attributes ---------- @@ -477,7 +480,8 @@ def _boost(self, iboost, X, y, sample_weight, random_state): The current sample weights. random_state : RandomState - The current random number generator + The RandomState instance used if the base estimator accepts a + `random_state` attribute. Returns ------- @@ -898,10 +902,13 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): boosting iteration. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random seed given at each `base_estimator` at each + boosting iteration. + Thus, it is only used when `base_estimator` exposes a `random_state`. + In addition, it controls the bootstrap of the weights used to train the + `base_estimator` at each boosting iteration. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- @@ -1020,7 +1027,11 @@ def _boost(self, iboost, X, y, sample_weight, random_state): The current sample weights. random_state : RandomState - The current random number generator + The RandomState instance used if the base estimator accepts a + `random_state` attribute. + Controls also the bootstrap of the weights used to train the weak + learner. + replacement. Returns ------- From e61fc6cdd1c245c30dac126c2be595773884d039 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jan 2020 13:32:16 -0500 Subject: [PATCH 279/448] DOC Fix alpha param of ridge estimators (#16288) --- sklearn/linear_model/_ridge.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 15469391388b1..a1f6b89230b94 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -256,8 +256,9 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. - Alpha corresponds to ``C^-1`` in other linear models such as - LogisticRegression or LinearSVC. If an array is passed, penalties are + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. @@ -618,8 +619,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. - Alpha corresponds to ``C^-1`` in other linear models such as - LogisticRegression or LinearSVC. If an array is passed, penalties are + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. @@ -773,8 +775,9 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Regularization strength; must be a positive float. 
Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. - Alpha corresponds to ``C^-1`` in other linear models such as - LogisticRegression or LinearSVC. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no @@ -1636,8 +1639,9 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. - Alpha corresponds to ``C^-1`` in other linear models such as - LogisticRegression or LinearSVC. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. If using generalized cross-validation, alphas must be positive. fit_intercept : bool, default=True @@ -1752,8 +1756,9 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. - Alpha corresponds to ``C^-1`` in other linear models such as - LogisticRegression or LinearSVC. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set From 3109add033757aa8c69cfa2167316ca236b15489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kie=C3=9Fling?= Date: Wed, 29 Jan 2020 19:37:38 +0100 Subject: [PATCH 280/448] [MRG] Fix FutureWarning in plot_partial_dependence_visualization_api.py (#16256) --- sklearn/inspection/_partial_dependence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 2f42dece3f40a..e24b172e30b2f 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -25,7 +25,6 @@ from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( @@ -592,7 +591,9 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, from matplotlib.ticker import ScalarFormatter # noqa # set target_idx for multi-class estimators - if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2: + if (is_classifier(estimator) and + hasattr(estimator, 'classes_') and + np.size(estimator.classes_) > 2): if target is None: raise ValueError('target must be specified for multi-class') target_idx = np.searchsorted(estimator.classes_, target) From 7d10be4c3c8085e57fbd5317b4bf7a5081e87709 Mon Sep 17 00:00:00 2001 From: alexshacked Date: Wed, 29 Jan 2020 20:40:22 +0200 Subject: [PATCH 281/448] =?UTF-8?q?ENH=20Changed=20implementation=20of=20B?= =?UTF-8?q?irch.predict=20to=20use=20pairwise=5Fdistances=E2=80=A6=20(#161?= =?UTF-8?q?49)?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/cluster/_birch.py | 11 +++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 67f76446cbf80..45219b8346b35 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -51,6 +51,12 @@ Changelog more memory efficient implementation of single linkage clustering. :pr:`11514` by :user:`Leland McInnes `. +- |Efficiency| :class:`cluster.Birch` implementation of the predict method + avoids high memory footprint by calculating the distances matrix using + a chunked scheme. + :pr:`16149` by :user:`Jeremie du Boisberranger ` and + :user:`Alex Shacked `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 0a16586caae9a..299a1c57f9f19 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -8,6 +8,7 @@ from scipy import sparse from math import sqrt +from ..metrics import pairwise_distances_argmin from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator from ..utils import check_array @@ -579,10 +580,12 @@ def predict(self, X): """ X = check_array(X, accept_sparse='csr') self._check_fit(X) - reduced_distance = safe_sparse_dot(X, self.subcluster_centers_.T) - reduced_distance *= -2 - reduced_distance += self._subcluster_norms - return self.subcluster_labels_[np.argmin(reduced_distance, axis=1)] + kwargs = {'Y_norm_squared': self._subcluster_norms} + return self.subcluster_labels_[ + pairwise_distances_argmin(X, + self.subcluster_centers_, + metric_kwargs=kwargs) + ] def transform(self, X): """ From 00841fab693de82432ef153724a2667d328cc3a1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 29 Jan 2020 17:44:20 -0500 Subject: [PATCH 282/448] DOC Minor doc update to KernelRidge (#16295) --- doc/modules/kernel_ridge.rst | 9 ++++---- sklearn/kernel_ridge.py | 41 ++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index a193efaaf2d67..a67733b1ca5a5 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -7,10 +7,11 @@ Kernel ridge regression .. currentmodule:: sklearn.kernel_ridge Kernel ridge regression (KRR) [M2012]_ combines :ref:`ridge_regression` -(linear least squares with l2-norm regularization) with the kernel trick. It -thus learns a linear function in the space induced by the respective kernel and -the data. For non-linear kernels, this corresponds to a non-linear -function in the original space. +(linear least squares with l2-norm regularization) with the `kernel trick +`_. It thus learns a linear +function in the space induced by the respective kernel and the data. For +non-linear kernels, this corresponds to a non-linear function in the original +space. The form of the model learned by :class:`KernelRidge` is identical to support vector regression (:class:`~sklearn.svm.SVR`). However, different loss diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index dd7cc73124235..d8f0e16d6f2b9 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -38,19 +38,29 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): Parameters ---------- - alpha : {float, array-like}, shape = [n_targets] - Small positive values of alpha improve the conditioning of the problem - and reduce the variance of the estimates. 
Alpha corresponds to - ``(2*C)^-1`` in other linear models such as LogisticRegression or - LinearSVC. If an array is passed, penalties are assumed to be specific - to the targets. Hence they must correspond in number. + alpha : float or array-like of shape (n_targets,) + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`sklearn.svm.LinearSVC`. If an array is passed, penalties are + assumed to be specific to the targets. Hence they must correspond in + number. See :ref:`ridge_regression` for formula. kernel : string or callable, default="linear" - Kernel mapping used internally. A callable should accept two arguments - and the keyword arguments passed to this object as kernel_params, and - should return a floating point number. Set to "precomputed" in - order to pass a precomputed kernel matrix to the estimator - methods instead of samples. + Kernel mapping used internally. This parameter is directly passed to + :class:`sklearn.metrics.pairwise.pairwise_kernel`. + If `kernel` is a string, it must be one of the metrics + in `pairwise.PAIRWISE_KERNEL_FUNCTIONS`. + If `kernel` is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if `kernel` is a callable function, it is called on + each pair of instances (rows) and the resulting value recorded. The + callable should take two rows from X as input and return the + corresponding kernel value as a single number. This means that + callables from :mod:`sklearn.metrics.pairwise` are not allowed, as + they operate on matrices, not single samples. Use the string + identifying the kernel instead. gamma : float, default=None Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 @@ -71,13 +81,13 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): Attributes ---------- - dual_coef_ : array, shape = [n_samples] or [n_samples, n_targets] + dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets) Representation of weight vector(s) in kernel space - X_fit_ : {array-like, sparse matrix} of shape (n_samples, n_features) + X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data, which is also required for prediction. If kernel == "precomputed" this is instead the precomputed - training matrix, shape = [n_samples, n_samples]. + training matrix, of shape (n_samples, n_samples). References ---------- @@ -134,8 +144,7 @@ def fit(self, X, y=None, sample_weight=None): ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. If kernel == "precomputed" this is instead - a precomputed kernel matrix, shape = [n_samples, - n_samples]. + a precomputed kernel matrix, of shape (n_samples, n_samples). y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values From 6d2b7bc6da26249cfcc99a45ee78a76b459fc3da Mon Sep 17 00:00:00 2001 From: Vandana Iyer Date: Thu, 30 Jan 2020 00:03:53 -0800 Subject: [PATCH 283/448] Fix User guide description for the Linnerud dataset #16294 (#16297) 2 words - "physiological" and "exercise" were swapped in the description, which has been fixed. 
--- sklearn/datasets/descr/linnerud.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/descr/linnerud.rst b/sklearn/datasets/descr/linnerud.rst index 5585b50a7e42b..b7d233b0f3599 100644 --- a/sklearn/datasets/descr/linnerud.rst +++ b/sklearn/datasets/descr/linnerud.rst @@ -11,10 +11,10 @@ Linnerrud dataset The Linnerud dataset constains two small dataset: -- *physiological* - CSV containing 20 observations on 3 exercise variables: +- *physiological* - CSV containing 20 observations on 3 physiological variables: Weight, Waist and Pulse. -- *exercise* - CSV containing 20 observations on 3 physiological variables: +- *exercise* - CSV containing 20 observations on 3 exercise variables: Chins, Situps and Jumps. .. topic:: References From 98f0eb8137ed7a59d8b65ea21bc06955d60bf690 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Thu, 30 Jan 2020 10:16:48 +0100 Subject: [PATCH 284/448] DOC improve random state docstring in ClassifierChain and RegressorChain (#16291) --- sklearn/multioutput.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90e393e19503a..b2be60a434cb5 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -547,12 +547,13 @@ class ClassifierChain(MetaEstimatorMixin, ClassifierMixin, _BaseChain): - An iterable yielding (train, test) splits as arrays of indices. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - The random number generator is used to generate random chain orders. + If ``order='random'``, determines random number generation for the + chain orders. + In addition, it controls the random seed given at each `base_estimator` + at each chaining iteration. Thus, it is only used when `base_estimator` + exposes a `random_state`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- @@ -707,12 +708,13 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): - An iterable yielding (train, test) splits as arrays of indices. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - The random number generator is used to generate random chain orders. + If ``order='random'``, determines random number generation for the + chain orders. + In addition, it controls the random seed given at each `base_estimator` + at each chaining iteration. Thus, it is only used when `base_estimator` + exposes a `random_state`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
Attributes ---------- From e336d803f1a283194b68f5a3797014d8ed8350b3 Mon Sep 17 00:00:00 2001 From: Mohamed Maskani Date: Thu, 30 Jan 2020 11:40:50 +0100 Subject: [PATCH 285/448] Specify compilers version to work-around issues on MacOS (#16282) --- build_tools/azure/install.sh | 2 +- doc/developers/advanced_installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 1ef981b5dd6e8..f119b280041d7 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -44,7 +44,7 @@ if [[ "$DISTRIB" == "conda" ]]; then if [[ "$UNAMESTR" == "Darwin" ]]; then if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then # on macOS, install an OpenMP-enabled clang/llvm from conda-forge. - TO_INSTALL="$TO_INSTALL conda-forge::compilers \ + TO_INSTALL="$TO_INSTALL conda-forge::compilers>=1.0.4 \ conda-forge::llvm-openmp" fi fi diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 6b4b0b1141755..d86e59de0e746 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -237,7 +237,7 @@ It is recommended to use a dedicated `conda environment`_ to build scikit-learn from source:: conda create -n sklearn-dev python numpy scipy cython joblib pytest \ - conda-forge::compilers conda-forge::llvm-openmp + "conda-forge::compilers>=1.0.4" conda-forge::llvm-openmp conda activate sklearn-dev make clean pip install --verbose --editable . From 323b4a1ea87dd42ce43c72055a859e70a23f3571 Mon Sep 17 00:00:00 2001 From: lopusz Date: Thu, 30 Jan 2020 14:20:10 +0100 Subject: [PATCH 286/448] DOC Cleaning parameter docstrings in discriminant_analysis (#15761) (#16304) --- sklearn/discriminant_analysis.py | 110 +++++++++++++++---------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 1495d00620911..369214757a858 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -34,10 +34,10 @@ def _cov(X, shrinkage=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. - shrinkage : string or float, optional + shrinkage : {'empirical', 'auto'} or float, default=None Shrinkage parameter, possible values: - None or 'empirical': no shrinkage (default). - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. @@ -45,7 +45,7 @@ def _cov(X, shrinkage=None): Returns ------- - s : array, shape (n_features, n_features) + s : ndarray of shape (n_features, n_features) Estimated covariance matrix. """ shrinkage = "empirical" if shrinkage is None else shrinkage @@ -74,15 +74,15 @@ def _class_means(X, y): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. Returns ------- - means : array-like, shape (n_classes, n_features) + means : array-like of shape (n_classes, n_features) Class means. """ classes, y = np.unique(y, return_inverse=True) @@ -98,16 +98,16 @@ def _class_cov(X, y, priors, shrinkage=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. 
- y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. - priors : array-like, shape (n_classes,) + priors : array-like of shape (n_classes,) Class priors. - shrinkage : string or float, optional + shrinkage : 'auto' or float, default=None Shrinkage parameter, possible values: - None: no shrinkage (default). - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. @@ -115,7 +115,7 @@ def _class_cov(X, y, priors, shrinkage=None): Returns ------- - cov : array-like, shape (n_features, n_features) + cov : array-like of shape (n_features, n_features) Class covariance matrix. """ classes = np.unique(y) @@ -146,7 +146,7 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Parameters ---------- - solver : string, optional + solver : {'svd', 'lsqr', 'eigen'}, default='svd' Solver to use, possible values: - 'svd': Singular value decomposition (default). Does not compute the covariance matrix, therefore this solver is @@ -154,7 +154,7 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, - 'lsqr': Least squares solution, can be combined with shrinkage. - 'eigen': Eigenvalue decomposition, can be combined with shrinkage. - shrinkage : string or float, optional + shrinkage : 'auto' or float, default=None Shrinkage parameter, possible values: - None: no shrinkage (default). - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. @@ -162,55 +162,55 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Note that shrinkage works only with 'lsqr' and 'eigen' solvers. - priors : array, optional, shape (n_classes,) + priors : array-like of shape (n_classes,), default=None Class priors. - n_components : int, optional (default=None) + n_components : int, default=None Number of components (<= min(n_classes - 1, n_features)) for dimensionality reduction. If None, will be set to min(n_classes - 1, n_features). - store_covariance : bool, optional + store_covariance : bool, default=False Additionally compute class covariance matrix (default False), used only in 'svd' solver. .. versionadded:: 0.17 - tol : float, optional, (default 1.0e-4) + tol : float, default=1.0e-4 Threshold used for rank estimation in SVD solver. .. versionadded:: 0.17 Attributes ---------- - coef_ : array, shape (n_features,) or (n_classes, n_features) + coef_ : ndarray of shape (n_features,) or (n_classes, n_features) Weight vector(s). - intercept_ : array, shape (n_classes,) + intercept_ : ndarray of shape (n_classes,) Intercept term. - covariance_ : array-like, shape (n_features, n_features) + covariance_ : array-like of shape (n_features, n_features) Covariance matrix (shared by all classes). - explained_variance_ratio_ : array, shape (n_components,) + explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. If ``n_components`` is not set then all components are stored and the sum of explained variances is equal to 1.0. Only available when eigen or svd solver is used. - means_ : array-like, shape (n_classes, n_features) + means_ : array-like of shape (n_classes, n_features) Class means. - priors_ : array-like, shape (n_classes,) + priors_ : array-like of shape (n_classes,) Class priors (sum to 1). - scalings_ : array-like, shape (rank, n_classes - 1) + scalings_ : array-like of shape (rank, n_classes - 1) Scaling of the features in the space spanned by the class centroids. 
- xbar_ : array-like, shape (n_features,) + xbar_ : array-like of shape (n_features,) Overall mean. - classes_ : array-like, shape (n_classes,) + classes_ : array-like of shape (n_classes,) Unique class labels. See also @@ -267,15 +267,15 @@ def _solve_lsqr(self, X, y, shrinkage): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data. - y : array-like, shape (n_samples,) or (n_samples, n_classes) + y : array-like of shape (n_samples,) or (n_samples, n_classes) Target values. - shrinkage : string or float, optional + shrinkage : 'auto', float or None Shrinkage parameter, possible values: - - None: no shrinkage (default). + - None: no shrinkage. - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. - float between 0 and 1: fixed shrinkage parameter. @@ -305,15 +305,15 @@ class scatter). This solver supports both classification and Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. - shrinkage : string or float, optional + shrinkage : 'auto', float or None Shrinkage parameter, possible values: - - None: no shrinkage (default). + - None: no shrinkage. - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. - float between 0 and 1: fixed shrinkage constant. @@ -349,10 +349,10 @@ def _solve_svd(self, X, y): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. """ n_samples, n_features = X.shape @@ -417,10 +417,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Training data. - y : array, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. """ X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, @@ -483,12 +483,12 @@ def transform(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. Returns ------- - X_new : array, shape (n_samples, n_components) + X_new : ndarray of shape (n_samples, n_components) Transformed data. """ if self.solver == 'lsqr': @@ -509,12 +509,12 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. Returns ------- - C : array, shape (n_samples, n_classes) + C : ndarray of shape (n_samples, n_classes) Estimated probabilities. """ check_is_fitted(self) @@ -531,12 +531,12 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) Input data. Returns ------- - C : array, shape (n_samples, n_classes) + C : ndarray of shape (n_samples, n_classes) Estimated log probabilities. 
""" return np.log(self.predict_proba(X)) @@ -558,20 +558,20 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Parameters ---------- - priors : array, optional, shape = [n_classes] + priors : ndarray of shape (n_classes,), default=None Priors on classes - reg_param : float, optional + reg_param : float, default=0.0 Regularizes the covariance estimate as ``(1-reg_param)*Sigma + reg_param*np.eye(n_features)`` - store_covariance : boolean + store_covariance : bool, default=False If True the covariance matrices are computed and stored in the `self.covariance_` attribute. .. versionadded:: 0.17 - tol : float, optional, default 1.0e-4 + tol : float, default=1.0e-4 Threshold used for rank estimation. .. versionadded:: 0.17 @@ -584,21 +584,21 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): means_ : array-like of shape (n_classes, n_features) Class means. - priors_ : array-like of shape (n_classes) + priors_ : array-like of shape (n_classes,) Class priors (sum to 1). - rotations_ : list of arrays - For each class k an array of shape [n_features, n_k], with + rotations_ : list of ndarrays + For each class k an array of shape (n_features, n_k), with ``n_k = min(n_features, number of elements in class k)`` It is the rotation of the Gaussian distribution, i.e. its principal axis. - scalings_ : list of arrays - For each class k an array of shape [n_k]. It contains the scaling + scalings_ : list of ndarrays + For each class k an array of shape (n_k,). It contains the scaling of the Gaussian distributions along its principal axes, i.e. the variance in the rotated coordinate system. - classes_ : array-like, shape (n_classes,) + classes_ : array-like of shape (n_classes,) Unique class labels. Examples @@ -642,7 +642,7 @@ def fit(self, X, y): Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array, shape = [n_samples] + y : array-like of shape (n_samples,) Target values (integers) """ X, y = check_X_y(X, y) From 034c021fb274076982cb342cf1d9aaad72f0873e Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Thu, 30 Jan 2020 14:48:22 +0100 Subject: [PATCH 287/448] Fix Typo in random_state for multioutput.py (#16309) --- sklearn/multioutput.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index b2be60a434cb5..4cf7783c14e65 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -548,7 +548,7 @@ class ClassifierChain(MetaEstimatorMixin, ClassifierMixin, _BaseChain): random_state : int, RandomState instance or None, optional (default=None) If ``order='random'``, determines random number generation for the - chain orders. + chain order. In addition, it controls the random seed given at each `base_estimator` at each chaining iteration. Thus, it is only used when `base_estimator` exposes a `random_state`. @@ -709,7 +709,7 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): random_state : int, RandomState instance or None, optional (default=None) If ``order='random'``, determines random number generation for the - chain orders. + chain order. In addition, it controls the random seed given at each `base_estimator` at each chaining iteration. Thus, it is only used when `base_estimator` exposes a `random_state`. 
From d2b85426342ea734cd0e04fe7e29fe330795cf92 Mon Sep 17 00:00:00 2001 From: Tiphaine Viard Date: Thu, 30 Jan 2020 16:43:02 +0100 Subject: [PATCH 288/448] EXA change max_iter and enable early_stopping to get rid of convergence warning in plot_mlp_alpha (#16260) --- examples/neural_networks/plot_mlp_alpha.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py index 7f718539131d4..4d18fdcaa314c 100644 --- a/examples/neural_networks/plot_mlp_alpha.py +++ b/examples/neural_networks/plot_mlp_alpha.py @@ -28,6 +28,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.datasets import make_moons, make_circles, make_classification from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import make_pipeline h = .02 # step size in the mesh @@ -36,8 +37,13 @@ classifiers = [] for i in alphas: - classifiers.append(MLPClassifier(solver='lbfgs', alpha=i, random_state=1, - hidden_layer_sizes=[100, 100])) + classifiers.append(make_pipeline( + StandardScaler(), + MLPClassifier(solver='lbfgs', alpha=i, + random_state=1, max_iter=2000, + early_stopping=True, + hidden_layer_sizes=[100, 100]) + )) X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=0, n_clusters_per_class=1) From 9c2d889d1feb1d6401a20a3fb30de45e4e5c2577 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Thu, 30 Jan 2020 17:06:05 +0100 Subject: [PATCH 289/448] Update random_state gradient_boosting doc (#16315) --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f3efd3c897a4c..2a6f6a90c4e03 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -737,7 +737,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): optional (default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping - is enabled. See :term:`random_state`. + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- @@ -919,7 +921,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, optional (default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping - is enabled. See :term:`random_state`. + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
Attributes ---------- From 5ea6a26515951c3e9d66a96ae7c76a6dc21bf1f5 Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Thu, 30 Jan 2020 17:21:06 +0100 Subject: [PATCH 290/448] ENH use utility _check_sample_weight in IsotonicRegression (#16322) --- sklearn/isotonic.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index dbaa21b05f40d..cf226b8427548 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -8,6 +8,7 @@ from scipy.stats import spearmanr from .base import BaseEstimator, TransformerMixin, RegressorMixin from .utils import check_array, check_consistent_length +from .utils.validation import _check_sample_weight from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique import warnings import math @@ -121,10 +122,8 @@ def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None, order = np.s_[:] if increasing else np.s_[::-1] y = check_array(y, ensure_2d=False, dtype=[np.float64, np.float32]) y = np.array(y[order], dtype=y.dtype) - if sample_weight is None: - sample_weight = np.ones(len(y), dtype=y.dtype) - else: - sample_weight = np.array(sample_weight[order], dtype=y.dtype) + sample_weight = _check_sample_weight(sample_weight, y, dtype=y.dtype) + sample_weight = np.ascontiguousarray(sample_weight[order]) _inplace_contiguous_isotonic_regression(y, sample_weight) if y_min is not None or y_max is not None: @@ -261,13 +260,9 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): # If sample_weights is passed, removed zero-weight values and clean # order - if sample_weight is not None: - sample_weight = check_array(sample_weight, ensure_2d=False, - dtype=X.dtype) - mask = sample_weight > 0 - X, y, sample_weight = X[mask], y[mask], sample_weight[mask] - else: - sample_weight = np.ones(len(y), dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + mask = sample_weight > 0 + X, y, sample_weight = X[mask], y[mask], sample_weight[mask] order = np.lexsort((y, X)) X, y, sample_weight = [array[order] for array in [X, y, sample_weight]] From 06b1a19465adfad78d14cc2ae56c21ead5c92475 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Thu, 30 Jan 2020 17:40:05 +0100 Subject: [PATCH 291/448] [DOC] Make random_state descriptions for Mixture Models (#16307) --- sklearn/mixture/_base.py | 3 ++- sklearn/mixture/_bayesian_mixture.py | 10 ++++++---- sklearn/mixture/_gaussian_mixture.py | 10 ++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 4bb98a1d54e4a..07f3669db27ef 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -137,7 +137,8 @@ def _initialize_parameters(self, X, random_state): X : array-like, shape (n_samples, n_features) random_state : RandomState - A random number generator instance. + A random number generator instance that controls the random seed + used for the method chosen to initialize the parameters. 
""" n_samples, _ = X.shape diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index cecbb0f36a201..d69b7d1958183 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -164,10 +164,12 @@ class BayesianGaussianMixture(BaseMixture): float if 'spherical' random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. warm_start : bool, default to False. If 'warm_start' is True, the solution of the last fitting is used as diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 8603115fd202f..1c563984ba00b 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -502,10 +502,12 @@ class GaussianMixture(BaseMixture): (n_components, n_features, n_features) if 'full' random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. warm_start : bool, default to False. 
If 'warm_start' is True, the solution of the last fitting is used as From 43fd8c9246065a30b38033460c8cb2f2759b0672 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 30 Jan 2020 19:02:06 +0100 Subject: [PATCH 292/448] DOC exchanged boston for diabetes dataset in plot_cv_predict (#16312) --- examples/model_selection/plot_cv_predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_cv_predict.py b/examples/model_selection/plot_cv_predict.py index 2cff4c8c05b39..ee3e82f42cba1 100644 --- a/examples/model_selection/plot_cv_predict.py +++ b/examples/model_selection/plot_cv_predict.py @@ -14,7 +14,7 @@ import matplotlib.pyplot as plt lr = linear_model.LinearRegression() -X, y = datasets.load_boston(return_X_y=True) +X, y = datasets.load_diabetes(return_X_y=True) # cross_val_predict returns an array of the same size as `y` where each entry # is a prediction obtained by cross validation: From 932c60648d49dcda3784047d6df595421e8c32c8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 30 Jan 2020 19:06:00 +0100 Subject: [PATCH 293/448] MNT update imputer example to remove FutureWarning (#16302) --- examples/impute/plot_iterative_imputer_variants_comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 06fab08c381f2..90e8e4cad1a9b 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -127,6 +127,6 @@ ax.set_title('California Housing Regression with Different Imputation Methods') ax.set_xlabel('MSE (smaller is better)') ax.set_yticks(np.arange(means.shape[0])) -ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()]) +ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()]) plt.tight_layout(pad=1) plt.show() From e05b9e146006236aa41349105e8a01ac9535fba7 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 30 Jan 2020 16:10:07 -0500 Subject: [PATCH 294/448] MNT Comment out instructions in bug report template (#16325) --- .github/ISSUE_TEMPLATE/bug_report.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 9144526ec8185..f980a6d167bf8 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,8 +7,10 @@ assignees: '' --- + #### Describe the bug (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + return np.average(y_train, weights=sample_weight) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_squares(gradients, y_true, raw_predictions) + if sample_weight is None: + _update_gradients_least_squares(gradients, y_true, raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_squares(gradients, hessians, + y_true, raw_predictions, + sample_weight) class LeastAbsoluteDeviation(BaseLoss): @@ -160,8 +199,12 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + super().__init__(hessians_are_constant=sample_weight is None) - hessians_are_constant = True # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -172,30 +215,39 @@ class LeastAbsoluteDeviation(BaseLoss): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = True - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = np.abs(y_true - raw_predictions) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.median(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + if sample_weight is None: + return np.median(y_train) + else: + return _weighted_percentile(y_train, sample_weight, 50) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_absolute_deviation(gradients, y_true, - raw_predictions) + if sample_weight is None: + _update_gradients_least_absolute_deviation(gradients, y_true, + raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_absolute_deviation( + gradients, hessians, y_true, raw_predictions, sample_weight) - def update_leaves_values(self, grower, y_true, raw_predictions): + def update_leaves_values(self, grower, y_true, raw_predictions, + sample_weight): # Update the values predicted by the tree with # median(y_true - raw_predictions). # See note about need_update_leaves_values in BaseLoss. @@ -205,7 +257,14 @@ def update_leaves_values(self, grower, y_true, raw_predictions): # requires a cython version of median() for leaf in grower.finalized_leaves: indices = leaf.sample_indices - median_res = np.median(y_true[indices] - raw_predictions[indices]) + if sample_weight is None: + median_res = np.median(y_true[indices] + - raw_predictions[indices]) + else: + median_res = _weighted_percentile(y_true[indices] + - raw_predictions[indices], + sample_weight=sample_weight, + percentile=50) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here @@ -222,24 +281,26 @@ class BinaryCrossEntropy(BaseLoss): section 4.4.1 (about logistic regression). 
""" - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + inverse_link_function = staticmethod(expit) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) # logaddexp(0, x) = log(1 + exp(x)) loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if prediction_dim > 2: raise ValueError( "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=%d, use" " loss='categorical_crossentropy' instead" % prediction_dim) - proba_positive_class = np.mean(y_train) + proba_positive_class = np.average(y_train, weights=sample_weight) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) # log(x / 1 - x) is the anti function of sigmoid, or the link function @@ -247,14 +308,14 @@ def get_baseline_prediction(self, y_train, prediction_dim): return np.log(proba_positive_class / (1 - proba_positive_class)) def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -274,9 +335,10 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. 
""" - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): @@ -284,22 +346,23 @@ def __call__(self, y_true, raw_predictions, average=True): loss = (logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.average(y_train == k, + weights=sample_weight) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) init_value[k, :] += np.log(proba_kth_class) return init_value def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # TODO: This could be done in parallel diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0850ac6638a80..c5b4a143591d6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split @@ -11,6 +11,8 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -301,7 +303,8 @@ def test_small_trainset(): gb = HistGradientBoostingClassifier() # Compute the small training set - X_small, y_small = gb._get_small_trainset(X, y, seed=42) + X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42, + sample_weight_train=None) # Compute the class distribution in the small training set unique, counts = np.unique(y_small, return_counts=True) @@ -435,6 +438,20 @@ def test_infinite_values(): np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([.1, .3, .1]) + gbdt = HistGradientBoostingRegressor() + with pytest.raises(ValueError, + match=r"sample_weight.shape == \(3,\), expected"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises(ValueError, + match="Found input variables with inconsistent number"): + gbdt.fit(X, y[1:]) + + def 
test_infinite_values_missing_values(): # High level test making sure that inf and nan values are properly handled # when both are present. This is similar to @@ -474,6 +491,148 @@ def test_string_target_early_stopping(scoring): gbrt.fit(X, y) +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss='binary_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1], + [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss='categorical_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize('problem', ( + 'regression', + 'binary_classification', + 'multiclass_classification' +)) +@pytest.mark.parametrize('duplication', ('half', 'all')) +def test_sample_weight_effect(problem, duplication): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. + + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=0) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == 'binary_classification' else 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, random_state=0) + Klass = HistGradientBoostingClassifier + + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. 
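As a side note, the equivalence described in the comment above can be sketched at
the public API level (the toy data and weights below are invented for illustration,
and ``sample_weight`` support in ``fit`` is precisely what this patch adds)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    X = np.array([[0.], [1.], [2.], [3.]])
    y = np.array([0., 0., 1., 1.])

    # weighting the first sample by 2 ...
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        X, y, sample_weight=np.array([2., 1., 1., 1.]))
    # ... should be equivalent to duplicating it in the training set
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        np.vstack([X, X[:1]]), np.append(y, y[:1]))

    # the two models are expected to make (near-)identical predictions
    np.allclose(est_sw.predict(X), est_dup.predict(X))
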
+ est = Klass(min_samples_leaf=1) + + # Create dataset with duplicate and corresponding sample weights + if duplication == 'half': + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:lim] = 2 + + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) + + # checking raw_predict is stricter than just predict for classification + assert np.allclose(est_sw._raw_predict(X_dup), + est_dup._raw_predict(X_dup)) + + +@pytest.mark.parametrize('loss_name', ('least_squares', + 'least_absolute_deviation')) +def test_sum_hessians_are_sample_weight(loss_name): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + sample_weight = rng.normal(size=n_samples) + + loss = _LOSSES[loss_name](sample_weight=sample_weight) + gradients, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight) + raw_predictions = rng.normal(size=(1, n_samples)) + loss.update_gradients_and_hessians(gradients, hessians, y, + raw_predictions, sample_weight) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += ( + sample_weight[sample_idx]) + + # Build histogram + grower = TreeGrower(X_binned, gradients[0], hessians[0], + n_bins=bin_mapper.n_bins) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx, bin_idx]['sum_hessians'] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) + + def test_max_depth_max_leaf_nodes(): # Non regression test for # https://github.com/scikit-learn/scikit-learn/issues/16179 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 8c300db993d3d..915dc300e4760 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -20,7 +20,7 @@ def get_gradients(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) return gradients def get_hessians(y_true, raw_predictions): @@ -28,7 +28,7 @@ def get_hessians(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) if loss.__class__.__name__ == 'LeastSquares': # hessians aren't updated because they're constant: @@ -62,13 +62,13 @@ def test_derivatives(loss, x0, y_true): # 
using Halley's method with the first and second order derivatives # computed by the Loss instance. - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=Y_DTYPE) x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) def func(x): - return loss(y_true, x) + return loss.pointwise_loss(y_true, x) def fprime(x): return get_gradients(y_true, x) @@ -78,7 +78,7 @@ def fprime2(x): optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) assert np.allclose(loss.inverse_link_function(optimum), y_true) - assert np.allclose(loss(y_true, optimum), 0) + assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) assert np.allclose(get_gradients(y_true, optimum), 0) @@ -105,7 +105,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): raw_predictions = rng.normal( size=(prediction_dim, n_samples) ).astype(Y_DTYPE) - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) get_gradients, get_hessians = get_derivatives_helper(loss) # only take gradients and hessians of first tree / class. @@ -120,16 +120,16 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): eps = 1e-9 offset = np.zeros_like(raw_predictions) offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2) numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset, average=False) - f = loss(y_true, raw_predictions, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset) + f = loss.pointwise_loss(y_true, raw_predictions) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7) @@ -139,9 +139,9 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares']() + loss = _LOSSES['least_squares'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the mean of all targets @@ -153,9 +153,9 @@ def test_baseline_least_squares(): def test_baseline_least_absolute_deviation(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation']() + loss = _LOSSES['least_absolute_deviation'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the median of all targets @@ -167,10 +167,10 @@ def test_baseline_least_absolute_deviation(): def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) - loss = 
_LOSSES['binary_crossentropy']() + loss = _LOSSES['binary_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert_all_finite(baseline_prediction) assert np.allclose(loss.inverse_link_function(baseline_prediction), y_train[0]) @@ -181,7 +181,7 @@ def test_baseline_binary_crossentropy(): # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) # So we want raw_prediction = link_function(p) = log(p / (1 - p)) y_train = rng.randint(0, 2, size=100).astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() @@ -192,10 +192,10 @@ def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) prediction_dim = 4 - loss = _LOSSES['categorical_crossentropy']() + loss = _LOSSES['categorical_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, + baseline_prediction = loss.get_baseline_prediction(y_train, None, prediction_dim) assert baseline_prediction.dtype == y_train.dtype assert_all_finite(baseline_prediction) @@ -203,8 +203,85 @@ def test_baseline_categorical_crossentropy(): # Same logic as for above test. Here inverse_link_function = softmax and # link_function = log y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) - baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + baseline_prediction = loss.get_baseline_prediction(y_train, None, + prediction_dim) assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() assert np.allclose(baseline_prediction[k, :], np.log(p)) + + +@pytest.mark.parametrize('loss, problem', [ + ('least_squares', 'regression'), + ('least_absolute_deviation', 'regression'), + ('binary_crossentropy', 'classification'), + ('categorical_crossentropy', 'classification') + ]) +@pytest.mark.parametrize('sample_weight', ['ones', 'random']) +def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. 
+ + rng = np.random.RandomState(42) + n_samples = 1000 + + if loss == 'categorical_crossentropy': + n_classes = prediction_dim = 3 + else: + n_classes = prediction_dim = 1 + + if problem == 'regression': + y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + else: + y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) + + if sample_weight == 'ones': + sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) + else: + sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) + + loss_ = _LOSSES[loss](sample_weight=sample_weight) + + baseline_prediction = loss_.get_baseline_prediction( + y_true, None, prediction_dim + ) + raw_predictions = np.zeros(shape=(prediction_dim, n_samples), + dtype=baseline_prediction.dtype) + raw_predictions += baseline_prediction + + gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions, None) + + gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true, + raw_predictions, sample_weight) + + assert np.allclose(gradients * sample_weight, gradients_sw) + assert np.allclose(hessians * sample_weight, hessians_sw) + + +def test_init_gradient_and_hessians_sample_weight(): + # Make sure that passing sample_weight to a loss correctly influences the + # hessians_are_constant attribute, and consequently the shape of the + # hessians array. + + prediction_dim = 2 + n_samples = 5 + sample_weight = None + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=None) + assert loss.hessians_are_constant + assert hessians.shape == (1, 1) + + sample_weight = np.ones(n_samples) + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=sample_weight) + assert not loss.hessians_are_constant + assert hessians.shape == (prediction_dim, n_samples) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 7f9c37dfca642..530a53b83dce4 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -520,6 +520,16 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def test_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=np.ones(len(X))) + + with pytest.raises(NotImplementedError, + match="does not support partial dependence"): + partial_dependence(clf, X, features=[1]) + + # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @ignore_warnings(category=FutureWarning) def test_partial_dependence_pipeline(): From 136ef7988402aee1bd839eba21ea989e2c1e77cb Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 24 Feb 2020 16:27:21 -0500 Subject: [PATCH 394/448] TST Checks can now skip test based on estimator tag _xfail_test (#16510) --- sklearn/tests/test_common.py | 7 ------- sklearn/utils/estimator_checks.py | 33 ++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 12 
deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 508356155aaf7..d769bb630bd03 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -93,13 +93,6 @@ def test_estimators(estimator, check, request): ConvergenceWarning, UserWarning, FutureWarning)): _set_checking_parameters(estimator) - - xfail_checks = _safe_tags(estimator, '_xfail_test') - check_name = _set_check_estimator_ids(check) - if xfail_checks: - if check_name in xfail_checks: - msg = xfail_checks[check_name] - request.applymarker(pytest.mark.xfail(reason=msg)) check(estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1e86f68d4ca3c..d376f1edb3097 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -356,6 +356,23 @@ def _generate_class_checks(Estimator): yield from _generate_instance_checks(name, estimator) +def _mark_xfail_checks(estimator, check, pytest): + """Mark estimator check pairs with xfail""" + + xfail_checks = _safe_tags(estimator, '_xfail_test') + if not xfail_checks: + return estimator, check + + check_name = _set_check_estimator_ids(check) + msg = xfail_checks.get(check_name, None) + + if msg is None: + return estimator, check + + return pytest.param( + estimator, check, marks=pytest.mark.xfail(reason=msg)) + + def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. @@ -374,11 +391,17 @@ def parametrize_with_checks(estimators): decorator : `pytest.mark.parametrize` """ import pytest - return pytest.mark.parametrize( - "estimator, check", - chain.from_iterable(check_estimator(estimator, generate_only=True) - for estimator in estimators), - ids=_set_check_estimator_ids) + + checks_generator = chain.from_iterable( + check_estimator(estimator, generate_only=True) + for estimator in estimators) + + checks_with_marks = ( + _mark_xfail_checks(estimator, check, pytest) + for estimator, check in checks_generator) + + return pytest.mark.parametrize("estimator, check", checks_with_marks, + ids=_set_check_estimator_ids) def check_estimator(Estimator, generate_only=False): From 4c29be44facbdaef188f84bdc8bf1190b2eebe07 Mon Sep 17 00:00:00 2001 From: Pete Green <42935860+plgreenLIRU@users.noreply.github.com> Date: Tue, 25 Feb 2020 08:53:50 +0000 Subject: [PATCH 395/448] FIX Predicted standard deviation values of Gaussian Processes are only within [0, 1] (#15782) * Initial work for WIP pull request * First commit of full solution and new tests * Changes from PR review * Updated changelog and normalize_y description * Updating v0.23.rst with upstream master * Small update to whatsnew Co-authored-by: Joel Nothman --- doc/whats_new/v0.23.rst | 12 ++- sklearn/gaussian_process/_gpr.py | 38 +++++++--- sklearn/gaussian_process/tests/test_gpr.py | 86 ++++++++++++++++++++-- 3 files changed, 114 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 720a19cd862f6..19fd78444aad1 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -196,7 +196,12 @@ Changelog ............................... - |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. - :pr:`15503` by :user:`Sam Dixon `. + :pr:`15503` by :user:`Sam Dixon` . + +- |Fix| Fixed bug in :class:`gaussian_process.GaussianProcessRegressor` that + caused predicted standard deviations to only be between 0 and 1 when + WhiteKernel is not used. :pr:`15782` + by :user:`plgreenLIRU`. 
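A hedged illustration of the behaviour this changelog entry describes (the data,
kernel and prediction points below are made up for the example and are not taken
from the patch)::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 5, size=(30, 1))
    y = 100 * np.sin(X.ravel())          # targets with a large scale

    gpr = GaussianProcessRegressor(kernel=RBF(), normalize_y=True).fit(X, y)

    # far away from the training data the posterior reverts to the prior;
    # with the fix the reported standard deviation is on the scale of y
    # (roughly its standard deviation) instead of being capped to [0, 1]
    X_new = np.linspace(10, 12, 5).reshape(-1, 1)
    y_mean, y_std = gpr.predict(X_new, return_std=True)
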
:mod:`sklearn.impute` ..................... @@ -218,7 +223,6 @@ Changelog :class:`tree.DecisionTreeRegressor`. :pr:`15864` by `Nicolas Hug`_. - :mod:`sklearn.linear_model` ........................... @@ -393,6 +397,6 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when - distance matrix is not square and `affinity=precomputed`. +- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when + distance matrix is not square and `affinity=precomputed`. :pr:`16257` by :user:`Simona Maggio `. diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 432e6937e1951..0c1db0d209458 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -1,7 +1,7 @@ """Gaussian processes regression. """ # Authors: Jan Hendrik Metzen -# +# Modified by: Pete Green # License: BSD 3 clause import warnings @@ -92,13 +92,14 @@ def optimizer(obj_func, initial_theta, bounds): must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed. - normalize_y : bool, default=False - Whether the target values y are normalized, i.e., the mean of the - observed target values become zero. This parameter should be set to - True if the target values' mean is expected to differ considerable from - zero. When enabled, the normalization effectively modifies the GP's - prior based on the data, which contradicts the likelihood principle; - normalization is thus disabled per default. + normalize_y : boolean, optional (default: False) + Whether the target values y are normalized, the mean and variance of + the target values are set equal to 0 and 1 respectively. This is + recommended for cases where zero-mean, unit-variance priors are used. + Note that, in this implementation, the normalisation is reversed + before the GP predictions are reported. + + .. versionchanged:: 0.23 copy_X_train : bool, default=True If True, a persistent copy of the training data is stored in the @@ -192,10 +193,14 @@ def fit(self, X, y): # Normalize target value if self.normalize_y: self._y_train_mean = np.mean(y, axis=0) - # demean y - y = y - self._y_train_mean + self._y_train_std = np.std(y, axis=0) + + # Remove mean and make unit variance + y = (y - self._y_train_mean) / self._y_train_std + else: self._y_train_mean = np.zeros(1) + self._y_train_std = 1 if np.iterable(self.alpha) \ and self.alpha.shape[0] != y.shape[0]: @@ -330,10 +335,17 @@ def predict(self, X, return_std=False, return_cov=False): else: # Predict based on GP posterior K_trans = self.kernel_(X, self.X_train_) y_mean = K_trans.dot(self.alpha_) # Line 4 (y_mean = f_star) - y_mean = self._y_train_mean + y_mean # undo normal. + + # undo normalisation + y_mean = self._y_train_std * y_mean + self._y_train_mean + if return_cov: v = cho_solve((self.L_, True), K_trans.T) # Line 5 y_cov = self.kernel_(X) - K_trans.dot(v) # Line 6 + + # undo normalisation + y_cov = y_cov * self._y_train_std**2 + return y_mean, y_cov elif return_std: # cache result of K_inv computation @@ -356,6 +368,10 @@ def predict(self, X, return_std=False, return_cov=False): warnings.warn("Predicted variances smaller than 0. 
" "Setting those variances to 0.") y_var[y_var_negative] = 0.0 + + # undo normalisation + y_var = y_var * self._y_train_std**2 + return y_mean, np.sqrt(y_var) else: return y_mean diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 4ed105db04246..4bdd94e669eb4 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -1,6 +1,7 @@ """Testing for Gaussian process regression """ # Author: Jan Hendrik Metzen +# Modified by: Pete Green # License: BSD 3 clause import sys @@ -19,7 +20,8 @@ from sklearn.utils._testing \ import (assert_array_less, assert_almost_equal, assert_raise_message, - assert_array_almost_equal, assert_array_equal) + assert_array_almost_equal, assert_array_equal, + assert_allclose) def f(x): @@ -232,33 +234,103 @@ def test_random_starts(): @pytest.mark.parametrize('kernel', kernels) def test_y_normalization(kernel): - # Test normalization of the target values in GP + """ + Test normalization of the target values in GP - # Fitting non-normalizing GP on normalized y and fitting normalizing GP - # on unnormalized y should yield identical results - y_mean = y.mean(0) - y_norm = y - y_mean + Fitting non-normalizing GP on normalized y and fitting normalizing GP + on unnormalized y should yield identical results. Note that, here, + 'normalized y' refers to y that has been made zero mean and unit + variance. + + """ + + y_mean = np.mean(y) + y_std = np.std(y) + y_norm = (y - y_mean) / y_std # Fit non-normalizing GP on normalized y gpr = GaussianProcessRegressor(kernel=kernel) gpr.fit(X, y_norm) + # Fit normalizing GP on unnormalized y gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) gpr_norm.fit(X, y) # Compare predicted mean, std-devs and covariances y_pred, y_pred_std = gpr.predict(X2, return_std=True) - y_pred = y_mean + y_pred + y_pred = y_pred * y_std + y_mean + y_pred_std = y_pred_std * y_std y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) assert_almost_equal(y_pred, y_pred_norm) assert_almost_equal(y_pred_std, y_pred_std_norm) _, y_cov = gpr.predict(X2, return_cov=True) + y_cov = y_cov * y_std**2 _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) + assert_almost_equal(y_cov, y_cov_norm) +def test_large_variance_y(): + """ + Here we test that, when noramlize_y=True, our GP can produce a + sensible fit to training data whose variance is significantly + larger than unity. This test was made in response to issue #15612. + + GP predictions are verified against predictions that were made + using GPy which, here, is treated as the 'gold standard'. Note that we + only investigate the RBF kernel here, as that is what was used in the + GPy implementation. + + The following code can be used to recreate the GPy data: + + -------------------------------------------------------------------------- + import GPy + + kernel_gpy = GPy.kern.RBF(input_dim=1, lengthscale=1.) 
+ gpy = GPy.models.GPRegression(X, np.vstack(y_large), kernel_gpy) + gpy.optimize() + y_pred_gpy, y_var_gpy = gpy.predict(X2) + y_pred_std_gpy = np.sqrt(y_var_gpy) + -------------------------------------------------------------------------- + """ + + # Here we utilise a larger variance version of the training data + y_large = 10 * y + + # Standard GP with normalize_y=True + RBF_params = {'length_scale': 1.0} + kernel = RBF(**RBF_params) + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y_large) + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + + # 'Gold standard' mean predictions from GPy + y_pred_gpy = np.array([15.16918303, + -27.98707845, + -39.31636019, + 14.52605515, + 69.18503589]) + + # 'Gold standard' std predictions from GPy + y_pred_std_gpy = np.array([7.78860962, + 3.83179178, + 0.63149951, + 0.52745188, + 0.86170042]) + + # Based on numerical experiments, it's reasonable to expect our + # GP's mean predictions to get within 7% of predictions of those + # made by GPy. + assert_allclose(y_pred, y_pred_gpy, rtol=0.07, atol=0) + + # Based on numerical experiments, it's reasonable to expect our + # GP's std predictions to get within 15% of predictions of those + # made by GPy. + assert_allclose(y_pred_std, y_pred_std_gpy, rtol=0.15, atol=0) + + def test_y_multioutput(): # Test that GPR can deal with multi-dimensional target values y_2d = np.vstack((y, y * 2)).T From b8c402722bb0a3dc307443e719ab7b5a037d6de5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Feb 2020 14:00:22 +0100 Subject: [PATCH 396/448] BUG avoid nan variance with sparse input in StandardScaler (#16466) --- doc/whats_new/v0.23.rst | 4 ++ sklearn/preprocessing/tests/test_data.py | 14 +++++++ sklearn/utils/sparsefuncs_fast.pyx | 37 ++++++++++-------- sklearn/utils/tests/test_sparsefuncs.py | 50 ++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 17 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 19fd78444aad1..027ca62ccd853 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -356,6 +356,10 @@ Changelog each feature with two categories. :pr:`#16245` by :user:`Rushabh Vasani `. +- |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly + computing statistics when calling `partial_fit` on sparse inputs. + :pr:`16466` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.svm` .................. 
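The scenario behind the :class:`preprocessing.StandardScaler` entry above can be
sketched as follows; it mirrors the non-regression test added further down in this
patch, and before the fix ``var_`` could end up NaN::

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import StandardScaler

    X_1 = sparse.random(5, 1, density=0.8, random_state=0)
    X_2 = sparse.random(10, 1, density=0.8, random_state=0)

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_1).partial_fit(X_2)

    # with the fix the running statistics stay finite
    np.isfinite(scaler.var_[0])
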
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index cb5959c0e49b6..95721a0508091 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2471,3 +2471,17 @@ def test_power_transformer_copy_False(method, standardize): X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is X_inv_trans + + +@pytest.mark.parametrize( + "X_2", + [sparse.random(10, 1, density=0.8, random_state=0), + sparse.csr_matrix(np.full((10, 1), fill_value=np.nan))] +) +def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16448 + X_1 = sparse.random(5, 1, density=0.8) + scaler = StandardScaler(with_mean=False) + scaler.fit(X_1).partial_fit(X_2) + assert np.isfinite(scaler.var_[0]) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index debbbebbfe204..35abccf284088 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -334,23 +334,26 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, # Next passes for i in range(n_features): - updated_n[i] = last_n[i] + new_n[i] - last_over_new_n[i] = last_n[i] / new_n[i] - - # Unnormalized stats - for i in range(n_features): - last_mean[i] *= last_n[i] - last_var[i] *= last_n[i] - new_mean[i] *= new_n[i] - new_var[i] *= new_n[i] - - # Update stats - for i in range(n_features): - updated_var[i] = (last_var[i] + new_var[i] + - last_over_new_n[i] / updated_n[i] * - (last_mean[i] / last_over_new_n[i] - new_mean[i])**2) - updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i] - updated_var[i] /= updated_n[i] + if new_n[i] > 0: + updated_n[i] = last_n[i] + new_n[i] + last_over_new_n[i] = dtype(last_n[i]) / dtype(new_n[i]) + # Unnormalized stats + last_mean[i] *= last_n[i] + last_var[i] *= last_n[i] + new_mean[i] *= new_n[i] + new_var[i] *= new_n[i] + # Update stats + updated_var[i] = ( + last_var[i] + new_var[i] + + last_over_new_n[i] / updated_n[i] * + (last_mean[i] / last_over_new_n[i] - new_mean[i])**2 + ) + updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i] + updated_var[i] /= updated_n[i] + else: + updated_var[i] = last_var[i] + updated_mean[i] = last_mean[i] + updated_n[i] = last_n[i] return updated_mean, updated_var, updated_n diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 56e14819a8e26..ddb569f457249 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -151,6 +151,56 @@ def test_incr_mean_variance_axis(): assert_array_equal(X.shape[axis], n_incr) +@pytest.mark.parametrize( + "X1, X2", + [ + (sp.random(5, 2, density=0.8, format='csr', random_state=0), + sp.random(13, 2, density=0.8, format='csr', random_state=0)), + (sp.random(5, 2, density=0.8, format='csr', random_state=0), + sp.hstack([sp.csr_matrix(np.full((13, 1), fill_value=np.nan)), + sp.random(13, 1, density=0.8, random_state=42)], + format="csr")) + ] +) +def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16448 + # check that computing the incremental mean and variance is equivalent to + # computing the mean and variance on the stacked dataset. 
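For readers unfamiliar with the update rule used in the Cython code above, a plain
NumPy sketch of the same idea (including the new "empty batch" branch that keeps the
old statistics) might look like the following; the helper name and toy data are
invented for the example::

    import numpy as np

    def combine(mean_a, var_a, n_a, mean_b, var_b, n_b):
        # merge the mean/variance of two batches (Chan et al. style update)
        if n_b == 0:                      # nothing new: keep old statistics
            return mean_a, var_a, n_a
        n = n_a + n_b
        delta = mean_b - mean_a
        mean = mean_a + delta * n_b / n
        m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
        return mean, m2 / n, n

    rng = np.random.RandomState(0)
    a, b = rng.normal(size=20), rng.normal(size=30)
    mean, var, n = combine(a.mean(), a.var(), a.size, b.mean(), b.var(), b.size)
    # matches the statistics of the stacked data
    np.isclose(mean, np.r_[a, b].mean()), np.isclose(var, np.r_[a, b].var())
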
+ axis = 0 + last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1]) + last_n = np.zeros(X1.shape[1], dtype=np.int64) + updated_mean, updated_var, updated_n = incr_mean_variance_axis( + X1, axis, last_mean, last_var, last_n + ) + updated_mean, updated_var, updated_n = incr_mean_variance_axis( + X2, axis, updated_mean, updated_var, updated_n + ) + X = sp.vstack([X1, X2]) + assert_allclose(updated_mean, np.nanmean(X.A, axis=axis)) + assert_allclose(updated_var, np.nanvar(X.A, axis=axis)) + assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.A), axis=0)) + + +def test_incr_mean_variance_no_new_n(): + # check the behaviour when we update the variance with an empty matrix + axis = 0 + X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr() + X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr() + last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1]) + last_n = np.zeros(X1.shape[1], dtype=np.int64) + last_mean, last_var, last_n = incr_mean_variance_axis( + X1, axis, last_mean, last_var, last_n + ) + # update statistic with a column which should ignored + updated_mean, updated_var, updated_n = incr_mean_variance_axis( + X2, axis, last_mean, last_var, last_n + ) + assert_allclose(updated_mean, last_mean) + assert_allclose(updated_var, last_var) + assert_allclose(updated_n, last_n) + + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix]) def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor): From 54cbf428a963dde20b12b54e97d849775b8eb991 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 26 Feb 2020 14:05:18 +0100 Subject: [PATCH 397/448] DOC Note on commit co-authorship when merging PRs (#16550) --- doc/developers/maintainer.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index f400989a7d877..f38514ec075b7 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -213,6 +213,24 @@ The following GitHub checklist might be helpful in a release PR:: #15847) * [ ] announce on mailing list and on twitter +Merging Pull Requests +--------------------- + +Individual commits are squashed when a Pull Request (PR) is merged on Github. +Before merging, + +- the resulting commit title can be edited if necessary. Note + that this will rename the PR title by default. +- the detailed description, containing the titles of all the commits, can + be edited or deleted. +- for PRs with multiple code contributors care must be taken to keep + the `Co-authored-by: name ` tags in the detailed + description. This will mark the PR as having `multiple co-authors + `_. + Whether code contributions are significanly enough to merit co-authorship is + left to the maintainer's discretion, same as for the "what's new" entry. 
+ + The scikit-learn.org web site ----------------------------- From ca78d75e751d8d57c08fb48fc8f437d18d454e43 Mon Sep 17 00:00:00 2001 From: CastaChick Date: Thu, 27 Feb 2020 18:04:04 +0900 Subject: [PATCH 398/448] DOC Fixed documents that refer to Bunch object #16438 (#16447) * Added links to utils.Bunch and fixed format of the docstring in datasets * Added links to utils.Bunch in sklearn.compose * Added links to utils.Bunch in sklearn.tree * Added links to utils.Bunch in sklearn.ensemble * Added links to utils.Bunch in sklearn.inspection * Added links to utils.Bunch in sklearn.pipeline * modified docstring of Bunch * Added links to utils.Bunch to index.rst of sklearn.datasets * Fixed some docstrings because the lines are too long * Fixed some points as reviewed. * Add links and delete 'for more information...' * Fixed indent * Fixed forgotten points. * Fixed some points as reviewed. --- doc/datasets/index.rst | 60 +++++++------- sklearn/compose/_column_transformer.py | 2 +- sklearn/datasets/_base.py | 78 ++++++++++++------- sklearn/datasets/_california_housing.py | 29 ++++--- sklearn/datasets/_covtype.py | 22 +++--- sklearn/datasets/_kddcup99.py | 24 +++--- sklearn/datasets/_lfw.py | 73 +++++++++-------- sklearn/datasets/_olivetti_faces.py | 24 +++--- sklearn/datasets/_openml.py | 4 +- sklearn/datasets/_rcv1.py | 31 ++++---- sklearn/datasets/_species_distributions.py | 47 ++++++----- sklearn/datasets/_twenty_newsgroups.py | 37 +++++---- sklearn/ensemble/_stacking.py | 5 +- sklearn/ensemble/_voting.py | 3 +- sklearn/inspection/_permutation_importance.py | 4 +- sklearn/pipeline.py | 3 +- sklearn/tree/_classes.py | 4 +- sklearn/utils/__init__.py | 4 +- 18 files changed, 249 insertions(+), 205 deletions(-) diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 2fb7e84610833..88ae88d7a3151 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -21,46 +21,50 @@ also possible to generate synthetic data. General dataset API =================== -There are three main kinds of dataset interfaces that can be used to get +There are three main kinds of dataset interfaces that can be used to get datasets depending on the desired type of dataset. - -**The dataset loaders.** They can be used to load small standard datasets, -described in the :ref:`toy_datasets` section. + +**The dataset loaders.** They can be used to load small standard datasets, +described in the :ref:`toy_datasets` section. **The dataset fetchers.** They can be used to download and load larger datasets, described in the :ref:`real_world_datasets` section. -Both loaders and fetchers functions return a dictionary-like object holding -at least two items: an array of shape ``n_samples`` * ``n_features`` with -key ``data`` (except for 20newsgroups) and a numpy array of +Both loaders and fetchers functions return a :class:`sklearn.utils.Bunch` +object holding at least two items: +an array of shape ``n_samples`` * ``n_features`` with +key ``data`` (except for 20newsgroups) and a numpy array of length ``n_samples``, containing the target values, with key ``target``. +The Bunch object is a dictionary that exposes its keys are attributes. +For more information about Bunch object, see :class:`sklearn.utils.Bunch`: + It's also possible for almost all of these function to constrain the output -to be a tuple containing only the data and the target, by setting the +to be a tuple containing only the data and the target, by setting the ``return_X_y`` parameter to ``True``. 
-The datasets also contain a full description in their ``DESCR`` attribute and -some contain ``feature_names`` and ``target_names``. See the dataset -descriptions below for details. +The datasets also contain a full description in their ``DESCR`` attribute and +some contain ``feature_names`` and ``target_names``. See the dataset +descriptions below for details. -**The dataset generation functions.** They can be used to generate controlled +**The dataset generation functions.** They can be used to generate controlled synthetic datasets, described in the :ref:`sample_generators` section. These functions return a tuple ``(X, y)`` consisting of a ``n_samples`` * ``n_features`` numpy array ``X`` and an array of length ``n_samples`` containing the targets ``y``. -In addition, there are also miscellaneous tools to load datasets of other +In addition, there are also miscellaneous tools to load datasets of other formats or from other locations, described in the :ref:`loading_other_datasets` -section. +section. .. _toy_datasets: Toy datasets ============ -scikit-learn comes with a few small standard datasets that do not require to -download any file from some external website. +scikit-learn comes with a few small standard datasets that do not require to +download any file from some external website. They can be loaded using the following functions: @@ -484,17 +488,17 @@ Loading from external datasets scikit-learn works on any numeric data stored as numpy arrays or scipy sparse matrices. Other types that are convertible to numeric arrays such as pandas DataFrame are also acceptable. - -Here are some recommended ways to load standard columnar data into a -format usable by scikit-learn: -* `pandas.io `_ +Here are some recommended ways to load standard columnar data into a +format usable by scikit-learn: + +* `pandas.io `_ provides tools to read data from common formats including CSV, Excel, JSON and SQL. DataFrames may also be constructed from lists of tuples or dicts. Pandas handles heterogeneous data smoothly and provides tools for manipulation and conversion into a numeric array suitable for scikit-learn. -* `scipy.io `_ - specializes in binary formats often used in scientific computing +* `scipy.io `_ + specializes in binary formats often used in scientific computing context such as .mat and .arff * `numpy/routines.io `_ for standard loading of columnar data into numpy arrays @@ -508,18 +512,18 @@ For some miscellaneous data such as images, videos, and audio, you may wish to refer to: * `skimage.io `_ or - `Imageio `_ + `Imageio `_ for loading images and videos into numpy arrays -* `scipy.io.wavfile.read - `_ +* `scipy.io.wavfile.read + `_ for reading WAV files into a numpy array -Categorical (or nominal) features stored as strings (common in pandas DataFrames) +Categorical (or nominal) features stored as strings (common in pandas DataFrames) will need converting to numerical features using :class:`sklearn.preprocessing.OneHotEncoder` or :class:`sklearn.preprocessing.OrdinalEncoder` or similar. See :ref:`preprocessing`. -Note: if you manage your own numerical data it is recommended to use an +Note: if you manage your own numerical data it is recommended to use an optimized file format such as HDF5 to reduce data load times. Various libraries -such as H5Py, PyTables and pandas provides a Python interface for reading and +such as H5Py, PyTables and pandas provides a Python interface for reading and writing data in that format. 
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 89e816833b5f6..f5526ec185875 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -124,7 +124,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): ``len(transformers_)==len(transformers)+1``, otherwise ``len(transformers_)==len(transformers)``. - named_transformers_ : Bunch + named_transformers_ : :class:`~sklearn.utils.Bunch` Read-only attribute to access any transformer by given name. Keys are transformer names and values are the fitted transformer objects. diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 128452c6d36af..909470f980a5e 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -163,12 +163,20 @@ def load_files(container_path, description=None, categories=None, Returns ------- - data : Bunch - Dictionary-like object, the interesting attributes are: either - data, the raw text data to learn, or 'filenames', the files - holding it, 'target', the classification labels (integer index), - 'target_names', the meaning of the labels, and 'DESCR', the full - description of the dataset. + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list of str + Only present when `load_content=True`. + The raw text data to learn. + target : ndarray + The target labels (integer index). + target_names : list + The names of target classes. + DESCR : str + The full description of the dataset. + filenames: ndarray + The filenames holding the dataset. """ target = [] target_names = [] @@ -295,8 +303,8 @@ def load_wine(return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (178, 13) The data matrix. If `as_frame=True`, `data` will be a pandas @@ -409,8 +417,8 @@ def load_iris(return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (150, 4) The data matrix. If `as_frame=True`, `data` will be a pandas @@ -521,8 +529,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (569, 30) The data matrix. If `as_frame=True`, `data` will be a pandas @@ -645,8 +653,8 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (1797, 64) The flattened data matrix. If `as_frame=True`, `data` will be @@ -759,8 +767,8 @@ def load_diabetes(return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (442, 10) The data matrix. 
If `as_frame=True`, `data` will be a pandas @@ -853,8 +861,8 @@ def load_linnerud(return_X_y=False, as_frame=False): Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : {ndarray, dataframe} of shape (20, 3) The data matrix. If `as_frame=True`, `data` will be a pandas @@ -943,12 +951,21 @@ def load_boston(return_X_y=False): Returns ------- - data : Bunch - Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the regression targets, - 'DESCR', the full description of the dataset, - and 'filename', the physical location of boston - csv dataset (added in version `0.20`). + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (506, 13) + The data matrix. + target : ndarray of shape (506, ) + The regression target. + filename : str + The physical location of boston csv dataset. + + .. versionadded:: 0.20 + DESCR : str + The full description of the dataset. + feature_names : ndarray + The names of features (data, target) : tuple if ``return_X_y`` is True @@ -1007,10 +1024,15 @@ def load_sample_images(): Returns ------- - data : Bunch - Dictionary-like object with the following attributes : 'images', the - two sample images, 'filenames', the file names for the images, and - 'DESCR' the full description of the dataset. + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + images : list of ndarray of shape (427, 640, 3) + The two sample image. + filenames : list + The filenames for the images. + DESCR : str + The full description of the dataset. Examples -------- diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index 958184369b63d..e3df2124aab2b 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -87,21 +87,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True, Returns ------- - dataset : dict-like object with the following attributes: - - dataset.data : ndarray, shape [20640, 8] - Each row corresponding to the 8 feature values in order. - If ``as_frame`` is True, ``data`` is a pandas object. - - dataset.target : numpy array of shape (20640,) - Each value corresponds to the average house value in units of 100,000. - If ``as_frame`` is True, ``target`` is a pandas object. - - dataset.feature_names : array of length 8 - Array of ordered feature names used in the dataset. - - dataset.DESCR : string - Description of the California housing dataset. + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray, shape (20640, 8) + Each row corresponding to the 8 feature values in order. + If ``as_frame`` is True, ``data`` is a pandas object. + target : numpy array of shape (20640,) + Each value corresponds to the average + house value in units of 100,000. + If ``as_frame`` is True, ``target`` is a pandas object. + feature_names : list of length 8 + Array of ordered feature names used in the dataset. + DESCR : string + Description of the California housing dataset. 
(data, target) : tuple if ``return_X_y`` is True diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 367ec1f9e2970..6b23f913e05a7 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -81,17 +81,17 @@ def fetch_covtype(data_home=None, download_if_missing=True, Returns ------- - dataset : dict-like object with the following attributes: - - dataset.data : numpy array of shape (581012, 54) - Each row corresponds to the 54 features in the dataset. - - dataset.target : numpy array of shape (581012,) - Each value corresponds to one of the 7 forest covertypes with values - ranging between 1 to 7. - - dataset.DESCR : string - Description of the forest covertype dataset. + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : numpy array of shape (581012, 54) + Each row corresponds to the 54 features in the dataset. + target : numpy array of shape (581012,) + Each value corresponds to one of + the 7 forest covertypes with values + ranging between 1 to 7. + DESCR : str + Description of the forest covertype dataset. (data, target) : tuple if ``return_X_y`` is True diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 4585df8b0fb8b..c0ba00fa46f04 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -96,11 +96,15 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, Returns ------- - data : Bunch - Dictionary-like object, the interesting attributes are: - - 'data', the data to learn. - - 'target', the regression target for each sample. - - 'DESCR', a description of the dataset. + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (494021, 41) + The data matrix to learn. + target : ndarray of shape (494021,) + The regression target for each sample. + DESCR : str + The full description of the dataset. (data, target) : tuple if ``return_X_y`` is True @@ -190,13 +194,15 @@ def _fetch_brute_kddcup99(data_home=None, Returns ------- - dataset : dict-like object with the following attributes: - dataset.data : numpy array of shape (494021, 41) + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. - dataset.target : numpy array of shape (494021,) + target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. - dataset.DESCR : string + DESCR : string Description of the kddcup99 dataset. """ diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index 0cb65b3221039..b5efd68adbd1c 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -272,24 +272,23 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, Returns ------- - dataset : dict-like object with the following attributes: - - dataset.data : numpy array of shape (13233, 2914) - Each row corresponds to a ravelled face image of original size 62 x 47 - pixels. Changing the ``slice_`` or resize parameters will change the - shape of the output. - - dataset.images : numpy array of shape (13233, 62, 47) - Each row is a face image corresponding to one of the 5749 people in - the dataset. Changing the ``slice_`` or resize parameters will change - the shape of the output. - - dataset.target : numpy array of shape (13233,) - Labels associated to each face image. 
Those labels range from 0-5748 - and correspond to the person IDs. - - dataset.DESCR : string - Description of the Labeled Faces in the Wild (LFW) dataset. + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : numpy array of shape (13233, 2914) + Each row corresponds to a ravelled face image + of original size 62 x 47 pixels. + Changing the ``slice_`` or resize parameters will change the + shape of the output. + images : numpy array of shape (13233, 62, 47) + Each row is a face image corresponding to one of the 5749 people in + the dataset. Changing the ``slice_`` + or resize parameters will change the shape of the output. + target : numpy array of shape (13233,) + Labels associated to each face image. + Those labels range from 0-5748 and correspond to the person IDs. + DESCR : string + Description of the Labeled Faces in the Wild (LFW) dataset. (data, target) : tuple if ``return_X_y`` is True @@ -446,25 +445,25 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, Returns ------- - The data is returned as a Bunch object with the following attributes: - - data : numpy array of shape (2200, 5828). Shape depends on ``subset``. - Each row corresponds to 2 ravel'd face images of original size 62 x 47 - pixels. Changing the ``slice_``, ``resize`` or ``subset`` parameters - will change the shape of the output. - - pairs : numpy array of shape (2200, 2, 62, 47). Shape depends on ``subset`` - Each row has 2 face images corresponding to same or different person - from the dataset containing 5749 people. Changing the ``slice_``, - ``resize`` or ``subset`` parameters will change the shape of the - output. - - target : numpy array of shape (2200,). Shape depends on ``subset``. - Labels associated to each pair of images. The two label values being - different persons or the same person. - - DESCR : string - Description of the Labeled Faces in the Wild (LFW) dataset. + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (2200, 5828). Shape depends on ``subset``. + Each row corresponds to 2 ravel'd face images + of original size 62 x 47 pixels. + Changing the ``slice_``, ``resize`` or ``subset`` parameters + will change the shape of the output. + pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset`` + Each row has 2 face images corresponding + to same or different person from the dataset + containing 5749 people. Changing the ``slice_``, + ``resize`` or ``subset`` parameters will change the shape of the + output. + target : numpy array of shape (2200,). Shape depends on ``subset``. + Labels associated to each pair of images. + The two label values being different persons or the same person. + DESCR : string + Description of the Labeled Faces in the Wild (LFW) dataset. """ lfw_home, data_folder_path = _check_fetch_lfw( diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index d1a9805b495f2..d5f163d468214 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -77,15 +77,21 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, Returns ------- - bunch : Bunch object with the following attributes: - - data: ndarray, shape (400, 4096). Each row corresponds to a ravelled - face image of original size 64 x 64 pixels. - - images : ndarray, shape (400, 64, 64). Each row is a face image - corresponding to one of the 40 subjects of the dataset. 
- - target : ndarray, shape (400,). Labels associated to each face image. - Those labels are ranging from 0-39 and correspond to the - Subject IDs. - - DESCR : string. Description of the modified Olivetti Faces Dataset. + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data: ndarray, shape (400, 4096) + Each row corresponds to a ravelled + face image of original size 64 x 64 pixels. + images : ndarray, shape (400, 64, 64) + Each row is a face image + corresponding to one of the 40 subjects of the dataset. + target : ndarray, shape (400,) + Labels associated to each face image. + Those labels are ranging from 0-39 and correspond to the + Subject IDs. + DESCR : str + Description of the modified Olivetti Faces Dataset. (data, target) : tuple if `return_X_y=True` .. versionadded:: 0.22 diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 2c2b194f9ef71..cef0e6cb1f411 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -579,8 +579,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame The feature matrix. Categorical features are encoded as ordinals. diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index d930a347b7f7c..4f1c5cc4af199 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -127,23 +127,20 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, Returns ------- - dataset : dict-like object with the following attributes: - - dataset.data : scipy csr array, dtype np.float64, shape (804414, 47236) - The array has 0.16% of non zero values. - - dataset.target : scipy csr array, dtype np.uint8, shape (804414, 103) - Each sample has a value of 1 in its categories, and 0 in others. - The array has 3.15% of non zero values. - - dataset.sample_id : numpy array, dtype np.uint32, shape (804414,) - Identification number of each sample, as ordered in dataset.data. - - dataset.target_names : numpy array, dtype object, length (103) - Names of each target (RCV1 topics), as ordered in dataset.target. - - dataset.DESCR : string - Description of the RCV1 dataset. + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : scipy csr array, dtype np.float64, shape (804414, 47236) + The array has 0.16% of non zero values. + target : scipy csr array, dtype np.uint8, shape (804414, 103) + Each sample has a value of 1 in its categories, and 0 in others. + The array has 3.15% of non zero values. + sample_id : numpy array, dtype np.uint32, shape (804414,) + Identification number of each sample, as ordered in dataset.data. + target_names : numpy array, dtype object, length (103) + Names of each target (RCV1 topics), as ordered in dataset.target. + DESCR : string + Description of the RCV1 dataset. 
(data, target) : tuple if ``return_X_y`` is True diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 7800dfce2c190..7f621d1de74eb 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -155,31 +155,28 @@ def fetch_species_distributions(data_home=None, Returns ------- - The data is returned as a Bunch object with the following attributes: - - coverages : array, shape = [14, 1592, 1212] - These represent the 14 features measured at each point of the map grid. - The latitude/longitude values for the grid are discussed below. - Missing data is represented by the value -9999. - - train : record array, shape = (1624,) - The training points for the data. Each point has three fields: - - - train['species'] is the species name - - train['dd long'] is the longitude, in degrees - - train['dd lat'] is the latitude, in degrees - - test : record array, shape = (620,) - The test points for the data. Same format as the training data. - - Nx, Ny : integers - The number of longitudes (x) and latitudes (y) in the grid - - x_left_lower_corner, y_left_lower_corner : floats - The (x,y) position of the lower-left corner, in degrees - - grid_size : float - The spacing between points of the grid, in degrees + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + coverages : array, shape = [14, 1592, 1212] + These represent the 14 features measured + at each point of the map grid. + The latitude/longitude values for the grid are discussed below. + Missing data is represented by the value -9999. + train : record array, shape = (1624,) + The training points for the data. Each point has three fields: + + - train['species'] is the species name + - train['dd long'] is the longitude, in degrees + - train['dd lat'] is the latitude, in degrees + test : record array, shape = (620,) + The test points for the data. Same format as the training data. + Nx, Ny : integers + The number of longitudes (x) and latitudes (y) in the grid + x_left_lower_corner, y_left_lower_corner : floats + The (x,y) position of the lower-left corner, in degrees + grid_size : float + The spacing between points of the grid, in degrees References ---------- diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index a1edc08019c85..ebbd191069c49 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -214,13 +214,19 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, Returns ------- - bunch : Bunch object with the following attribute: - - data: list, length [n_samples] - - target: array, shape [n_samples] - - filenames: list, length [n_samples] - - DESCR: a description of the dataset. - - target_names: a list of categories of the returned data, - length [n_classes]. This depends on the `categories` parameter. + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list, length [n_samples] + The data list to learn. + target: array, shape [n_samples] + The target labels. + filenames: list, length [n_samples] + The path to the location of the data. + DESCR: str + The full description of the dataset. + target_names: list, length [n_classes] + The names of target classes. (data, target) : tuple if `return_X_y=True` .. 
versionadded:: 0.22 @@ -384,12 +390,17 @@ def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None, Returns ------- - bunch : Bunch object with the following attribute: - - bunch.data: sparse matrix, shape [n_samples, n_features] - - bunch.target: array, shape [n_samples] - - bunch.target_names: a list of categories of the returned data, - length [n_classes]. - - bunch.DESCR: a description of the dataset. + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data: sparse matrix, shape [n_samples, n_features] + The data matrix to learn. + target: array, shape [n_samples] + The target labels. + target_names: list, length [n_classes] + The names of target classes. + DESCR: str + The full description of the dataset. (data, target) : tuple if ``return_X_y`` is True diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 7af2d6dc95d2b..6b206dfbe3d02 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -323,7 +323,7 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): training data. If an estimator has been set to `'drop'`, it will not appear in `estimators_`. - named_estimators_ : Bunch + named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. final_estimator_ : estimator @@ -571,9 +571,10 @@ class StackingRegressor(RegressorMixin, _BaseStacking): training data. If an estimator has been set to `'drop'`, it will not appear in `estimators_`. - named_estimators_ : Bunch + named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. + final_estimator_ : estimator The regressor to stacked the base estimators fitted. diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 80b5f30c42bcd..f79870a2c6891 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -142,9 +142,10 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): The collection of fitted sub-estimators as defined in ``estimators`` that are not 'drop'. - named_estimators_ : Bunch + named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. + .. versionadded:: 0.20 classes_ : array-like of shape (n_predictions,) diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index b95e5d11fad9d..ff4d9d6738977 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -86,8 +86,8 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, Returns ------- - result : Bunch - Dictionary-like object, with attributes: + result : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. importances_mean : ndarray, shape (n_features, ) Mean of feature importance over `n_repeats`. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index eb42c43a98905..d387206288bf5 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -72,7 +72,8 @@ class Pipeline(_BaseComposition): Attributes ---------- - named_steps : bunch object, a dictionary with attribute access + named_steps : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. 
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 45ad326b2d742..41d09ead9aec4 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -545,8 +545,8 @@ def cost_complexity_pruning_path(self, X, y, sample_weight=None): Returns ------- - ccp_path : Bunch - Dictionary-like object, with attributes: + ccp_path : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. ccp_alphas : ndarray Effective alphas of subtree during pruning. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index f24669b105806..4b69365339389 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -64,7 +64,8 @@ class Bunch(dict): They extend dictionaries by enabling values to be accessed by key, `bunch["value_key"]`, or by an attribute, `bunch.value_key`. - + Examples + -------- >>> b = Bunch(a=1, b=2) >>> b['b'] 2 @@ -76,7 +77,6 @@ class Bunch(dict): >>> b.c = 6 >>> b['c'] 6 - """ def __init__(self, **kwargs): From 9b1928dbc233c052366f5686432e017022bcaa5d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 27 Feb 2020 06:58:21 -0500 Subject: [PATCH 399/448] MNT Adds autolabler for modules (#16520) * MNT Adds labeler * BUG Fix * Double quotes are better * BUG Fix * MNT Adds build ci tag * MNT Use fork for new feature --- .github/labeler.yml | 88 +++++++++++++++++++++++++++++++++++ .github/workflows/labeler.yml | 12 +++++ 2 files changed, 100 insertions(+) create mode 100644 .github/labeler.yml create mode 100644 .github/workflows/labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000000000..107f94627cb13 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,88 @@ +module:cluster: +- sklearn/cluster/**/* + +module:common: +- sklearn/common/**/* + +module:compose: +- sklearn/compose/**/* + +module:covariance: +- sklearn/covariance/**/* + +module:cross_decomposition: +- sklearn/cross_decomposition/**/* + +module:datasets: +- sklearn/datasets/**/* + +module:decomposition: +- sklearn/decomposition/**/* + +module:ensemble: +- sklearn/ensemble/**/* + +module:feature_extraction: +- sklearn/feature_extraction/**/* + +module:feature_selection: +- sklearn/feature_selection/**/* + +module:gaussian_process: +- sklearn/gaussian_process/**/* + +module:impute: +- sklearn/impute/**/* + +module:inspection: +- sklearn/inspection/**/* + +module:linear_model: +- sklearn/linear_model/**/* + +module:manifold: +- sklearn/manifold/**/* + +module:metrics: +- sklearn/metrics/**/* + +module:mixture: +- sklearn/mixture/**/* + +module:model_selection: +- sklearn/model_selection/**/* + +module:naive_bayes: +- sklearn/naive_bayes.py + +module:neighbors: +- sklearn/neighbors/**/* + +module:neural_network: +- sklearn/neural_network/**/* + +module:pipeline: +- sklearn/pipeline.py + +module:preprocessing: +- sklearn/preprocessing/**/* + +module:semi_supervised: +- sklearn/semi_supervised/**/* + +module:svm: +- sklearn/svm/**/* + +module:tree: +- sklearn/tree/**/* + +module:utils: +- sklearn/utils/**/* + +Build / CI: +- build_tools/**/* +- .github/**/* +- maint_tools/**/* +- azure-pipelines.yml +- .travis.yml +- pyproject.toml diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000000000..65ea75b1e7ad5 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,12 @@ +name: "Pull Request Labeler" +on: +- pull_request + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: thomasjpfan/labeler@master + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" 
+ max-labels: '3' From 8caa93889f85254fc3ca84caa0a24a1640eebdd1 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Thu, 27 Feb 2020 13:53:41 +0100 Subject: [PATCH 400/448] API make cluster's __init__ parameters kwonly (#16501) --- sklearn/cluster/_affinity_propagation.py | 6 ++--- sklearn/cluster/_agglomerative.py | 10 ++++----- sklearn/cluster/_bicluster.py | 9 +++++--- sklearn/cluster/_birch.py | 28 ++++++++++++++---------- sklearn/cluster/_dbscan.py | 6 ++--- sklearn/cluster/_kmeans.py | 10 ++++----- sklearn/cluster/_mean_shift.py | 5 +++-- sklearn/cluster/_optics.py | 9 ++++---- sklearn/cluster/_spectral.py | 6 ++--- 9 files changed, 50 insertions(+), 39 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index eaba9ccf1ec20..5f9ce02d869b7 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -11,7 +11,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array, check_array -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..metrics import euclidean_distances from ..metrics import pairwise_distances_argmin @@ -334,8 +334,8 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages Between Data Points", Science Feb. 2007 """ - - def __init__(self, damping=.5, max_iter=200, convergence_iter=15, + @_deprecate_positional_args + def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False): diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 2cb2dd25e4839..6bc5d30adee2b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator, ClusterMixin from ..metrics.pairwise import paired_distances, pairwise_distances from ..utils import check_array -from ..utils.validation import check_memory +from ..utils.validation import check_memory, _deprecate_positional_args from ..neighbors import DistanceMetric from ..neighbors._dist_metrics import METRIC_MAPPING @@ -780,8 +780,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): array([1, 1, 1, 0, 0, 0]) """ - - def __init__(self, n_clusters=2, affinity="euclidean", + @_deprecate_positional_args + def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None): @@ -1029,8 +1029,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): >>> X_reduced.shape (1797, 32) """ - - def __init__(self, n_clusters=2, affinity="euclidean", + @_deprecate_positional_args + def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', pooling_func=np.mean, diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 7fb11c1033981..356072f2d33aa 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -18,7 +18,8 @@ from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) -from ..utils.validation import assert_all_finite, check_array +from ..utils.validation import (assert_all_finite, check_array, + _deprecate_positional_args) __all__ = ['SpectralCoclustering', @@ -284,7 +285,8 @@ class 
SpectralCoclustering(BaseSpectral): `__. """ - def __init__(self, n_clusters=3, svd_method='randomized', + @_deprecate_positional_args + def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, n_jobs='deprecated', random_state=None): super().__init__(n_clusters, @@ -435,7 +437,8 @@ class SpectralBiclustering(BaseSpectral): `__. """ - def __init__(self, n_clusters=3, method='bistochastic', + @_deprecate_positional_args + def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, n_jobs='deprecated', random_state=None): diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index b2cbd8208969d..68c07cc5a2860 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -13,8 +13,8 @@ from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator from ..utils import check_array -from ..utils.extmath import row_norms, safe_sparse_dot -from ..utils.validation import check_is_fitted +from ..utils.extmath import row_norms +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..exceptions import ConvergenceWarning from . import AgglomerativeClustering @@ -50,10 +50,12 @@ def _split_node(node, threshold, branching_factor): new_subcluster1 = _CFSubcluster() new_subcluster2 = _CFSubcluster() new_node1 = _CFNode( - threshold, branching_factor, is_leaf=node.is_leaf, + threshold=threshold, branching_factor=branching_factor, + is_leaf=node.is_leaf, n_features=node.n_features) new_node2 = _CFNode( - threshold, branching_factor, is_leaf=node.is_leaf, + threshold=threshold, branching_factor=branching_factor, + is_leaf=node.is_leaf, n_features=node.n_features) new_subcluster1.child_ = new_node1 new_subcluster2.child_ = new_node2 @@ -134,7 +136,7 @@ class _CFNode: view of ``init_sq_norm_``. """ - def __init__(self, threshold, branching_factor, is_leaf, n_features): + def __init__(self, *, threshold, branching_factor, is_leaf, n_features): self.threshold = threshold self.branching_factor = branching_factor self.is_leaf = is_leaf @@ -275,7 +277,7 @@ class _CFSubcluster: Squared norm of the subcluster. Used to prevent recomputing when pairwise minimum distances are computed. """ - def __init__(self, linear_sum=None): + def __init__(self, *, linear_sum=None): if linear_sum is None: self.n_samples_ = 0 self.squared_sum_ = 0.0 @@ -431,8 +433,8 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): >>> brc.predict(X) array([0, 0, 0, 1, 1, 1]) """ - - def __init__(self, threshold=0.5, branching_factor=50, n_clusters=3, + @_deprecate_positional_args + def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True): self.threshold = threshold self.branching_factor = branching_factor @@ -475,11 +477,14 @@ def _fit(self, X): has_root = getattr(self, 'root_', None) if getattr(self, 'fit_') or (partial_fit and not has_root): # The first root is the leaf. Manipulate this object throughout. - self.root_ = _CFNode(threshold, branching_factor, is_leaf=True, + self.root_ = _CFNode(threshold=threshold, + branching_factor=branching_factor, + is_leaf=True, n_features=n_features) # To enable getting back subclusters. 
- self.dummy_leaf_ = _CFNode(threshold, branching_factor, + self.dummy_leaf_ = _CFNode(threshold=threshold, + branching_factor=branching_factor, is_leaf=True, n_features=n_features) self.dummy_leaf_.next_leaf_ = self.root_ self.root_.prev_leaf_ = self.dummy_leaf_ @@ -498,7 +503,8 @@ def _fit(self, X): new_subcluster1, new_subcluster2 = _split_node( self.root_, threshold, branching_factor) del self.root_ - self.root_ = _CFNode(threshold, branching_factor, + self.root_ = _CFNode(threshold=threshold, + branching_factor=branching_factor, is_leaf=False, n_features=n_features) self.root_.append_subcluster(new_subcluster1) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index dd1de3043d444..92a2b0f716ac7 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -15,7 +15,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_array -from ..utils.validation import _check_sample_weight +from ..utils.validation import _check_sample_weight, _deprecate_positional_args from ..neighbors import NearestNeighbors from ._dbscan_inner import dbscan_inner @@ -270,8 +270,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. ACM Transactions on Database Systems (TODS), 42(3), 19. """ - - def __init__(self, eps=0.5, min_samples=5, metric='euclidean', + @_deprecate_positional_args + def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None): self.eps = eps diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7e4df5908137b..efe4c9fad23b3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -22,7 +22,7 @@ from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _num_samples +from ..utils.validation import _num_samples, _deprecate_positional_args from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state @@ -895,8 +895,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): array([[10., 2.], [ 1., 2.]]) """ - - def __init__(self, n_clusters=8, init='k-means++', n_init=10, + @_deprecate_positional_args + def __init__(self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='deprecated', verbose=0, random_state=None, copy_x=True, n_jobs='deprecated', algorithm='auto'): @@ -1553,8 +1553,8 @@ class MiniBatchKMeans(KMeans): >>> kmeans.predict([[0, 0], [4, 4]]) array([1, 0], dtype=int32) """ - - def __init__(self, n_clusters=8, init='k-means++', max_iter=100, + @_deprecate_positional_args + def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01): diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 83f655acdd7dd..5245a187bbc86 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -19,7 +19,7 @@ from joblib import Parallel, delayed from collections import defaultdict -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin from ..neighbors import 
NearestNeighbors @@ -346,7 +346,8 @@ class MeanShift(ClusterMixin, BaseEstimator): Machine Intelligence. 2002. pp. 603-619. """ - def __init__(self, bandwidth=None, seeds=None, bin_seeding=False, + @_deprecate_positional_args + def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300): self.bandwidth = bandwidth self.seeds = seeds diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index ec2c45453d2be..8e68b45d9a369 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -16,6 +16,7 @@ from ..utils import check_array from ..utils import gen_batches, get_chunk_n_rows +from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances @@ -203,10 +204,10 @@ class OPTICS(ClusterMixin, BaseEstimator): >>> clustering.labels_ array([0, 0, 0, 1, 1, 1]) """ - - def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, - metric_params=None, cluster_method='xi', eps=None, xi=0.05, - predecessor_correction=True, min_cluster_size=None, + @_deprecate_positional_args + def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski', + p=2, metric_params=None, cluster_method='xi', eps=None, + xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None): self.max_eps = max_eps self.min_samples = min_samples diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 34ece7688ec92..0f5f816a993cc 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -11,7 +11,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array -from ..utils.validation import check_array +from ..utils.validation import check_array, _deprecate_positional_args from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding @@ -433,8 +433,8 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stella X. 
Yu, Jianbo Shi https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf """ - - def __init__(self, n_clusters=8, eigen_solver=None, n_components=None, + @_deprecate_positional_args + def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1., affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None): From 496e7106fa8fff9e955620ec8b3b74d1bba59453 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 27 Feb 2020 10:10:40 -0500 Subject: [PATCH 401/448] MNT Fix autolabeler by pinning the version (#16566) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 65ea75b1e7ad5..d60fa0c1f10c0 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,7 +6,7 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: thomasjpfan/labeler@master + - uses: thomasjpfan/labeler@v2.2.0 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: '3' From 6da44dd6b37cb64202b8baed148ed83294001396 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 27 Feb 2020 23:38:52 +0100 Subject: [PATCH 402/448] MAINT Use "Bug: triage" tag for user reports (#16560) --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f980a6d167bf8..102ebd0770535 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -2,7 +2,7 @@ name: Bug report about: Create a report to help us reproduce and correct the bug title: '' -labels: Bug +labels: 'Bug: triage' assignees: '' --- From 1b00c8e87da1c72d6be161e929fbe84216a179ba Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 27 Feb 2020 20:07:47 -0500 Subject: [PATCH 403/448] MNT Removes autolabeler (#16577) --- .github/labeler.yml | 88 ----------------------------------- .github/workflows/labeler.yml | 12 ----- 2 files changed, 100 deletions(-) delete mode 100644 .github/labeler.yml delete mode 100644 .github/workflows/labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index 107f94627cb13..0000000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,88 +0,0 @@ -module:cluster: -- sklearn/cluster/**/* - -module:common: -- sklearn/common/**/* - -module:compose: -- sklearn/compose/**/* - -module:covariance: -- sklearn/covariance/**/* - -module:cross_decomposition: -- sklearn/cross_decomposition/**/* - -module:datasets: -- sklearn/datasets/**/* - -module:decomposition: -- sklearn/decomposition/**/* - -module:ensemble: -- sklearn/ensemble/**/* - -module:feature_extraction: -- sklearn/feature_extraction/**/* - -module:feature_selection: -- sklearn/feature_selection/**/* - -module:gaussian_process: -- sklearn/gaussian_process/**/* - -module:impute: -- sklearn/impute/**/* - -module:inspection: -- sklearn/inspection/**/* - -module:linear_model: -- sklearn/linear_model/**/* - -module:manifold: -- sklearn/manifold/**/* - -module:metrics: -- sklearn/metrics/**/* - -module:mixture: -- sklearn/mixture/**/* - -module:model_selection: -- sklearn/model_selection/**/* - -module:naive_bayes: -- sklearn/naive_bayes.py - -module:neighbors: -- sklearn/neighbors/**/* - -module:neural_network: -- sklearn/neural_network/**/* - -module:pipeline: -- sklearn/pipeline.py - -module:preprocessing: -- sklearn/preprocessing/**/* - 
-module:semi_supervised: -- sklearn/semi_supervised/**/* - -module:svm: -- sklearn/svm/**/* - -module:tree: -- sklearn/tree/**/* - -module:utils: -- sklearn/utils/**/* - -Build / CI: -- build_tools/**/* -- .github/**/* -- maint_tools/**/* -- azure-pipelines.yml -- .travis.yml -- pyproject.toml diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index d60fa0c1f10c0..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request - -jobs: - triage: - runs-on: ubuntu-latest - steps: - - uses: thomasjpfan/labeler@v2.2.0 - with: - repo-token: "${{ secrets.GITHUB_TOKEN }}" - max-labels: '3' From e8928e1f11791a86b3e9e63031193edde71eacb9 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 28 Feb 2020 10:51:43 +0100 Subject: [PATCH 404/448] ENH Include verbose message when sample_weight is provided. (#16564) --- sklearn/ensemble/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 07e2afe6a27b5..41e50f7d5a7fc 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -25,7 +25,8 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None, """Private function used to fit an estimator within a job.""" if sample_weight is not None: try: - estimator.fit(X, y, sample_weight=sample_weight) + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y, sample_weight=sample_weight) except TypeError as exc: if "unexpected keyword argument 'sample_weight'" in str(exc): raise TypeError( From b3e01389353260f3305a1216cb57e954da75fcd2 Mon Sep 17 00:00:00 2001 From: Arunav Konwar Date: Fri, 28 Feb 2020 16:59:07 +0530 Subject: [PATCH 405/448] DOC Remove 3.5 from Python badge in README (#16583) --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 96bc81fa3822d..fa0b665bbc8dd 100644 --- a/README.rst +++ b/README.rst @@ -14,8 +14,8 @@ .. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/master.svg?style=shield&circle-token=:circle-token .. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn -.. |PythonVersion| image:: https://img.shields.io/pypi/pyversions/scikit-learn.svg -.. _PythonVersion: https://img.shields.io/pypi/pyversions/scikit-learn.svg +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue +.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue .. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg .. _PyPi: https://badge.fury.io/py/scikit-learn From b6bbf58f0c0b6cc7be219acfe8eec8626bd04842 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 07:30:27 -0500 Subject: [PATCH 406/448] DOC Updated gradient boosting UG (#16178) * Updated gradient boosting UG * maybe fix sphinx warnings * Addressed comments from Guillaume --- doc/modules/ensemble.rst | 157 +++++++++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 48 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 3d6798e0d548b..b7c0e49f9c477 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -489,6 +489,10 @@ trees. in this setting. +The usage and the parameters of :class:`GradientBoostingClassifier` and +:class:`GradientBoostingRegressor` are described below. The 2 most important +parameters of these estimators are `n_estimators` and `learning_rate`. 
+ Classification --------------- @@ -509,7 +513,13 @@ with 100 decision stumps as weak learners:: >>> clf.score(X_test, y_test) 0.913... -The number of weak learners (i.e. regression trees) is controlled by the parameter ``n_estimators``; :ref:`The size of each tree ` can be controlled either by setting the tree depth via ``max_depth`` or by setting the number of leaf nodes via ``max_leaf_nodes``. The ``learning_rate`` is a hyper-parameter in the range (0.0, 1.0] that controls overfitting via :ref:`shrinkage ` . +The number of weak learners (i.e. regression trees) is controlled by the +parameter ``n_estimators``; :ref:`The size of each tree +` can be controlled either by setting the tree +depth via ``max_depth`` or by setting the number of leaf nodes via +``max_leaf_nodes``. The ``learning_rate`` is a hyper-parameter in the range +(0.0, 1.0] that controls overfitting via :ref:`shrinkage +` . .. note:: @@ -615,65 +625,118 @@ chapter on gradient boosting in [F2001]_ and is related to the parameter Mathematical formulation ------------------------- -GBRT considers additive models of the following form: +We first present GBRT for regression, and then detail the classification +case. + +Regression +^^^^^^^^^^ + +GBRT regressors are additive models whose prediction :math:`y_i` for a +given input :math:`x_i` is of the following form: .. math:: - F(x) = \sum_{m=1}^{M} \gamma_m h_m(x) + \hat{y_i} = F_M(x_i) = \sum_{m=1}^{M} h_m(x_i) -where :math:`h_m(x)` are the basis functions which are usually called -*weak learners* in the context of boosting. Gradient Tree Boosting -uses :ref:`decision trees ` of fixed size as weak -learners. Decision trees have a number of abilities that make them -valuable for boosting, namely the ability to handle data of mixed type -and the ability to model complex functions. +where the :math:`h_m` are estimators called *weak learners* in the context +of boosting. Gradient Tree Boosting uses :ref:`decision tree regressors +` of fixed size as weak learners. The constant M corresponds to the +`n_estimators` parameter. -Similar to other boosting algorithms, GBRT builds the additive model in -a greedy fashion: +Similar to other boosting algorithms, a GBRT is built in a greedy fashion: .. math:: - F_m(x) = F_{m-1}(x) + \gamma_m h_m(x), + F_m(x) = F_{m-1}(x) + h_m(x), -where the newly added tree :math:`h_m` tries to minimize the loss :math:`L`, -given the previous ensemble :math:`F_{m-1}`: +where the newly added tree :math:`h_m` is fitted in order to minimize a sum +of losses :math:`L_m`, given the previous ensemble :math:`F_{m-1}`: .. math:: - h_m = \arg\min_{h} \sum_{i=1}^{n} L(y_i, - F_{m-1}(x_i) + h(x_i)). + h_m = \arg\min_{h} L_m = \arg\min_{h} \sum_{i=1}^{n} + l(y_i, F_{m-1}(x_i) + h(x_i)), -The initial model :math:`F_{0}` is problem specific, for least-squares -regression one usually chooses the mean of the target values. +where :math:`l(y_i, F(x_i))` is defined by the `loss` parameter, detailed +in the next section. -.. note:: The initial model can also be specified via the ``init`` - argument. The passed object has to implement ``fit`` and ``predict``. +By default, the initial model :math:`F_{0}` is chosen as the constant that +minimizes the loss: for a least-squares loss, this is the empirical mean of +the target values. The initial model can also be specified via the ``init`` +argument. 
-Gradient Boosting attempts to solve this minimization problem -numerically via steepest descent: The steepest descent direction is -the negative gradient of the loss function evaluated at the current -model :math:`F_{m-1}` which can be calculated for any differentiable -loss function: +Using a first-order Taylor approximation, the value of :math:`l` can be +approximated as follows: .. math:: - F_m(x) = F_{m-1}(x) - \gamma_m \sum_{i=1}^{n} \nabla_F L(y_i, - F_{m-1}(x_i)) + l(y_i, F_{m-1}(x_i) + h_m(x_i)) \approx + l(y_i, F_{m-1}(x_i)) + + h_m(x_i) + \left[ \frac{\partial l(y_i, F(x_i))}{\partial F(x_i)} \right]_{F=F_{m - 1}}. + +.. note:: + + Briefly, a first-order Taylor approximation says that + :math:`l(z) \approx l(a) + (z - a) \frac{\partial l(a)}{\partial a}`. + Here, :math:`z` corresponds to :math:`F_{m - 1}(x_i) + h_m(x_i)`, and + :math:`a` corresponds to :math:`F_{m-1}(x_i)` + +The quantity :math:`\left[ \frac{\partial l(y_i, F(x_i))}{\partial F(x_i)} +\right]_{F=F_{m - 1}}` is the derivative of the loss with respect to its +second parameter, evaluated at :math:`F_{m-1}(x)`. It is easy to compute for +any given :math:`F_{m - 1}(x_i)` in a closed form since the loss is +differentiable. We will denote it by :math:`g_i`. -Where the step length :math:`\gamma_m` is chosen using line search: +Removing the constant terms, we have: .. math:: - \gamma_m = \arg\min_{\gamma} \sum_{i=1}^{n} L(y_i, F_{m-1}(x_i) - - \gamma \frac{\partial L(y_i, F_{m-1}(x_i))}{\partial F_{m-1}(x_i)}) + h_m \approx \arg\min_{h} \sum_{i=1}^{n} h(x_i) g_i + +This is minimized if :math:`h(x_i)` is fitted to predict a value that is +proportional to the negative gradient :math:`-g_i`. Therefore, at each +iteration, **the estimator** :math:`h_m` **is fitted to predict the negative +gradients of the samples**. The gradients are updated at each iteration. +This can be considered as some kind of gradient descent in a functional +space. + +.. note:: + + For some losses, e.g. the least absolute deviation (LAD) where the gradients + are :math:`\pm 1`, the values predicted by a fitted :math:`h_m` are not + accurate enough: the tree can only output integer values. As a result, the + leaves values of the tree :math:`h_m` are modified once the tree is + fitted, such that the leaves values minimize the loss :math:`L_m`. The + update is loss-dependent: for the LAD loss, the value of a leaf is updated + to the median of the samples in that leaf. -The algorithms for regression and classification -only differ in the concrete loss function used. +Classification +^^^^^^^^^^^^^^ + +Gradient boosting for classification is very similar to the regression case. +However, the sum of the trees :math:`F_M(x_i) = \sum_m h_m(x_i)` is not +homogeneous to a prediction: it cannot be a class, since the trees predict +continuous values. + +The mapping from the value :math:`F_M(x_i)` to a class or a probability is +loss-dependent. For the deviance (or log-loss), the probability that +:math:`x_i` belongs to the positive class is modeled as :math:`p(y_i = 1 | +x_i) = \sigma(F_M(x_i))` where :math:`\sigma` is the sigmoid function. + +For multiclass classification, K trees (for K classes) are built at each of +the :math:`M` iterations. The probability that :math:`x_i` belongs to class +k is modeled as a softmax of the :math:`F_{M,k}(x_i)` values. + +Note that even for a classification task, the :math:`h_m` sub-estimator is +still a regressor, not a classifier. 
This is because the sub-estimators are +trained to predict (negative) *gradients*, which are always continuous +quantities. .. _gradient_boosting_loss: Loss Functions -............... +-------------- The following loss functions are supported and can be specified using the parameter ``loss``: @@ -713,20 +776,17 @@ the parameter ``loss``: examples than ``'deviance'``; can only be used for binary classification. -Regularization ----------------- - .. _gradient_boosting_shrinkage: -Shrinkage -.......... +Shrinkage via learning rate +--------------------------- [F2001]_ proposed a simple regularization strategy that scales -the contribution of each weak learner by a factor :math:`\nu`: +the contribution of each weak learner by a constant factor :math:`\nu`: .. math:: - F_m(x) = F_{m-1}(x) + \nu \gamma_m h_m(x) + F_m(x) = F_{m-1}(x) + \nu h_m(x) The parameter :math:`\nu` is also called the **learning rate** because it scales the step length the gradient descent procedure; it can @@ -743,7 +803,7 @@ stopping. For a more detailed discussion of the interaction between ``learning_rate`` and ``n_estimators`` see [R2007]_. Subsampling -............ +----------- [F1999]_ proposed stochastic gradient boosting, which combines gradient boosting with bootstrap averaging (bagging). At each iteration @@ -787,8 +847,8 @@ is too time consuming. * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_oob.py` * :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py` -Interpretation --------------- +Interpretation with feature importance +-------------------------------------- Individual decision trees can be interpreted easily by simply visualizing the tree structure. Gradient boosting models, however, @@ -797,9 +857,6 @@ interpreted by visual inspection of the individual trees. Fortunately, a number of techniques have been proposed to summarize and interpret gradient boosting models. -Feature importance -.................. - Often features do not contribute equally to predict the target response; in many situations the majority of the features are in fact irrelevant. @@ -827,6 +884,10 @@ accessed via the ``feature_importances_`` property:: >>> clf.feature_importances_ array([0.10..., 0.10..., 0.11..., ... +Note that this computation of feature importance is based on entropy, and it +is distinct from :func:`sklearn.inspection.permutation_importance` which is +based on permutation of the features. + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` @@ -1089,7 +1150,7 @@ based on the ascending sort order. E.g., in the following scenario the class label 1 will be assigned to the sample. Usage -..... +----- The following example shows how to fit the majority rule classifier:: @@ -1206,7 +1267,7 @@ hyperparameters of the individual estimators:: >>> grid = grid.fit(iris.data, iris.target) Usage -..... 
+----- In order to predict the class labels based on the predicted class-probabilities (scikit-learn estimators in the VotingClassifier From b14d420f1b4583220a310dda9b076c9e00292f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 28 Feb 2020 13:59:30 +0100 Subject: [PATCH 407/448] move 0.22.2 what's new entries (#16586) --- doc/whats_new/v0.22.rst | 42 +++++++++++++++++++++++++++++++++++++++++ doc/whats_new/v0.23.rst | 26 ------------------------- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 399f6352410e9..381594219d597 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -2,6 +2,48 @@ .. currentmodule:: sklearn +.. _changes_0_22_2: + +Version 0.22.2 +============== + +**February 28 2020** + +Changelog +--------- + +:mod:`sklearn.impute` +..................... + +- |Efficiency| Reduce :func:`impute.KNNImputer` asymptotic memory usage by + chunking pairwise distance computation. + :pr:`16397` by `Joel Nothman`_. + +:mod:`sklearn.metrics` +...................... + +- |Fix| Fixed a bug in :func:`metrics.plot_roc_curve` where + the name of the estimator was passed in the :class:`metrics.RocCurveDisplay` + instead of the parameter `name`. It results in a different plot when calling + :meth:`metrics.RocCurveDisplay.plot` for the subsequent times. + :pr:`16500` by :user:`Guillaume Lemaitre `. + +- |Fix| Fixed a bug in :func:`metrics.plot_precision_recall_curve` where the + name of the estimator was passed in the + :class:`metrics.PrecisionRecallDisplay` instead of the parameter `name`. It + results in a different plot when calling + :meth:`metrics.PrecisionRecallDisplay.plot` for the subsequent times. + :pr:`#16505` by :user:`Guillaume Lemaitre `. + +:mod:`sklearn.neighbors` +.............................. + +- |Fix| Fix a bug which converted a list of arrays into a 2-D object + array instead of a 1-D array containing NumPy arrays. This bug + was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. + :pr:`16076` by :user:`Guillaume Lemaitre ` and + :user:`Alex Shacked `. + .. _changes_0_22_1: Version 0.22.1 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 027ca62ccd853..368d92a012097 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -206,10 +206,6 @@ Changelog :mod:`sklearn.impute` ..................... -- |Efficiency| Reduce :func:`impute.KNNImputer` asymptotic memory usage by - chunking pairwise distance computation. - :pr:`16397` by `Joel Nothman`_. - - |Enhancement| :class:`impute.IterativeImputer` accepts both scalar and array-like inputs for ``max_value`` and ``min_value``. Array-like inputs allow a different max and min to be specified for each feature. :pr:`16403` by :user:`Narendra Mukherjee `. @@ -278,25 +274,12 @@ Changelog - |Fix| Fixed a bug in :func:`metrics.mutual_info_score` where negative scores could be returned. :pr:`16362` by `Thomas Fan`_. -- |Fix| Fixed a bug in :func:`metrics.plot_precision_recall_curve` where the - name of the estimator was passed in the - :class:`metrics.PrecisionRecallDisplay` instead of the parameter `name`. It - results in a different plot when calling - :meth:`metrics.PrecisionRecallDisplay.plot` for the subsequent times. - :pr:`#16505` by :user:`Guillaume Lemaitre `. - - |Fix| Fixed a bug in :func:`metrics.confusion_matrix` that would raise an error when `y_true` and `y_pred` were length zero and `labels` was not `None`. 
In addition, we raise an error when an empty list is given to the `labels` parameter. :pr:`16442` by `Kyle Parsons `. -- |Fix| Fixed a bug in :func:`metrics.plot_roc_curve` where - the name of the estimator was passed in the :class:`metrics.RocCurveDisplay` - instead of the parameter `name`. It results in a different plot when calling - :meth:`metrics.RocCurveDisplay.plot` for the subsequent times. - :pr:`16500` by :user:`Guillaume Lemaitre `. - :mod:`sklearn.model_selection` .............................. @@ -329,15 +312,6 @@ Changelog differs between `predict` and `fit`. :pr:`16090` by :user:`Madhura Jayaratne `. -:mod:`sklearn.neighbors` -.............................. - -- |Fix| Fix a bug which converted a list of arrays into a 2-D object - array instead of a 1-D array containing NumPy arrays. This bug - was affecting :meth:`neighbors.NearestNeighbors.radius_neighbors`. - :pr:`16076` by :user:`Guillaume Lemaitre ` and - :user:`Alex Shacked `. - :mod:`sklearn.neural_network` ............................. From 5009d113474b163d409822a5cac036b448a9dc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 28 Feb 2020 15:01:22 +0100 Subject: [PATCH 408/448] DOC remind to update the version in maintainers guide (#16589) --- doc/developers/maintainer.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index f38514ec075b7..2a42bee301554 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -83,6 +83,8 @@ master!) with all the desired changes:: $ git rebase -i upstream/0.999.2 +Do not forget to add a commit updating sklearn.__version__. + It's nice to have a copy of the ``git rebase -i`` log in the PR to help others understand what's included. 
From d205638475ca542dc46862652e3bb0be663a8eac Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 09:05:11 -0500 Subject: [PATCH 409/448] MNT Introduction of n_features_in_ attr with _validate_data mtd (#16112) --- doc/developers/develop.rst | 11 +++ doc/whats_new/v0.23.rst | 10 ++- sklearn/base.py | 68 ++++++++++++++ sklearn/calibration.py | 4 +- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_agglomerative.py | 13 ++- sklearn/cluster/_bicluster.py | 2 +- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_dbscan.py | 2 +- sklearn/cluster/_kmeans.py | 10 ++- sklearn/cluster/_mean_shift.py | 2 +- sklearn/cluster/_optics.py | 2 +- sklearn/cluster/_spectral.py | 4 +- sklearn/cluster/tests/test_bicluster.py | 11 +++ sklearn/compose/_column_transformer.py | 3 + sklearn/compose/_target.py | 15 ++++ .../compose/tests/test_column_transformer.py | 12 +++ sklearn/covariance/_empirical_covariance.py | 2 +- sklearn/covariance/_graph_lasso.py | 6 +- sklearn/covariance/_robust_covariance.py | 2 +- sklearn/covariance/_shrunk_covariance.py | 6 +- sklearn/cross_decomposition/_pls.py | 8 +- sklearn/decomposition/_dict_learning.py | 8 +- sklearn/decomposition/_factor_analysis.py | 2 +- sklearn/decomposition/_fastica.py | 8 +- sklearn/decomposition/_incremental_pca.py | 4 +- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/decomposition/_lda.py | 34 +++++-- sklearn/decomposition/_nmf.py | 4 +- sklearn/decomposition/_pca.py | 4 +- sklearn/decomposition/_sparse_pca.py | 4 +- sklearn/decomposition/_truncated_svd.py | 4 +- .../decomposition/tests/test_dict_learning.py | 6 ++ sklearn/discriminant_analysis.py | 6 +- sklearn/dummy.py | 3 + sklearn/ensemble/_bagging.py | 6 +- sklearn/ensemble/_forest.py | 8 +- sklearn/ensemble/_gb.py | 3 +- .../gradient_boosting.py | 3 +- sklearn/ensemble/_stacking.py | 1 + sklearn/ensemble/_voting.py | 15 ++++ sklearn/ensemble/_weight_boosting.py | 49 ++++------ sklearn/ensemble/tests/test_voting.py | 20 +++++ .../tests/test_dict_vectorizer.py | 9 ++ sklearn/feature_extraction/tests/test_text.py | 12 +++ sklearn/feature_selection/_from_model.py | 15 ++++ sklearn/feature_selection/_rfe.py | 17 ++-- .../_univariate_selection.py | 3 +- .../feature_selection/_variance_threshold.py | 5 +- sklearn/gaussian_process/_gpc.py | 8 +- sklearn/gaussian_process/_gpr.py | 8 +- sklearn/impute/_base.py | 23 ++--- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 5 +- sklearn/kernel_approximation.py | 8 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 4 +- sklearn/linear_model/_bayes.py | 6 +- sklearn/linear_model/_coordinate_descent.py | 21 +++-- sklearn/linear_model/_huber.py | 2 +- sklearn/linear_model/_least_angle.py | 6 +- sklearn/linear_model/_logistic.py | 11 +-- sklearn/linear_model/_omp.py | 6 +- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_ridge.py | 22 ++--- sklearn/linear_model/_stochastic_gradient.py | 10 ++- sklearn/linear_model/_theil_sen.py | 2 +- sklearn/manifold/_isomap.py | 2 +- sklearn/manifold/_locally_linear.py | 2 +- sklearn/manifold/_mds.py | 2 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 9 +- sklearn/mixture/_base.py | 1 + sklearn/model_selection/_search.py | 14 +++ sklearn/model_selection/tests/test_search.py | 19 ++++ sklearn/multiclass.py | 18 +++- sklearn/multioutput.py | 6 +- sklearn/naive_bayes.py | 7 +- sklearn/neighbors/_base.py | 3 +- sklearn/neighbors/_kde.py | 2 +- sklearn/neighbors/_nca.py | 2 +- sklearn/neighbors/_nearest_centroid.py | 4 +- 
.../neural_network/_multilayer_perceptron.py | 8 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/pipeline.py | 10 +++ sklearn/preprocessing/_data.py | 90 +++++++++++-------- sklearn/preprocessing/_discretization.py | 2 +- .../preprocessing/_function_transformer.py | 2 +- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/_label_propagation.py | 2 +- sklearn/svm/_base.py | 6 +- sklearn/svm/_classes.py | 12 +-- sklearn/tests/test_dummy.py | 11 +++ sklearn/tests/test_pipeline.py | 45 ++++++++++ sklearn/tree/_classes.py | 2 +- sklearn/utils/estimator_checks.py | 40 +++++++++ sklearn/utils/tests/test_estimator_checks.py | 55 ++++++------ 97 files changed, 703 insertions(+), 285 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index a9ebaaced0672..96aa942fb9238 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -226,6 +226,17 @@ the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically, the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split`` to slice rows and columns. +Universal attributes +^^^^^^^^^^^^^^^^^^^^ + +Estimators that expect tabular input should set a `n_features_in_` +attribute at `fit` time to indicate the number of features that the estimator +expects for subsequent calls to `predict` or `transform`. +See +`SLEP010 +`_ +for details. + .. _rolling_your_own_estimator: Rolling your own estimator diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 368d92a012097..89622c52c5041 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,7 +14,6 @@ Version 0.23.0 Put the changes in their relevant module. - Changed models -------------- @@ -378,3 +377,12 @@ Changelog - |Fix| :class:`cluster.AgglomerativeClustering` add specific error when distance matrix is not square and `affinity=precomputed`. :pr:`16257` by :user:`Simona Maggio `. + +Miscellaneous +............. + +- |API| Most estimators now expose a `n_features_in_` attribute. This + attribute is equal to the number of features passed to the `fit` method. + See `SLEP010 + `_ + for details. :pr:`16112` by `Nicolas Hug`_. diff --git a/sklearn/base.py b/sklearn/base.py index e56e13872bffb..70dec8c030418 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -18,6 +18,8 @@ from . import __version__ from .utils import _IS_32BIT +from .utils.validation import check_X_y +from .utils.validation import check_array _DEFAULT_TAGS = { 'non_deterministic': False, @@ -343,6 +345,72 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags + def _check_n_features(self, X, reset): + """Set the `n_features_in_` attribute, or check against it. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. + reset : bool + If True, the `n_features_in_` attribute is set to `X.shape[1]`. + Else, the attribute must already exist and the function checks + that it is equal to `X.shape[1]`. + """ + n_features = X.shape[1] + + if reset: + self.n_features_in_ = n_features + else: + if not hasattr(self, 'n_features_in_'): + raise RuntimeError( + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" 
+ ) + if n_features != self.n_features_in_: + raise ValueError( + 'X has {} features, but this {} is expecting {} features ' + 'as input.'.format(n_features, self.__class__.__name__, + self.n_features_in_) + ) + + def _validate_data(self, X, y=None, reset=True, **check_params): + """Validate input data and set or check the `n_features_in_` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. + reset : bool, default=True + Whether to reset the `n_features_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. + **check_params : kwargs + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. + + Returns + ------- + out : {ndarray, sparse matrix} or tuple of these + The validated input. A tuple is returned if `y` is not None. + """ + + if y is None: + X = check_array(X, **check_params) + out = X + else: + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get('ensure_2d', True): + self._check_n_features(X, reset=reset) + + return out + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index ff9c4b3e75c44..a5490efa28c0a 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -124,8 +124,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. """ - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 5f9ce02d869b7..9516c8e4bdd05 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -374,7 +374,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = check_array(X, accept_sparse=accept_sparse) + X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 6bc5d30adee2b..182ae4b481116 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -809,7 +809,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_data(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: @@ -1055,9 +1055,14 @@ def fit(self, X, y=None, **params): ------- self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) - return AgglomerativeClustering.fit(self, X.T, **params) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) + # save n_features_in_ attribute here to reset it after, because it will + # be overridden in AgglomerativeClustering since we passed it X.T. 
+ n_features_in_ = self.n_features_in_ + AgglomerativeClustering.fit(self, X.T, **params) + self.n_features_in_ = n_features_in_ + return self @property def fit_predict(self): diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 356072f2d33aa..c98272d6aae33 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -121,7 +121,7 @@ def fit(self, X, y=None): warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" " removed in 0.25.", FutureWarning) - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 68c07cc5a2860..1d81dafc7504d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -463,7 +463,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = check_array(X, accept_sparse='csr', copy=self.copy) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 92a2b0f716ac7..6a33f411886b0 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -308,7 +308,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') if not self.eps > 0.0: raise ValueError("eps must be positive.") diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index efe4c9fad23b3..27ec0e5f388f6 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -975,8 +975,10 @@ def fit(self, X, y=None, sample_weight=None): ' got %d instead' % self.max_iter ) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order='C', copy=self.copy_x, accept_large_sparse=False) + X = self._validate_data(X, accept_sparse='csr', + dtype=[np.float64, np.float32], + order='C', copy=self.copy_x, + accept_large_sparse=False) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( @@ -1591,8 +1593,8 @@ def fit(self, X, y=None, sample_weight=None): self """ random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 5245a187bbc86..dc90967ebe5dc 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -368,7 +368,7 @@ def fit(self, X, y=None): y : Ignored """ - X = check_array(X) + X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 8e68b45d9a369..92322b0ab0bfd 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -245,7 +245,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. 
""" - X = check_array(X, dtype=np.float) + X = self._validate_data(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 0f5f816a993cc..2faddabefa157 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -474,8 +474,8 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) allow_squared = self.affinity in ["precomputed", "precomputed_nearest_neighbors"] if X.shape[0] == X.shape[1] and not allow_squared: diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index a31e61dd2423d..7d5a920600d7d 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -254,6 +254,17 @@ def test_wrong_shape(): model.fit(data) +@pytest.mark.parametrize('est', + (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, 'n_features_in_') + est.fit(X) + assert est.n_features_in_ == 3 + + @pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(klass, n_jobs): diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f5526ec185875..e94757bca6993 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -513,6 +513,8 @@ def fit_transform(self, X, y=None): else: self._feature_names_in = None X = _check_X(X) + # set n_features_in_ attribute + self._check_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -587,6 +589,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') + # TODO: also call _check_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index aad8050cb689e..27f4ef63edf68 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer +from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -235,3 +236,17 @@ def predict(self, X): def _more_tags(self): return {'poor_score': True, 'no_validation': True} + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() returns False the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.regressor_.n_features_in_ diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f6a49f4cd6601..d26f6895427bb 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1187,6 +1187,18 @@ def test_column_transformer_mask_indexing(array_type): assert X_trans.shape == (3, 2) +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. + + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([('a', DoubleTrans(), [0]), + ('b', DoubleTrans(), [1])]) + assert not hasattr(ct, 'n_features_in_') + ct.fit(X) + assert ct.n_features_in_ == 2 + + @pytest.mark.parametrize('cols, pattern, include, exclude', [ (['col_int', 'col_float'], None, np.number, None), (['col_int', 'col_float'], None, None, object), diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 2f617b981ca54..30520b64dc507 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -198,7 +198,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 9dbf786839585..4fd1b01d763b2 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -382,8 +382,8 @@ def fit(self, X, y=None): self : object """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -659,7 +659,7 @@ def fit(self, X, y=None): self : object """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 93624931c4303..586c39cadecbe 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -637,7 +637,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 61907274ad823..b2c43cb3eb3cd 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -140,7 +140,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -412,7 +412,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = check_array(X) + X = self._validate_data(X) if 
self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -562,7 +562,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 125c5946b1562..af81ece6baf58 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -277,8 +277,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -886,8 +886,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index f19305dbfc272..49b78a0916e7a 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1043,6 +1043,10 @@ def fit(self, X, y=None): """ return self + @property + def n_features_in_(self): + return self.components_.shape[1] + class DictionaryLearning(SparseCodingMixin, BaseEstimator): """Dictionary learning @@ -1217,7 +1221,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1424,7 +1428,7 @@ def fit(self, X, y=None): Returns the instance itself. 
""" random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 21bf89ae056d8..7147fd452559c 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -168,7 +168,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, copy=self.copy, dtype=np.float64) + X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 44e665556b805..f9e3a148f6860 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -424,14 +424,12 @@ def _fit(self, X, compute_sources=False): ------- X_new : array-like, shape (n_samples, n_components) """ + + X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args random_state = check_random_state(self.random_state) - # make interface compatible with other decompositions - # a copy is required only for non whitened data - X = check_array(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T - alpha = fun_args.get('alpha', 1.0) if not 1 <= alpha <= 2: raise ValueError('alpha must be in [1,2]') diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index fe7c57c61999a..2a0d19d373dbb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -194,8 +194,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = check_array(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index a8559f341591b..b1f83c8e0ff81 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -275,7 +275,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = check_array(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 48b52df811734..ba68e03a16191 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -467,7 +467,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, whom): + def _check_non_neg_array(self, X, reset_n_features, whom): """check X format check X format and make sure no negative value in X. 
@@ -477,7 +477,8 @@ def _check_non_neg_array(self, X, whom): X : array-like or sparse matrix """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, reset=reset_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -496,13 +497,23 @@ def partial_fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, + first_time = not hasattr(self, 'components_') + + # In theory reset should be equal to `first_time`, but there are tests + # checking the input number of feature and they expect a specific + # string, which is not the same one raised by check_n_features. So we + # don't check n_features_in_ here for now (it's done with adhoc code in + # the estimator anyway). + # TODO: set reset=first_time when addressing reset in + # predict/transform/etc. + reset_n_features = True + X = self._check_non_neg_array(X, reset_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size # initialize parameters or check - if not hasattr(self, 'components_'): + if first_time: self._init_latent_vars(n_features) if n_features != self.components_.shape[1]: @@ -540,7 +551,8 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit") + X = self._check_non_neg_array(X, reset_n_features=True, + whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter evaluate_every = self.evaluate_every @@ -609,7 +621,9 @@ def _unnormalized_transform(self, X): check_is_fitted(self) # make sure feature size is the same in fitted model and in X - X = self._check_non_neg_array(X, "LatentDirichletAllocation.transform") + X = self._check_non_neg_array( + X, reset_n_features=True, + whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: raise ValueError( @@ -733,7 +747,8 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, "LatentDirichletAllocation.score") + X = self._check_non_neg_array(X, reset_n_features=True, + whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) @@ -762,8 +777,9 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, """ check_is_fitted(self) - X = self._check_non_neg_array(X, - "LatentDirichletAllocation.perplexity") + X = self._check_non_neg_array( + X, reset_n_features=True, + whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 1c14a84a990be..86c9acddfea1e 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1275,8 +1275,8 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data. 
""" - X = check_array(X, accept_sparse=('csr', 'csc'), - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=[np.float64, np.float32]) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 41b5bd68fecce..f64a9752896b3 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -386,8 +386,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. See ' 'TruncatedSVD for a possible alternative.') - X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_2d=True, copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index da1bd0dddf529..158bbefc22e92 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -165,7 +165,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -371,7 +371,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index e3bddd23c4de8..940eab56feea8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -157,8 +157,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 9ecc9cbf25598..5f082ffea13ee 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -498,3 +498,9 @@ def test_sparse_coder_parallel_mmap(): sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index da986f900ab8e..a954081b380cb 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -423,8 +423,8 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values. 
""" - X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_data(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -645,7 +645,7 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values (integers) """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6fb9b21711930..daa2c1ff0da11 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -156,6 +156,8 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] + self.n_features_in_ = None # No input validation is done for X + check_consistent_length(X, y) if sample_weight is not None: @@ -489,6 +491,7 @@ def fit(self, X, y, sample_weight=None): % (self.strategy, allowed_strategies)) y = check_array(y, ensure_2d=False) + self.n_features_in_ = None # No input validation is done for X if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8533c84ef5e88..d73f38954d21a 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -278,9 +278,9 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = check_X_y( - X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, - multi_output=True + X, y = self._validate_data( + X, y, accept_sparse=['csr', 'csc'], dtype=None, + force_all_finite=False, multi_output=True ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=None) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 0b1c1c1c73686..8070d0cd0eaff 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -292,13 +292,13 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - # Validate and convert input data + # Validate or convert input data if issparse(y): raise ValueError( "sparse multilabel-indicator for y is not supported." - ) - X = check_array(X, accept_sparse="csc", dtype=DTYPE) - y = check_array(y, ensure_2d=False, dtype=None) + ) + X, y = self._validate_data(X, y, multi_output=True, + accept_sparse="csc", dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index ef76e3ba97bd7..4a458359437b3 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -411,7 +411,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. 
- X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 97fcdb4713802..e18d3ac4b1f9b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -106,7 +106,8 @@ def fit(self, X, y, sample_weight=None): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. - X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) + X, y = self._validate_data(X, y, dtype=[X_DTYPE], + force_all_finite=False) y = self._encode_y(y) check_consistent_length(X, y) # Do not create unit sample weights by default to later skip some diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 6b206dfbe3d02..a5fedc6431344 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -140,6 +140,7 @@ def fit(self, X, y, sample_weight=None): delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop' ) + self.n_features_in_ = self.estimators_[0].n_features_in_ self.named_estimators_ = Bunch() est_fitted_idx = 0 diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index f79870a2c6891..d01647a5e4448 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,6 +30,7 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..exceptions import NotFittedError class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @@ -88,6 +89,20 @@ def fit(self, X, y, sample_weight=None): return self + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimators_[0].n_features_in_ + class VotingClassifier(ClassifierMixin, _BaseVoting): """Soft Voting/Majority Rule classifier for unfitted estimators. diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 2908b888e7c91..de73858f4bb3f 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -71,25 +71,9 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X, y=None): - - # Accept or convert to these sparse matrix formats so we can - # use _safe_indexing - accept_sparse = ['csr', 'csc'] - if y is None: - ret = check_array(X, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None) - else: - ret = check_X_y(X, y, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) - return ret + def _check_X(self, X): + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, + allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
@@ -116,7 +100,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_data(X, y) + X, y = self._validate_data(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) sample_weight = _check_sample_weight(sample_weight, X, np.float64) sample_weight /= sample_weight.sum() @@ -229,7 +218,7 @@ def staged_score(self, X, y, sample_weight=None): ------ z : float """ - X = self._validate_data(X) + X = self._check_X(X) for y_pred in self.staged_predict(X): if is_classifier(self): @@ -637,7 +626,7 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) pred = self.decision_function(X) @@ -667,7 +656,7 @@ def staged_predict(self, X): y : generator of ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_ @@ -701,7 +690,7 @@ def decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -744,7 +733,7 @@ def staged_decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -813,7 +802,7 @@ def predict_proba(self, X): outputs is the same of that of the :term:`classes_` attribute. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -847,7 +836,7 @@ def staged_predict_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -873,7 +862,7 @@ def predict_log_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) return np.log(self.predict_proba(X)) @@ -1151,7 +1140,7 @@ def predict(self, X): The predicted regression values. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) return self._get_median_predict(X, len(self.estimators_)) @@ -1176,7 +1165,7 @@ def staged_predict(self, X): The predicted regression values. 
""" check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) for i, _ in enumerate(self.estimators_, 1): yield self._get_median_predict(X, limit=i) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 61b106cbeedff..599f62366b51b 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -514,6 +514,26 @@ def test_check_estimators_voting_estimator(estimator): check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) +@pytest.mark.parametrize( + "est", + [VotingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))]), + VotingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))])], + ids=['VotingRegressor', 'VotingClassifier'] +) +def test_n_features_in(est): + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, 'n_features_in_') + est.fit(X, y) + assert est.n_features_in_ == 2 + + @pytest.mark.parametrize( "estimator", [VotingRegressor( diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 7e7481a369646..22a7402908cf1 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -110,3 +110,12 @@ def test_deterministic_vocabulary(): v_2 = DictVectorizer().fit([d_shuffled]) assert v_1.vocabulary_ == v_2.vocabulary_ + + +def test_n_features_in(): + # For vectorizers, n_features_in_ does not make sense and does not exist. + dv = DictVectorizer() + assert not hasattr(dv, 'n_features_in_') + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert not hasattr(dv, 'n_features_in_') diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 0b680b2a9e400..86ae2fd6c149e 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1342,6 +1342,18 @@ def test_unused_parameters_warn(Vectorizer, stop_words, vect.fit(train_data) +@pytest.mark.parametrize('Vectorizer, X', ( + (HashingVectorizer, [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]), + (CountVectorizer, JUNK_FOOD_DOCS)) +) +def test_n_features_in(Vectorizer, X): + # For vectorizers, n_features_in_ does not make sense + vectorizer = Vectorizer() + assert not hasattr(vectorizer, 'n_features_in_') + vectorizer.fit(X) + assert not hasattr(vectorizer, 'n_features_in_') + + # TODO: Remove in 0.24 def test_vectorizermixin_is_deprecated(): class MyVectorizer(VectorizerMixin): diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 674127f06acd7..dd72bddc58eb5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -6,6 +6,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..utils.validation import check_is_fitted from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method @@ -254,6 +255,20 @@ def partial_fit(self, X, y=None, **fit_params): self.estimator_.partial_fit(X, y, **fit_params) return self + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. 
+ try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimator_.n_features_in_ + def _more_tags(self): estimator_tags = self.estimator._get_tags() return {'allow_nan': estimator_tags.get('allow_nan', True)} diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 91312c7dc80f9..69e3cc4de9e6c 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -155,9 +155,12 @@ def _fit(self, X, y, step_score=None): # self.scores_ will not be calculated when calling _fit through fit tags = self._get_tags() - X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=not tags.get('allow_nan', True), - multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csc", + ensure_min_features=2, + force_all_finite=not tags.get('allow_nan', True), + multi_output=True + ) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -491,9 +494,11 @@ def fit(self, X, y, groups=None): instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ tags = self._get_tags() - X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=not tags.get('allow_nan', True), - multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", ensure_min_features=2, + force_all_finite=not tags.get('allow_nan', True), + multi_output=True + ) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 21990bb3a8167..221e46f2a505e 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -338,7 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 4f9d720b762b9..6438e6b80dc0a 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -65,8 +65,9 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ('csr', 'csc'), dtype=np.float64, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=np.float64, + force_all_finite='allow-nan') if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 5618ff28128b8..ed8ed2a007a22 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -616,11 +616,11 @@ def fit(self, X, y): self : returns an instance of self. 
""" if self.kernel is None or self.kernel.requires_vector_input: - X, y = check_X_y(X, y, multi_output=False, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=True, dtype="numeric") else: - X, y = check_X_y(X, y, multi_output=False, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 0c1db0d209458..1b48efb39f26d 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -184,11 +184,11 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) if self.kernel_.requires_vector_input: - X, y = check_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=True, dtype="numeric") else: - X, y = check_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=False, dtype=None) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 6ef34b8312e23..bc98778d5c5d8 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -217,7 +217,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.verbose = verbose self.copy = copy - def _validate_input(self, X): + def _validate_input(self, X, in_fit): allowed_strategies = ["mean", "median", "most_frequent", "constant"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " @@ -235,8 +235,10 @@ def _validate_input(self, X): force_all_finite = "allow-nan" try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_data(X, reset=in_fit, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): new_ve = ValueError("Cannot use {} strategy with non-numeric " @@ -269,7 +271,7 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) super()._fit_indicator(X) # default fill_value is 0 for numerical input and "missing_value" @@ -407,7 +409,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) X_indicator = super()._transform_indicator(X) statistics = self.statistics_ @@ -587,13 +589,14 @@ def _get_missing_features_info(self, X): return imputer_mask, features_indices - def _validate_input(self, X): + def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + X = self._validate_data(X, reset=in_fit, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " @@ -628,7 +631,7 @@ def _fit(self, X, y=None): The imputer mask of the original data. 
""" - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -680,7 +683,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index e168892337bec..58a35d157c7a4 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -505,8 +505,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index a34a51b945253..f782a46a6b40d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -178,8 +178,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. Got {}".format(self.n_neighbors)) - X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) super()._fit_indicator(X) _check_weights(self.weights) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index f15bf508f8dad..6ae62ce245a56 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -104,7 +104,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -210,7 +210,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X) + X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -337,7 +337,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') check_non_negative(X, 'X in AdditiveChi2Sampler.fit') if self.sample_interval is None: @@ -556,7 +556,7 @@ def fit(self, X, y=None): X : array-like of shape (n_samples, n_features) Training data. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d8f0e16d6f2b9..21c43979c3b1e 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -157,8 +157,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. 
""" # Convert data - X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, - y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 92383dd82019b..d280f9d0f0d81 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -498,8 +498,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 333ae5494b4e9..c67fc54f43157 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -190,7 +190,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -526,8 +526,8 @@ def fit(self, X, y): ------- self : returns an instance of self. """ - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 87b5be7b650a5..9281d03710455 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -751,9 +751,11 @@ def fit(self, X, y, sample_weight=None, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = check_X_y(X, y, accept_sparse='csc', - order='F', dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1197,8 +1199,8 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - copy=False) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1209,8 +1211,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1831,8 +1834,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. 
""" - X = check_array(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 152055a62c662..1d3a3fcc73421 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y( + X, y = self._validate_data( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0a9a67844a3f3..9f0f62471376a 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -944,7 +944,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1367,7 +1367,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1748,7 +1748,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4b541884eece8..9e84e56ee0284 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1339,8 +1339,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) @@ -1812,9 +1813,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 1fc0a8b69491c..54b751423c933 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = check_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0363032359524..cd5e3db49842d 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -246,7 +246,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index a1f6b89230b94..1a9cb661318e9 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -537,10 +537,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -921,8 +921,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True, - y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1450,8 +1450,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1618,6 +1619,7 @@ def fit(self, X, y, sample_weight=None): self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ return self @@ -1882,8 +1884,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 7fc2126a131b1..df2189b9400cc 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -546,8 +546,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + 
dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1120,8 +1121,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 35cb3b8f25471..a29cc26cdc0a3 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -357,7 +357,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 792c21ce51c2c..8a7fc3f85f425 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -140,13 +140,13 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.metric_params = metric_params def _fit_transform(self, X): - self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs) self.nbrs_.fit(X) + self.n_features_in_ = self.nbrs_.n_features_in_ self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel="precomputed", diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 4854cf228f0ca..7b46d51df718d 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -652,7 +652,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = check_array(X, dtype=float) + X = self._validate_data(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index b7b52344eb21b..ca8c08ed69f98 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -411,7 +411,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = check_array(X) + X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index e885a94eaaded..caac2236e1dd6 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -531,8 +531,8 @@ def fit(self, X, y=None): Returns the instance itself. 
""" - X = check_array(X, accept_sparse='csr', ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 81972dac33d07..d0c9e4e509a73 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -661,11 +661,12 @@ def _fit(self, X, skip_num_points=0): if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.method == 'barnes_hut': - X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 07f3669db27ef..5c09d67f6e63d 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -218,6 +218,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) + self._check_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1c2551a5f600f..55e770d701858 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -566,6 +566,20 @@ def inverse_transform(self, Xt): self._check_is_fitted('inverse_transform') return self.best_estimator_.inverse_transform(Xt) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.best_estimator_.n_features_in_ + @property def classes_(self): self._check_is_fitted("classes_") diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index e9f21f0eb5e8f..49d4b156e0686 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -67,6 +67,8 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier, LinearRegression +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter @@ -1810,6 +1812,23 @@ def get_n_splits(self, *args, **kw): ridge.fit(X[:train_size], y[:train_size]) +def test_n_features_in(): + # make sure grid search and random search delegate n_features_in to the + # best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {'max_iter': [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) + assert not hasattr(gs, 'n_features_in_') + assert not hasattr(rs, 'n_features_in_') + gs.fit(X, y) + rs.fit(X, y) + assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features + + def test_search_cv__pairwise_property_delegated_to_base_estimator(): """ Test implementation of BaseSearchCV has the _pairwise property diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 13dda2f6e6927..9eeb4248f83fd 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -52,6 +52,7 @@ check_classification_targets, _ovr_decision_function) from .utils.metaestimators import _safe_split, if_delegate_has_method +from .exceptions import NotFittedError from joblib import Parallel, delayed @@ -433,6 +434,19 @@ def _pairwise(self): def _first_estimator(self): return self.estimators_[0] + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the OVR estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + return self.estimators_[0].n_features_in_ + def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -521,7 +535,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -762,7 +776,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8b6d4b8384c4d..82edd85472880 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -152,9 +152,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): raise ValueError("The base estimator should implement" " a fit method") - X, y = check_X_y(X, y, - multi_output=True, - accept_sparse=True) + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -423,7 +421,7 @@ def fit(self, X, Y, **fit_params): ------- self : object """ - X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6238aa294530a..c23cb86644e1b 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -203,6 +203,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + X, y = self._validate_data(X, y) y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -472,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return check_X_y(X, y, accept_sparse='csr') + return self._validate_data(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -1154,8 +1155,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = check_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f927c26868a5f..945959ef10d9c 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -396,8 +396,9 @@ def _fit(self, X): if self.effective_metric_ == 'precomputed': X = _check_precomputed(X) + self.n_features_in_ = X.shape[1] else: - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 5b44f6f6b2b75..3404a9768f36a 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -152,7 +152,7 @@ def fit(self, X, y=None, sample_weight=None): Returns instance of object. 
""" algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = check_array(X, order='C', dtype=DTYPE) + X = self._validate_data(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DTYPE) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 131a1bf0b04c1..b9d2de01c958d 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -300,7 +300,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. - X, y = check_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 3eefb7b4fbf58..48712c1fcfb44 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = check_X_y(X, y, ['csc']) + X, y = self._validate_data(X, y, accept_sparse=['csc']) else: - X, y = check_X_y(X, y, ['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 90f9210db5d6b..dbd4c012c8afa 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -943,8 +943,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1352,8 +1352,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 38673eb4ab8bd..06e7cc71bad3c 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -343,7 +343,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d387206288bf5..64d2de70df531 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -628,6 +628,11 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + @property + def n_features_in_(self): + # delegate to first step (which will call _check_is_fitted) + return self.steps[0][1].n_features_in_ + def _name_estimators(estimators): """Generate names for estimators.""" @@ -1000,6 +1005,11 @@ def _update_transformer_list(self, transformers): else next(transformers)) for name, old in self.transformer_list] + @property + def n_features_in_(self): + # X is passed to all transformers so we just delegate to the first one + return self.transformer_list[0][1].n_features_in_ + def make_union(*transformers, **kwargs): """ diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 9ff3723b25550..72ad6bacd43b4 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -368,17 +368,16 @@ def partial_fit(self, X, y=None): raise TypeError("MinMaxScaler does not support sparse input. " "Consider using MaxAbsScaler instead.") - X = check_array(X, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + first_pass = not hasattr(self, 'n_samples_seen_') + X = self._validate_data(X, reset=first_pass, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next steps else: data_min = np.minimum(self.data_min_, data_min) data_max = np.maximum(self.data_max_, data_max) @@ -695,9 +694,9 @@ def partial_fit(self, X, y=None): self : object Transformer instance. """ - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -790,9 +789,10 @@ def transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, reset=False, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: @@ -965,9 +965,11 @@ def partial_fit(self, X, y=None): self : object Transformer instance. 
""" - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + first_pass = not hasattr(self, 'n_samples_seen_') + X = self._validate_data(X, reset=first_pass, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -975,10 +977,8 @@ def partial_fit(self, X, y=None): else: max_abs = np.nanmax(np.abs(X), axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next passes else: max_abs = np.maximum(self.max_abs_, max_abs) self.n_samples_seen_ += X.shape[0] @@ -1196,8 +1196,9 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = check_array(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1505,7 +1506,8 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = check_array(X, accept_sparse=True).shape + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1811,7 +1813,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1945,7 +1947,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -2025,7 +2027,7 @@ def fit(self, K, y=None): self : returns an instance of self. """ - K = check_array(K, dtype=FLOAT_DTYPES) + K = self._validate_data(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2347,7 +2349,7 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - X = self._check_inputs(X, copy=False) + X = self._check_inputs(X, in_fit=True, copy=False) n_samples = X.shape[0] if self.n_quantiles > n_samples: @@ -2438,11 +2440,22 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _check_inputs(self, X, accept_sparse_negative=False, copy=False): + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, + copy=False): """Check inputs before fit and transform""" - X = check_array(X, accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + # In theory reset should be equal to `in_fit`, but there are tests + # checking the input number of feature and they expect a specific + # string, which is not the same one raised by check_n_features. So we + # don't check n_features_in_ here for now (it's done with adhoc code in + # the estimator anyway). + # TODO: set reset=in_fit when addressing reset in + # predict/transform/etc. + reset = True + + X = self._validate_data(X, reset=reset, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. 
with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -2518,7 +2531,7 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, copy=self.copy) + X = self._check_inputs(X, in_fit=False, copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2539,7 +2552,8 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, accept_sparse_negative=True, copy=self.copy) + X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, + copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -2781,7 +2795,8 @@ def fit_transform(self, X, y=None): return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): - X = self._check_input(X, check_positive=True, check_method=True) + X = self._check_input(X, in_fit=True, check_positive=True, + check_method=True) if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace @@ -2823,7 +2838,8 @@ def transform(self, X): The transformed data. """ check_is_fitted(self) - X = self._check_input(X, check_positive=True, check_shape=True) + X = self._check_input(X, in_fit=False, check_positive=True, + check_shape=True) transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform @@ -2869,7 +2885,7 @@ def inverse_transform(self, X): The original data """ check_is_fitted(self) - X = self._check_input(X, check_shape=True) + X = self._check_input(X, in_fit=False, check_shape=True) if self.standardize: X = self._scaler.inverse_transform(X) @@ -2974,7 +2990,7 @@ def _neg_log_likelihood(lmbda): # choosing bracket -2, 2 like for boxcox return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) - def _check_input(self, X, check_positive=False, check_shape=False, + def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2992,8 +3008,8 @@ def _check_input(self, X, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, - force_all_finite='allow-nan') + X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 5a73bf5c7f845..67641601e06f5 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -137,7 +137,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, dtype='numeric') + X = self._validate_data(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 4aceaa08100f2..85ce3a1f845c1 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -92,7 +92,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return check_array(X, accept_sparse=self.accept_sparse) + return self._validate_data(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 311a254239cf0..d18f3bf846901 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -354,7 +354,7 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = self._validate_data(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index d40b5008db33c..a84a9950aa3ac 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -223,7 +223,7 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 1f81987f78c52..662a4ffa24678 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -147,9 +147,9 @@ def fit(self, X, y, sample_weight=None): if callable(self.kernel): check_consistent_length(X, y) else: - X, y = check_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 1d96900555400..e04b904e294b2 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -217,9 +217,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -395,9 +395,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + 
accept_large_sparse=False) penalty = 'l2' # SVR only accepts l2 penalty self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( X, y, self.C, self.fit_intercept, self.intercept_scaling, diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 0d4addb48e64d..38abb0b158fd3 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -756,6 +756,17 @@ def test_dtype_of_classifier_probas(strategy): assert probas.dtype == np.float64 +@pytest.mark.filterwarnings("ignore:The default value of strategy.*") # 0.24 +@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier)) +def test_n_features_in_(Dummy): + X = [[1, 2]] + y = [0] + d = Dummy() + assert not hasattr(d, 'n_features_in_') + d.fit(X, y) + assert d.n_features_in_ is None + + @pytest.mark.parametrize("Dummy", (DummyRegressor, DummyClassifier)) def test_outputs_2d_deprecation(Dummy): X = [[1, 2]] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index bd9246269f0f4..b9c2e26abac61 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -34,6 +34,8 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier iris = load_iris() @@ -1161,6 +1163,49 @@ def test_verbose(est, method, pattern, capsys): assert re.match(pattern, capsys.readouterr().out) +def test_n_features_in_pipeline(): + # make sure pipelines delegate n_features_in to the first step + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + assert not hasattr(pipe, 'n_features_in_') + pipe.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the pipeline also + # has it, even though it isn't fitted. + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + ss.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + assert not hasattr(gbdt, 'n_features_in_') + + +def test_n_features_in_feature_union(): + # make sure FeatureUnion delegates n_features_in to the first transformer + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + fu = make_union(ss) + assert not hasattr(fu, 'n_features_in_') + fu.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the feature_union + # also has it, even though it isn't fitted. 
+ ss = StandardScaler() + fu = make_union(ss) + ss.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 + + def test_feature_union_fit_params(): # Regression test for issue: #15117 class Dummy(TransformerMixin, BaseEstimator): diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 41d09ead9aec4..09481aefeed41 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -146,7 +146,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, raise ValueError("ccp_alpha must be greater than or equal to 0") if check_input: - X = check_array(X, dtype=DTYPE, accept_sparse="csc") + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d376f1edb3097..21060b3462520 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -281,6 +281,8 @@ def _yield_all_checks(name, estimator): yield check_dict_unchanged yield check_dont_overwrite_parameters yield check_fit_idempotent + if not tags["no_validation"]: + yield check_n_features_in if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2906,3 +2908,41 @@ def check_fit_idempotent(name, estimator_orig): atol=max(tol, 1e-9), rtol=max(tol, 1e-7), err_msg="Idempotency check failed for method {}".format(method) ) + + +def check_n_features_in(name, estimator_orig): + # Make sure that n_features_in_ attribute doesn't exist until fit is + # called, and that its value is correct. + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + if 'warm_start' in estimator.get_params(): + estimator.set_params(warm_start=False) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = _pairwise_estimator_convert_X(X, estimator) + if is_regressor(estimator_orig): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = _enforce_estimator_tags_y(estimator, y) + + assert not hasattr(estimator, 'n_features_in_') + estimator.fit(X, y) + if hasattr(estimator, 'n_features_in_'): + assert estimator.n_features_in_ == X.shape[1] + else: + warnings.warn( + "As of scikit-learn 0.23, estimators should expose a " + "n_features_in_ attribute, unless the 'no_validation' tag is " + "True. This attribute should be equal to the number of features " + "passed to the fit method. " + "An error will be raised from version 0.25 when calling " + "check_estimator(). 
" + "See SLEP010: " + "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa + FutureWarning + ) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index cc017b6667658..748666884e60e 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -60,7 +60,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self def predict(self, X): @@ -75,7 +75,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -85,14 +85,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -109,7 +109,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -126,7 +126,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -145,19 +145,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -169,7 +169,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -182,10 +182,11 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) # Function is only called after we verify that pandas is installed from pandas import Series if isinstance(sample_weight, Series): @@ -222,7 +223,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def fit(self, X, y=None): - X = check_array(X) + X = self._validate_data(X) return self def transform(self, X): @@ -233,10 +234,11 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) return self def predict(self, X): @@ -249,11 +251,12 @@ def predict(self, X): class LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc", "coo"), - accept_large_sparse=True, - 
multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc", "coo"), + accept_large_sparse=True, + multi_output=True, + y_numeric=True) if sp.issparse(X): if X.getformat() == "coo": if X.row.dtype == "int64" or X.col.dtype == "int64": @@ -268,7 +271,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = check_array(X).shape + self.X_shape_ = self._validate_data(X).shape return self def fit_transform(self, X, y=None): @@ -320,7 +323,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = check_X_y(X, y, multi_output=True) + X, y = self._validate_data(X, y, multi_output=True) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y) From eb540f35f8e5d010f641a4ebda186040b0cf179c Mon Sep 17 00:00:00 2001 From: jumon <34873661+jumon@users.noreply.github.com> Date: Sun, 1 Mar 2020 02:38:45 +0900 Subject: [PATCH 410/448] DOC Rename clf to regr in SVR examples (#16598) --- doc/modules/svm.rst | 6 +++--- sklearn/svm/_classes.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index fae28b1ffc0a2..706a9ff559aa8 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -320,10 +320,10 @@ floating point values instead of integer values:: >>> from sklearn import svm >>> X = [[0, 0], [2, 2]] >>> y = [0.5, 2.5] - >>> clf = svm.SVR() - >>> clf.fit(X, y) + >>> regr = svm.SVR() + >>> regr.fit(X, y) SVR() - >>> clf.predict([[1, 1]]) + >>> regr.predict([[1, 1]]) array([1.5]) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index e04b904e294b2..fbaa6e97ec616 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -939,8 +939,8 @@ class SVR(RegressorMixin, BaseLibSVM): >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) - >>> clf = SVR(C=1.0, epsilon=0.2) - >>> clf.fit(X, y) + >>> regr = SVR(C=1.0, epsilon=0.2) + >>> regr.fit(X, y) SVR(epsilon=0.2) See also @@ -1079,8 +1079,8 @@ class NuSVR(RegressorMixin, BaseLibSVM): >>> np.random.seed(0) >>> y = np.random.randn(n_samples) >>> X = np.random.randn(n_samples, n_features) - >>> clf = NuSVR(C=1.0, nu=0.1) - >>> clf.fit(X, y) + >>> regr = NuSVR(C=1.0, nu=0.1) + >>> regr.fit(X, y) NuSVR(nu=0.1) See also From 1c7449036515d4204f9467493010228588ed378d Mon Sep 17 00:00:00 2001 From: Tim Nonner Date: Sat, 29 Feb 2020 19:43:30 +0100 Subject: [PATCH 411/448] DOC Adds example for RandomTreesEmbedding (#15202) --- sklearn/ensemble/_forest.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 8070d0cd0eaff..d6784b10f05d3 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2189,6 +2189,20 @@ class RandomTreesEmbedding(BaseForest): estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. + Examples + -------- + >>> from sklearn.ensemble import RandomTreesEmbedding + >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]] + >>> random_trees = RandomTreesEmbedding( + ... 
n_estimators=5, random_state=0, max_depth=1).fit(X) + >>> X_sparse_embedding = random_trees.transform(X) + >>> X_sparse_embedding.toarray() + array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], + [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) + References ---------- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", From 68a639e3fdcbd41a556a43768c1270cece18702c Mon Sep 17 00:00:00 2001 From: wderose Date: Sat, 29 Feb 2020 11:20:40 -0800 Subject: [PATCH 412/448] MNT rename _parallel_fit_estimator to _fit_single_estimator to reflect lack of parallelism in the method (#16599) --- sklearn/ensemble/_base.py | 4 ++-- sklearn/ensemble/_stacking.py | 4 ++-- sklearn/ensemble/_voting.py | 4 ++-- sklearn/ensemble/tests/test_voting.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 41e50f7d5a7fc..23db107874c9b 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -20,8 +20,8 @@ from ..utils.metaestimators import _BaseComposition -def _parallel_fit_estimator(estimator, X, y, sample_weight=None, - message_clsname=None, message=None): +def _fit_single_estimator(estimator, X, y, sample_weight=None, + message_clsname=None, message=None): """Private function used to fit an estimator within a job.""" if sample_weight is not None: try: diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index a5fedc6431344..cd18a684a4518 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -14,7 +14,7 @@ from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor -from ._base import _parallel_fit_estimator +from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..linear_model import LogisticRegression @@ -137,7 +137,7 @@ def fit(self, X, y, sample_weight=None): # base estimators will be used in transform, predict, and # predict_proba. They are exposed publicly. 
self.estimators_ = Parallel(n_jobs=self.n_jobs)( - delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight) + delayed(_fit_single_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop' ) self.n_features_in_ = self.estimators_[0].n_features_in_ diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index d01647a5e4448..0da6dc86c30fa 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -23,7 +23,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone -from ._base import _parallel_fit_estimator +from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder from ..utils import Bunch @@ -69,7 +69,7 @@ def fit(self, X, y, sample_weight=None): % (len(self.weights), len(self.estimators))) self.estimators_ = Parallel(n_jobs=self.n_jobs)( - delayed(_parallel_fit_estimator)( + delayed(_fit_single_estimator)( clone(clf), X, y, sample_weight=sample_weight, message_clsname='Voting', diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 599f62366b51b..4eb47bea0a514 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -316,7 +316,7 @@ def test_sample_weight(): with pytest.raises(TypeError, match=msg): eclf3.fit(X, y, sample_weight) - # check that _parallel_fit_estimator will raise the right error + # check that _fit_single_estimator will raise the right error # it should raise the original error if this is not linked to sample_weight class ClassifierErrorFit(ClassifierMixin, BaseEstimator): def fit(self, X, y, sample_weight): From d86f8fdf0cfa96389ab4863a595e8d50ae9c9322 Mon Sep 17 00:00:00 2001 From: pspachtholz Date: Sat, 29 Feb 2020 21:08:49 +0100 Subject: [PATCH 413/448] DOC Adds examples to GradientBoostingClassifier and GradientBoostingRegressor (#15151) --- sklearn/ensemble/_gb.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 4a458359437b3..7e354cb720bbe 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1029,6 +1029,22 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): split. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed. + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = GradientBoostingClassifier(random_state=0) + >>> clf.fit(X_train, y_train) + GradientBoostingClassifier(random_state=0) + >>> clf.predict(X_test[:2]) + array([1, 0]) + >>> clf.score(X_test, y_test) + 0.88 + See also -------- sklearn.ensemble.HistGradientBoostingClassifier, @@ -1506,6 +1522,22 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): split. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed. + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... 
X, y, random_state=0) + >>> reg = GradientBoostingRegressor(random_state=0) + >>> reg.fit(X_train, y_train) + GradientBoostingRegressor(random_state=0) + >>> reg.predict(X_test[1:2]) + array([-61...]) + >>> reg.score(X_test, y_test) + 0.4... + See also -------- sklearn.ensemble.HistGradientBoostingRegressor, From 94f877b55efb55b4cccf8fe03f8d299abca3eb7a Mon Sep 17 00:00:00 2001 From: Rick Mackenbach Date: Sun, 1 Mar 2020 04:07:52 +0100 Subject: [PATCH 414/448] Fix format of values in confusion matrix plot. (#16159) --- doc/whats_new/v0.23.rst | 6 +++++ sklearn/metrics/_plot/confusion_matrix.py | 26 ++++++++++++------- .../_plot/tests/test_plot_confusion_matrix.py | 18 +++++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 89622c52c5041..96702dae01235 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -262,6 +262,12 @@ Changelog :mod:`sklearn.metrics` ...................... +- |API| Changed the formatting of values in + :meth:`metrics.ConfusionMatrixDisplay.plot` and + :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + `Thomas Fan`_. + - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows its ``reduce_func`` to not have a return value, enabling in-place operations. :pr:`16397` by `Joel Nothman`_. diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index df4e59fb5384d..96d99adfe7386 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -61,7 +61,7 @@ def plot(self, include_values=True, cmap='viridis', values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2g'. + the format specification is 'd' or '.2g' whichever is shorter. ax : matplotlib axes, default=None Axes object to plot on. If `None`, a new figure and axes is @@ -83,22 +83,30 @@ def plot(self, include_values=True, cmap='viridis', n_classes = cm.shape[0] self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap) self.text_ = None - cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256) if include_values: self.text_ = np.empty_like(cm, dtype=object) - if values_format is None: - values_format = '.2g' # print text with appropriate color depending on background thresh = (cm.max() + cm.min()) / 2.0 + for i, j in product(range(n_classes), range(n_classes)): color = cmap_max if cm[i, j] < thresh else cmap_min - self.text_[i, j] = ax.text(j, i, - format(cm[i, j], values_format), - ha="center", va="center", - color=color) + + if values_format is None: + text_cm = format(cm[i, j], '.2g') + if cm.dtype.kind != 'f': + text_d = format(cm[i, j], 'd') + if len(text_d) < len(text_cm): + text_cm = text_d + else: + text_cm = format(cm[i, j], values_format) + + self.text_[i, j] = ax.text( + j, i, text_cm, + ha="center", va="center", + color=color) fig.colorbar(self.im_, ax=ax) ax.set(xticks=np.arange(n_classes), @@ -164,7 +172,7 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2g'. + the format specification is 'd' or '.2g' whichever is shorter. cmap : str or matplotlib Colormap, default='viridis' Colormap recognized by matplotlib. 
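The default cell-formatting rule introduced in the hunk above can be illustrated in isolation. The short sketch below is not part of the patch; it only re-applies the same logic with the built-in format() to the integer counts used in the new test that follows. With values_format=None, each cell is rendered with '.2g' and, for integer matrices, also with 'd', keeping whichever string is shorter:

import numpy as np

cm = np.array([[10000000, 0], [123456, 12345678]])
rendered = []
for value in cm.ravel():
    text = format(value, '.2g')        # 2 significant digits, e.g. '1e+07'
    if cm.dtype.kind != 'f':           # integer matrices also try plain 'd'
        text_d = format(value, 'd')
        if len(text_d) < len(text):    # keep the shorter of the two strings
            text = text_d
    rendered.append(text)
print(rendered)                        # ['1e+07', '0', '123456', '1.2e+07']

The printed values match the expectations asserted in test_confusion_matrix_standard_format in the test diff below.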
diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index c58684a61d418..b8a24ae15f1e5 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -21,6 +21,7 @@ "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:" "matplotlib.*") + @pytest.fixture(scope="module") def n_classes(): return 5 @@ -262,3 +263,20 @@ def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, text_text = np.array([ t.get_text() for t in disp.text_.ravel()]) assert_array_equal(expected_text, text_text) + + +def test_confusion_matrix_standard_format(pyplot): + cm = np.array([[10000000, 0], [123456, 12345678]]) + plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + # Values should be shown as whole numbers 'd', + # except the first number which should be shown as 1e+07 (longer length) + # and the last number will be showns as 1.2e+07 (longer length) + test = [t.get_text() for t in plotted_text.ravel()] + assert test == ['1e+07', '0', '123456', '1.2e+07'] + + cm = np.array([[0.1, 10], [100, 0.525]]) + plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + # Values should now formatted as '.2g', since there's a float in + # Values are have two dec places max, (e.g 100 becomes 1e+02) + test = [t.get_text() for t in plotted_text.ravel()] + assert test == ['0.1', '10', '1e+02', '0.53'] From 8868ec775ef734f87a275b6aceaaf3cead913e36 Mon Sep 17 00:00:00 2001 From: Shiki-H <25473496+Shiki-H@users.noreply.github.com> Date: Sun, 1 Mar 2020 08:50:29 -0500 Subject: [PATCH 415/448] DOC Add formula for binary balanced accuracy in UG (#16604) --- doc/modules/model_evaluation.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7745bbb351809..e1b7ae34f1647 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -459,7 +459,11 @@ In the binary case, balanced accuracy is equal to the arithmetic mean of (true positive rate) and `specificity `_ (true negative rate), or the area under the ROC curve with binary predictions rather than -scores. +scores: + +.. 
math:: + + \texttt{balanced-accuracy} = \frac{1}{2}\left( \frac{TP}{TP + FN} + \frac{TN}{TN + FP}\right ) If the classifier performs equally well on either class, this term reduces to the conventional accuracy (i.e., the number of correct predictions divided by From cd622dfb4215d093925f4c4bd2241b4f0201e90e Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Mon, 2 Mar 2020 06:41:19 +0100 Subject: [PATCH 416/448] MNT Download and test datasets in cron job (#16348) --- build_tools/travis/test_script.sh | 7 +-- sklearn/datasets/tests/conftest.py | 61 +++++++++++++++++++ sklearn/datasets/tests/test_20news.py | 59 ++++++------------ .../datasets/tests/test_california_housing.py | 47 +++----------- sklearn/datasets/tests/test_covtype.py | 25 +++----- sklearn/datasets/tests/test_kddcup99.py | 41 +++++-------- sklearn/datasets/tests/test_olivetti_faces.py | 25 +++----- sklearn/datasets/tests/test_rcv1.py | 25 +++----- 8 files changed, 129 insertions(+), 161 deletions(-) create mode 100644 sklearn/datasets/tests/conftest.py diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index f13e0f1bbb2fa..a9c8fb73f9552 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -30,10 +30,9 @@ run_tests() { cp setup.cfg $TEST_DIR cd $TEST_DIR - # Skip tests that require large downloads over the network to save bandwidth - # usage as travis workers are stateless and therefore traditional local - # disk caching does not work. - export SKLEARN_SKIP_NETWORK_TESTS=1 + # Tests that require large downloads over the networks are skipped in CI. + # Here we make sure, that they are still run on a regular basis. + export SKLEARN_SKIP_NETWORK_TESTS=0 if [[ "$COVERAGE" == "true" ]]; then TEST_CMD="$TEST_CMD --cov sklearn" diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py new file mode 100644 index 0000000000000..85242d7335685 --- /dev/null +++ b/sklearn/datasets/tests/conftest.py @@ -0,0 +1,61 @@ +""" Network tests are only run, if data is already locally available, +or if download is specifically requested by environment variable.""" +from os import environ +import pytest +from sklearn.datasets import fetch_20newsgroups +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.datasets import fetch_california_housing +from sklearn.datasets import fetch_covtype +from sklearn.datasets import fetch_kddcup99 +from sklearn.datasets import fetch_olivetti_faces +from sklearn.datasets import fetch_rcv1 + + +def _wrapped_fetch(f, dataset_name): + """ Fetch dataset (download if missing and requested by environment) """ + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' + + def wrapped(*args, **kwargs): + kwargs['download_if_missing'] = download_if_missing + try: + return f(*args, **kwargs) + except IOError: + pytest.skip("Download {} to run this test".format(dataset_name)) + return wrapped + + +@pytest.fixture +def fetch_20newsgroups_fxt(): + return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups') + + +@pytest.fixture +def fetch_20newsgroups_vectorized_fxt(): + return _wrapped_fetch(fetch_20newsgroups_vectorized, + dataset_name='20newsgroups_vectorized') + + +@pytest.fixture +def fetch_california_housing_fxt(): + return _wrapped_fetch(fetch_california_housing, + dataset_name='california_housing') + + +@pytest.fixture +def fetch_covtype_fxt(): + return _wrapped_fetch(fetch_covtype, dataset_name='covtype') + + +@pytest.fixture +def fetch_kddcup99_fxt(): + return 
_wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99') + + +@pytest.fixture +def fetch_olivetti_faces_fxt(): + return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces') + + +@pytest.fixture +def fetch_rcv1_fxt(): + return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 15cb49c44b0e5..f800a49238ec1 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -1,25 +1,21 @@ -"""Test the 20news downloader, if the data is available.""" +"""Test the 20news downloader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" from functools import partial import numpy as np import scipy.sparse as sp -from sklearn.utils._testing import SkipTest, assert_allclose_dense_sparse +from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.datasets.tests.test_common import check_return_X_y - -from sklearn import datasets from sklearn.preprocessing import normalize -def test_20news(): - try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") +def test_20news(fetch_20newsgroups_fxt): + data = fetch_20newsgroups_fxt(subset='all', shuffle=False) # Extract a reduced dataset - data2cats = datasets.fetch_20newsgroups( + data2cats = fetch_20newsgroups_fxt( subset='all', categories=data.target_names[-1:-3:-1], shuffle=False) # Check that the ordering of the target_names is the same # as the ordering in the full dataset @@ -40,72 +36,53 @@ def test_20news(): assert entry1 == entry2 # check that return_X_y option - X, y = datasets.fetch_20newsgroups( - subset='all', shuffle=False, return_X_y=True - ) + X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True) assert len(X) == len(data.data) assert y.shape == data.target.shape -def test_20news_length_consistency(): +def test_20news_length_consistency(fetch_20newsgroups_fxt): """Checks the length consistencies within the bunch This is a non-regression test for a bug present in 0.16.1. 
""" - try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") # Extract the full dataset - data = datasets.fetch_20newsgroups(subset='all') + data = fetch_20newsgroups_fxt(subset='all') assert len(data['data']) == len(data.data) assert len(data['target']) == len(data.target) assert len(data['filenames']) == len(data.filenames) -def test_20news_vectorized(): - try: - datasets.fetch_20newsgroups(subset='all', - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - +def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt): # test subset = train - bunch = datasets.fetch_20newsgroups_vectorized(subset="train") + bunch = fetch_20newsgroups_vectorized_fxt(subset="train") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314, 130107) assert bunch.target.shape[0] == 11314 assert bunch.data.dtype == np.float64 # test subset = test - bunch = datasets.fetch_20newsgroups_vectorized(subset="test") + bunch = fetch_20newsgroups_vectorized_fxt(subset="test") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (7532, 130107) assert bunch.target.shape[0] == 7532 assert bunch.data.dtype == np.float64 # test return_X_y option - fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test') + fetch_func = partial(fetch_20newsgroups_vectorized_fxt, subset='test') check_return_X_y(bunch, fetch_func) # test subset = all - bunch = datasets.fetch_20newsgroups_vectorized(subset='all') + bunch = fetch_20newsgroups_vectorized_fxt(subset='all') assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314 + 7532, 130107) assert bunch.target.shape[0] == 11314 + 7532 assert bunch.data.dtype == np.float64 -def test_20news_normalization(): - try: - X = datasets.fetch_20newsgroups_vectorized(normalize=False, - download_if_missing=False) - X_ = datasets.fetch_20newsgroups_vectorized(normalize=True, - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - +def test_20news_normalization(fetch_20newsgroups_vectorized_fxt): + X = fetch_20newsgroups_vectorized_fxt(normalize=False) + X_ = fetch_20newsgroups_vectorized_fxt(normalize=True) X_norm = X_['data'][:100] X = X['data'][:100] diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 6112bf966b303..af1e1ff1370e1 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,48 +1,25 @@ -"""Test the california_housing loader. - -Skipped if california_housing is not already downloaded to data_home. -""" - +"""Test the california_housing loader, if the data is available, +or if specifically requested via environment variable +(e.g. 
for travis cron job).""" import pytest -from sklearn.datasets import fetch_california_housing from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - return fetch_california_housing(*args, download_if_missing=False, **kwargs) - - -def _is_california_housing_dataset_not_available(): - try: - fetch_california_housing(download_if_missing=False) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_fetch(): - data = fetch() +def test_fetch(fetch_california_housing_fxt): + data = fetch_california_housing_fxt() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_california_housing_fxt) check_return_X_y(data, fetch_func) -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_fetch_asframe(): +def test_fetch_asframe(fetch_california_housing_fxt): pd = pytest.importorskip('pandas') - bunch = fetch(as_frame=True) + bunch = fetch_california_housing_fxt(as_frame=True) frame = bunch.frame assert hasattr(bunch, 'frame') is True assert frame.shape == (20640, 9) @@ -50,11 +27,7 @@ def test_fetch_asframe(): assert isinstance(bunch.target, pd.Series) -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_pandas_dependency_message(): +def test_pandas_dependency_message(fetch_california_housing_fxt): try: import pandas # noqa pytest.skip("This test requires pandas to be not installed") @@ -64,4 +37,4 @@ def test_pandas_dependency_message(): expected_msg = ('fetch_california_housing with as_frame=True' ' requires pandas') with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing(as_frame=True) + fetch_california_housing_fxt(as_frame=True) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 1127b8114c5e7..d966e6c3890d0 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,25 +1,14 @@ -"""Test the covtype loader. +"""Test the covtype loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" -Skipped if covtype is not already downloaded to data_home. 
-""" - -from sklearn.datasets import fetch_covtype -from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - return fetch_covtype(*args, download_if_missing=False, **kwargs) - - -def test_fetch(): - try: - data1 = fetch(shuffle=True, random_state=42) - except IOError: - raise SkipTest("Covertype dataset can not be loaded.") - - data2 = fetch(shuffle=True, random_state=37) +def test_fetch(fetch_covtype_fxt): + data1 = fetch_covtype_fxt(shuffle=True, random_state=42) + data2 = fetch_covtype_fxt(shuffle=True, random_state=37) X1, X2 = data1['data'], data2['data'] assert (581012, 54) == X1.shape @@ -32,5 +21,5 @@ def test_fetch(): assert (X1.shape[0],) == y2.shape # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 6d371e5a8e6f0..899abd2bcb153 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -1,55 +1,46 @@ -"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data -is too big to use in unit-testing. +"""Test kddcup99 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job). -The test is skipped if the data wasn't previously fetched and saved to -scikit-learn data folder. +Only 'percent10' mode is tested, as the full data +is too big to use in unit-testing. """ -from sklearn.datasets import fetch_kddcup99 from sklearn.datasets.tests.test_common import check_return_X_y -from sklearn.utils._testing import SkipTest from functools import partial - -def test_percent10(): - try: - data = fetch_kddcup99(download_if_missing=False) - except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") +def test_percent10(fetch_kddcup99_fxt): + data = fetch_kddcup99_fxt() assert data.data.shape == (494021, 41) assert data.target.shape == (494021,) - data_shuffled = fetch_kddcup99(shuffle=True, random_state=0) + data_shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0) assert data.data.shape == data_shuffled.data.shape assert data.target.shape == data_shuffled.target.shape - data = fetch_kddcup99('SA') + data = fetch_kddcup99_fxt('SA') assert data.data.shape == (100655, 41) assert data.target.shape == (100655,) - data = fetch_kddcup99('SF') + data = fetch_kddcup99_fxt('SF') assert data.data.shape == (73237, 4) assert data.target.shape == (73237,) - data = fetch_kddcup99('http') + data = fetch_kddcup99_fxt('http') assert data.data.shape == (58725, 3) assert data.target.shape == (58725,) - data = fetch_kddcup99('smtp') + data = fetch_kddcup99_fxt('smtp') assert data.data.shape == (9571, 3) assert data.target.shape == (9571,) - fetch_func = partial(fetch_kddcup99, 'smtp') + fetch_func = partial(fetch_kddcup99_fxt, 'smtp') check_return_X_y(data, fetch_func) -def test_shuffle(): - try: - dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, - percent10=True, download_if_missing=False) - except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") - +def test_shuffle(fetch_kddcup99_fxt): + dataset = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True, + percent10=True) assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 
0162676c50af7..f0c7aa1216e76 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -1,28 +1,17 @@ -"""Test Olivetti faces fetcher, if the data is available.""" -import pytest +"""Test Olivetti faces fetcher, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" + import numpy as np -from sklearn import datasets from sklearn.utils import Bunch from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_array_equal -def _is_olivetti_faces_not_available(): - try: - datasets.fetch_olivetti_faces(download_if_missing=False) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_olivetti_faces_not_available(), - reason='Download Olivetti faces dataset to run this test' -) -def test_olivetti_faces(): - data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0) +def test_olivetti_faces(fetch_olivetti_faces_fxt): + data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0) assert isinstance(data, Bunch) for expected_keys in ('data', 'images', 'target', 'DESCR'): @@ -34,4 +23,4 @@ def test_olivetti_faces(): assert_array_equal(np.unique(np.sort(data.target)), np.arange(40)) # test the return_X_y option - check_return_X_y(data, datasets.fetch_olivetti_faces) + check_return_X_y(data, fetch_olivetti_faces_fxt) diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index 7cae454bf158b..2c21201dce40e 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -1,26 +1,17 @@ -"""Test the rcv1 loader. +"""Test the rcv1 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" -Skipped if rcv1 is not already downloaded to data_home. 
-""" - -import errno import scipy.sparse as sp import numpy as np from functools import partial -from sklearn.datasets import fetch_rcv1 from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import SkipTest - -def test_fetch_rcv1(): - try: - data1 = fetch_rcv1(shuffle=False, download_if_missing=False) - except IOError as e: - if e.errno == errno.ENOENT: - raise SkipTest("Download RCV1 dataset to run this test.") +def test_fetch_rcv1(fetch_rcv1_fxt): + data1 = fetch_rcv1_fxt(shuffle=False) X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id @@ -48,14 +39,12 @@ def test_fetch_rcv1(): assert num == Y1[:, j].data.size # test shuffling and subset - data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77, - download_if_missing=False) + data2 = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option - fetch_func = partial(fetch_rcv1, shuffle=False, subset='train', - download_if_missing=False) + fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset='train') check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples From 76ef4e11b5f85bd9c475ab9009bd7a95a4540bc4 Mon Sep 17 00:00:00 2001 From: Riccardo Folloni Date: Mon, 2 Mar 2020 14:25:40 +0100 Subject: [PATCH 417/448] DOC Added MLPRegressor and MLPClassifier examples (#15228) * Added MLPRegressor and MLPClassifier examples * shortened line due to test failure * changed way to calculate score due to approximation error in tests * removed comments, used predict instead of cross validation * DOC Simplified make_regression and make_classification arguments * DOC fix for linting * DOC Update * DOC Less precision Co-authored-by: Thomas J Fan --- .../neural_network/_multilayer_perceptron.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index dbd4c012c8afa..6eb42bb455c3a 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -890,6 +890,23 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): out_activation_ : string Name of the output activation function. + + Examples + -------- + >>> from sklearn.neural_network import MLPClassifier + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(n_samples=100, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, + ... random_state=1) + >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train) + >>> clf.predict_proba(X_test[:1]) + array([[0.038..., 0.961...]]) + >>> clf.predict(X_test[:5, :]) + array([1, 0, 1, 0, 1]) + >>> clf.score(X_test, y_test) + 0.8... + Notes ----- MLPClassifier trains iteratively since at each time step @@ -1279,6 +1296,20 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): out_activation_ : string Name of the output activation function. 
+ Examples + -------- + >>> from sklearn.neural_network import MLPRegressor + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(n_samples=200, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=1) + >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train) + >>> regr.predict(X_test[:2]) + array([-0.9..., -7.1...]) + >>> regr.score(X_test, y_test) + 0.4... + Notes ----- MLPRegressor trains iteratively since at each time step From 0e4f85fe72932c5adeb7de8652a7b76482190341 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 2 Mar 2020 13:46:49 -0500 Subject: [PATCH 418/448] MNT Periodic adds labels based on module (#16596) * MNT Users pull request labeler * MNT Fix * labeler fix * MNT Update * MNT Uses single quotes * BUG Uses number * MN Removes Build / CI auto labeling * MNT Updates to labeler v2.4.1 * ENH Updates version to include logs --- .github/labeler.yml | 80 +++++++++++++++++++++++++++++++++++ .github/workflows/labeler.yml | 13 ++++++ 2 files changed, 93 insertions(+) create mode 100644 .github/labeler.yml create mode 100644 .github/workflows/labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000000000..faf2acdc2e9db --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,80 @@ +module:cluster: +- sklearn/cluster/**/* + +module:common: +- sklearn/common/**/* + +module:compose: +- sklearn/compose/**/* + +module:covariance: +- sklearn/covariance/**/* + +module:cross_decomposition: +- sklearn/cross_decomposition/**/* + +module:datasets: +- sklearn/datasets/**/* + +module:decomposition: +- sklearn/decomposition/**/* + +module:ensemble: +- sklearn/ensemble/**/* + +module:feature_extraction: +- sklearn/feature_extraction/**/* + +module:feature_selection: +- sklearn/feature_selection/**/* + +module:gaussian_process: +- sklearn/gaussian_process/**/* + +module:impute: +- sklearn/impute/**/* + +module:inspection: +- sklearn/inspection/**/* + +module:linear_model: +- sklearn/linear_model/**/* + +module:manifold: +- sklearn/manifold/**/* + +module:metrics: +- sklearn/metrics/**/* + +module:mixture: +- sklearn/mixture/**/* + +module:model_selection: +- sklearn/model_selection/**/* + +module:naive_bayes: +- sklearn/naive_bayes.py + +module:neighbors: +- sklearn/neighbors/**/* + +module:neural_network: +- sklearn/neural_network/**/* + +module:pipeline: +- sklearn/pipeline.py + +module:preprocessing: +- sklearn/preprocessing/**/* + +module:semi_supervised: +- sklearn/semi_supervised/**/* + +module:svm: +- sklearn/svm/**/* + +module:tree: +- sklearn/tree/**/* + +module:utils: +- sklearn/utils/**/* diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000000000..8d7c219f8c4dc --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,13 @@ +name: "Pull Request Labeler" +on: + schedule: + - cron: "*/10 * * * *" + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: thomasjpfan/labeler@v2.4.3 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + max-labels: "3" From 6464724a802c07c168fb73fba76a6d64ae70e4be Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 2 Mar 2020 14:43:00 -0500 Subject: [PATCH 419/448] MNT Enables error printing correctly in autolabeler (#16614) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 
8d7c219f8c4dc..e763551b3cf6a 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -7,7 +7,7 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: thomasjpfan/labeler@v2.4.3 + - uses: thomasjpfan/labeler@v2.4.4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" From e275218ad6b5d5bc3c6195d139dbfafa0f2978c1 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 2 Mar 2020 20:46:12 +0100 Subject: [PATCH 420/448] API make __init__ params in covariance kw-only (#16544) --- sklearn/covariance/_elliptic_envelope.py | 4 +++- sklearn/covariance/_empirical_covariance.py | 4 +++- sklearn/covariance/_graph_lasso.py | 9 +++++---- sklearn/covariance/_robust_covariance.py | 4 +++- sklearn/covariance/_shrunk_covariance.py | 7 +++++-- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 4f4624c995ba7..9b7c00efd53a5 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -5,6 +5,7 @@ import numpy as np from . import MinCovDet from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import _deprecate_positional_args from ..metrics import accuracy_score from ..base import OutlierMixin @@ -102,7 +103,8 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): minimum covariance determinant estimator" Technometrics 41(3), 212 (1999) """ - def __init__(self, store_precision=True, assume_centered=False, + @_deprecate_positional_args + def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, random_state=None): super().__init__( diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 30520b64dc507..c83dbc89697e1 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -18,6 +18,7 @@ from ..utils import check_array from ..utils.extmath import fast_logdet from ..metrics.pairwise import pairwise_distances +from ..utils.validation import _deprecate_positional_args def log_likelihood(emp_cov, precision): @@ -142,7 +143,8 @@ class EmpiricalCovariance(BaseEstimator): array([0.0622..., 0.0193...]) """ - def __init__(self, store_precision=True, assume_centered=False): + @_deprecate_positional_args + def __init__(self, *, store_precision=True, assume_centered=False): self.store_precision = store_precision self.assume_centered = assume_centered diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 4fd1b01d763b2..77ff9adb7fc0c 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -19,6 +19,7 @@ from ..exceptions import ConvergenceWarning from ..utils.validation import check_random_state, check_array +from ..utils.validation import _deprecate_positional_args from ..linear_model import _cd_fast as cd_fast from ..linear_model import lars_path_gram from ..model_selection import check_cv, cross_val_score @@ -355,8 +356,8 @@ class GraphicalLasso(EmpiricalCovariance): -------- graphical_lasso, GraphicalLassoCV """ - - def __init__(self, alpha=.01, mode='cd', tol=1e-4, enet_tol=1e-4, + @_deprecate_positional_args + def __init__(self, alpha=.01, *, mode='cd', tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False, assume_centered=False): super().__init__(assume_centered=assume_centered) self.alpha = alpha @@ -631,8 +632,8 @@ class GraphicalLassoCV(GraphicalLasso): values of alpha then come out as 
missing values, but the optimum may be close to these missing values. """ - - def __init__(self, alphas=4, n_refinements=4, cv=None, tol=1e-4, + @_deprecate_positional_args + def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=1e-4, enet_tol=1e-4, max_iter=100, mode='cd', n_jobs=None, verbose=False, assume_centered=False): super().__init__( diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 586c39cadecbe..73b36942682a1 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -17,6 +17,7 @@ from . import empirical_covariance, EmpiricalCovariance from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array +from ..utils.validation import _deprecate_positional_args # Minimum Covariance Determinant @@ -614,7 +615,8 @@ class MinCovDet(EmpiricalCovariance): """ _nonrobust_covariance = staticmethod(empirical_covariance) - def __init__(self, store_precision=True, assume_centered=False, + @_deprecate_positional_args + def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, random_state=None): self.store_precision = store_precision self.assume_centered = assume_centered diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index b2c43cb3eb3cd..06e1b4f180347 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,6 +18,7 @@ from . import empirical_covariance, EmpiricalCovariance from ..utils import check_array +from ..utils.validation import _deprecate_positional_args # ShrunkCovariance estimator @@ -117,7 +118,8 @@ class ShrunkCovariance(EmpiricalCovariance): where mu = trace(cov) / n_features """ - def __init__(self, store_precision=True, assume_centered=False, + @_deprecate_positional_args + def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1): super().__init__(store_precision=store_precision, assume_centered=assume_centered) @@ -388,7 +390,8 @@ class LedoitWolf(EmpiricalCovariance): Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. 
""" - def __init__(self, store_precision=True, assume_centered=False, + @_deprecate_positional_args + def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000): super().__init__(store_precision=store_precision, assume_centered=assume_centered) From 989579f71ad68c075b2534191e9b7efbce34f830 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 2 Mar 2020 15:31:24 -0500 Subject: [PATCH 421/448] =?UTF-8?q?MNT=20Updates=20autolabeler=20to=20use?= =?UTF-8?q?=20per=5Fpage=3D100=20to=20help=20with=20rate=E2=80=A6=20(#1661?= =?UTF-8?q?6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/labeler.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index e763551b3cf6a..688602b5959b6 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -7,7 +7,8 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: thomasjpfan/labeler@v2.4.4 + - uses: thomasjpfan/labeler@v2.4.5 + if: github.repository == 'scikit-learn/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" From e5d7f1d3517e1cd7491c1f92fd9f1ef6af27b1d5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 2 Mar 2020 16:04:56 -0500 Subject: [PATCH 422/448] MNT Reduces github API calls greatly in autolabeler (#16617) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 688602b5959b6..28d1debcad7f1 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -7,7 +7,7 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: thomasjpfan/labeler@v2.4.5 + - uses: thomasjpfan/labeler@v2.4.6 if: github.repository == 'scikit-learn/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 72b3041ed57e42817e4c5c9853b3a2597cab3654 Mon Sep 17 00:00:00 2001 From: Eric Leung Date: Mon, 2 Mar 2020 16:18:05 -0800 Subject: [PATCH 423/448] DOC Capitalize Kaggle as proper noun (#16618) --- doc/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/getting_started.rst b/doc/getting_started.rst index ba18b92e40983..79e7ac5b52bb9 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -205,7 +205,7 @@ the best set of parameters. Read more in the :ref:`User Guide training and testing data. Indeed, since you pre-processed the data using the whole dataset, some information about the test sets are available to the train sets. This will lead to over-estimating the - generalization power of the estimator (you can read more in this `kaggle + generalization power of the estimator (you can read more in this `Kaggle post `_). Using a pipeline for cross-validation and searching will largely keep From 48738f7adfedc590f81fa4b755afd3e2dee57c08 Mon Sep 17 00:00:00 2001 From: maikia Date: Tue, 3 Mar 2020 15:33:57 +0100 Subject: [PATCH 424/448] in plot_stack_predictors exchanged boston dataset for Ames housing dataset (#16345) --- examples/ensemble/plot_stack_predictors.py | 211 +++++++++++++++++---- 1 file changed, 172 insertions(+), 39 deletions(-) diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py index 0a3e12646b427..3d2ff4a38ddd7 100644 --- a/examples/ensemble/plot_stack_predictors.py +++ b/examples/ensemble/plot_stack_predictors.py @@ -3,6 +3,8 @@ Combine predictors using stacking ================================= +.. 
currentmodule:: sklearn + Stacking refers to a method to blend estimators. In this strategy, some estimators are individually fitted on some training data while a final estimator is trained using the stacked predictions of these base estimators. @@ -16,42 +18,128 @@ print(__doc__) # Authors: Guillaume Lemaitre +# Maria Telenczuk # License: BSD 3 clause + ############################################################################### -# The function ``plot_regression_results`` is used to plot the predicted and -# true targets. +# Download the dataset +############################################################################### +# +# We will use `Ames Housing`_ dataset which was first compiled by Dean De Cock +# and became better known after it was used in Kaggle challenge. It is a set +# of 1460 residential homes in Ames, Iowa, each described by 80 features. We +# will use it to predict the final logarithmic price of the houses. In this +# example we will use only 20 most interesting features chosen using +# GradientBoostingRegressor() and limit number of entries (here we won't go +# into the details on how to select the most interesting features). +# +# The Ames housing dataset is not shipped with scikit-learn and therefore we +# will fetch it from `OpenML`_. +# +# .. _`Ames Housing`: http://jse.amstat.org/v19n3/decock.pdf +# .. _`OpenML`: https://www.openml.org/d/42165 -import matplotlib.pyplot as plt +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.utils import shuffle -def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): - """Scatter plot of the predicted vs true targets.""" - ax.plot([y_true.min(), y_true.max()], - [y_true.min(), y_true.max()], - '--r', linewidth=2) - ax.scatter(y_true, y_pred, alpha=0.2) - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - ax.get_xaxis().tick_bottom() - ax.get_yaxis().tick_left() - ax.spines['left'].set_position(('outward', 10)) - ax.spines['bottom'].set_position(('outward', 10)) - ax.set_xlim([y_true.min(), y_true.max()]) - ax.set_ylim([y_true.min(), y_true.max()]) - ax.set_xlabel('Measured') - ax.set_ylabel('Predicted') - extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False, - edgecolor='none', linewidth=0) - ax.legend([extra], [scores], loc='upper left') - title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time) - ax.set_title(title) +def load_ames_housing(): + df = fetch_openml(name="house_prices", as_frame=True) + X = df.data + y = df.target + + features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating', + 'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea', + 'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars', + 'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1', + 'HouseStyle', 'MiscFeature', 'MoSold'] + + X = X[features] + X, y = shuffle(X, y, random_state=0) + + X = X[:600] + y = y[:600] + return X, np.log(y) + + +X, y = load_ames_housing() + + +############################################################################### +# Make pipeline to preprocess the data +############################################################################### +# +# Before we can use Ames dataset we still need to do some preprocessing. +# First, the dataset has many missing values. To impute them, we will exchange +# categorical missing values with the new category 'missing' while the +# numerical missing values with the 'mean' of the column. 
We will also encode +# the categories with either :class:`sklearn.preprocessing.OneHotEncoder +# ` or +# :class:`sklearn.preprocessing.OrdinalEncoder +# ` depending for which type of model we +# will use them (linear or non-linear model). To falicitate this preprocessing +# we will make two pipelines. +# You can skip this section if your data is ready to use and does +# not need preprocessing + + +from sklearn.compose import make_column_transformer +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import StandardScaler + + +cat_cols = X.columns[X.dtypes == 'O'] +num_cols = X.columns[X.dtypes == 'float64'] + +categories = [ + X[column].unique() for column in X[cat_cols]] + +for cat in categories: + cat[cat == None] = 'missing' # noqa + +cat_proc_nlin = make_pipeline( + SimpleImputer(missing_values=None, strategy='constant', + fill_value='missing'), + OrdinalEncoder(categories=categories) + ) + +num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean')) + +cat_proc_lin = make_pipeline( + SimpleImputer(missing_values=None, + strategy='constant', + fill_value='missing'), + OneHotEncoder(categories=categories) +) + +num_proc_lin = make_pipeline( + SimpleImputer(strategy='mean'), + StandardScaler() +) + +# transformation to use for non-linear estimators +processor_nlin = make_column_transformer( + (cat_proc_nlin, cat_cols), + (num_proc_nlin, num_cols), + remainder='passthrough') + +# transformation to use for linear estimators +processor_lin = make_column_transformer( + (cat_proc_lin, cat_cols), + (num_proc_lin, num_cols), + remainder='passthrough') ############################################################################### # Stack of predictors on a single data set ############################################################################### +# # It is sometimes tedious to find the model which will best perform on a given # dataset. Stacking provide an alternative by combining the outputs of several # learners, without the need to choose a model specifically. The performance of @@ -60,35 +148,79 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): # # Here, we combine 3 learners (linear and non-linear) and use a ridge regressor # to combine their outputs together. +# +# Note: although we will make new pipelines with the processors which we wrote +# in the previous section for the 3 learners, the final estimator RidgeCV() +# does not need preprocessing of the data as it will be fed with the already +# preprocessed output from the 3 learners. 
+ -from sklearn.ensemble import StackingRegressor -from sklearn.ensemble import RandomForestRegressor from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import StackingRegressor from sklearn.linear_model import LassoCV from sklearn.linear_model import RidgeCV -estimators = [ - ('Random Forest', RandomForestRegressor(random_state=42)), - ('Lasso', LassoCV()), - ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0)) -] -stacking_regressor = StackingRegressor( - estimators=estimators, final_estimator=RidgeCV() -) +lasso_pipeline = make_pipeline(processor_lin, + LassoCV()) + +rf_pipeline = make_pipeline(processor_nlin, + RandomForestRegressor(random_state=42)) +gradient_pipeline = make_pipeline( + processor_nlin, + HistGradientBoostingRegressor(random_state=0)) + +estimators = [('Random Forest', rf_pipeline), + ('Lasso', lasso_pipeline), + ('Gradient Boosting', gradient_pipeline)] + +stacking_regressor = StackingRegressor(estimators=estimators, + final_estimator=RidgeCV()) + + +############################################################################### +# Measure and plot the results ############################################################################### -# We used the Boston data set (prediction of house prices). We check the -# performance of each individual predictor as well as the stack of the +# +# Now we can use Ames Housing dataset to make the predictions. We check the +# performance of each individual predictor as well as of the stack of the # regressors. +# +# The function ``plot_regression_results`` is used to plot the predicted and +# true targets. + import time -import numpy as np -from sklearn.datasets import load_boston +import matplotlib.pyplot as plt from sklearn.model_selection import cross_validate, cross_val_predict -X, y = load_boston(return_X_y=True) + +def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): + """Scatter plot of the predicted vs true targets.""" + ax.plot([y_true.min(), y_true.max()], + [y_true.min(), y_true.max()], + '--r', linewidth=2) + ax.scatter(y_true, y_pred, alpha=0.2) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + ax.set_xlim([y_true.min(), y_true.max()]) + ax.set_ylim([y_true.min(), y_true.max()]) + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False, + edgecolor='none', linewidth=0) + ax.legend([extra], [scores], loc='upper left') + title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time) + ax.set_title(title) + fig, axs = plt.subplots(2, 2, figsize=(9, 7)) axs = np.ravel(axs) @@ -102,6 +234,7 @@ def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): elapsed_time = time.time() - start_time y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0) + plot_regression_results( ax, y, y_pred, name, From e6e5811809919cd7da1b9f97cb31097271a79dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 3 Mar 2020 15:56:27 +0100 Subject: [PATCH 425/448] [MRG] Update What's new for 0.22.2.post1 (#16610) --- doc/whats_new/v0.22.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 
deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 381594219d597..4f62f88be9071 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -4,10 +4,15 @@ .. _changes_0_22_2: -Version 0.22.2 -============== +Version 0.22.2.post1 +==================== + +**March 3 2020** -**February 28 2020** +The 0.22.2.post1 release includes a packaging fix for the source distribution +but the content of the packages is otherwise identical to the content of the +wheels with the 0.22.2 version (without the .post1 suffix). Both contain the +following changes. Changelog --------- @@ -33,7 +38,7 @@ Changelog :class:`metrics.PrecisionRecallDisplay` instead of the parameter `name`. It results in a different plot when calling :meth:`metrics.PrecisionRecallDisplay.plot` for the subsequent times. - :pr:`#16505` by :user:`Guillaume Lemaitre `. + :pr:`16505` by :user:`Guillaume Lemaitre `. :mod:`sklearn.neighbors` .............................. From 8910b14fa8630c4d9ef477a2d339fdf68eecd4b9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 3 Mar 2020 17:34:48 +0100 Subject: [PATCH 426/448] TST More stable test_uniform_grid (#16621) --- sklearn/manifold/tests/test_t_sne.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 15ce1fa6f2482..9486bbd4a96f5 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -772,11 +772,11 @@ def test_uniform_grid(method): we re-run t-SNE from the final point when the convergence is not good enough. """ - seeds = [0, 1, 2] + seeds = range(3) n_iter = 500 for seed in seeds: tsne = TSNE(n_components=2, init='random', random_state=seed, - perplexity=20, n_iter=n_iter, method=method) + perplexity=50, n_iter=n_iter, method=method) Y = tsne.fit_transform(X_2d_grid) try_name = "{}_{}".format(method, seed) From 9766acda9059b8623ae65fc1e2babd6ede9e9abc Mon Sep 17 00:00:00 2001 From: Evgeni Chasnovski Date: Tue, 3 Mar 2020 22:08:59 +0200 Subject: [PATCH 427/448] =?UTF-8?q?DOC=20Fix=20"more..."=20link=20in=20"Mo?= =?UTF-8?q?del=20selection"=20section=20of=20'index.=E2=80=A6=20(#16620)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/templates/index.html b/doc/templates/index.html index b2e02f94ba903..0f43677e668f5 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -115,7 +115,7 @@

    Machine Learning in grid search, cross validation, metrics, - and more...

    + and more...

    From bbcfad8c7651bfe6fa85828f075f2a096e0c79ed Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 3 Mar 2020 15:09:22 -0500 Subject: [PATCH 428/448] MNT Uses azure pipelines for scipy-dev (#16603) --- azure-pipelines.yml | 33 +++++++++++++++++++++++++++++++++ build_tools/azure/install.sh | 11 +++++++++++ build_tools/azure/posix-32.yml | 2 ++ build_tools/azure/posix.yml | 2 ++ build_tools/azure/windows.yml | 2 ++ 5 files changed, 50 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 025fc86ed206e..df504a4ab3bf7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,4 +1,12 @@ # Adapted from https://github.com/pandas-dev/pandas/blob/master/azure-pipelines.yml +schedules: +- cron: "30 2 * * *" + displayName: Run nightly build + branches: + include: + - master + always: true + jobs: - job: linting displayName: Linting @@ -21,7 +29,32 @@ jobs: ./build_tools/circle/linting.sh fi displayName: Run linting + - bash: | + if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[scipy-dev\] ]] || \ + [[ $BUILD_REASON == "Schedule" ]]; then + echo "##vso[task.setvariable variable=runScipyDev;isOutput=true]true" + else + echo "##vso[task.setvariable variable=runScipyDev;isOutput=true]false" + fi + name: gitCommitMessage + displayName: Determine to run scipy-dev +- template: build_tools/azure/posix.yml + parameters: + name: Linux_Nightly + vmImage: ubuntu-18.04 + dependsOn: [linting] + condition: eq(dependencies['linting']['outputs']['gitCommitMessage.runScipyDev'], 'true') + matrix: + pylatest_pip_scipy_dev: + DISTRIB: 'conda-pip-scipy-dev' + PYTHON_VERSION: '*' + CHECK_WARNINGS: 'true' + CHECK_PYTEST_SOFT_DEPENDENCY: 'true' + TEST_DOCSTRINGS: 'true' + # Tests that require large downloads over the networks are skipped in CI. + # Here we make sure, that they are still run on a regular basis. + SKLEARN_SKIP_NETWORK_TESTS: '0' # Will run all the time regardless of linting outcome. 
- template: build_tools/azure/posix.yml diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 9c83acfa02efd..0a3ca4e034efd 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -100,6 +100,17 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps +elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then + make_conda "python=$PYTHON_VERSION" + python -m pip install -U pip + python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist + echo "Installing numpy and scipy master wheels" + dev_url=https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com + pip install --pre --upgrade --timeout=60 -f $dev_url numpy scipy pandas cython + echo "Installing joblib master" + pip install https://github.com/joblib/joblib/archive/master.zip + echo "Installing pillow master" + pip install https://github.com/python-pillow/Pillow/archive/master.zip fi if [[ "$COVERAGE" == "true" ]]; then diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml index febc8acb3a1d3..d6ad049a2376d 100644 --- a/build_tools/azure/posix-32.yml +++ b/build_tools/azure/posix-32.yml @@ -3,10 +3,12 @@ parameters: vmImage: '' matrix: [] dependsOn: [] + condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 195e95051f1e1..9efb0418278d2 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -3,10 +3,12 @@ parameters: vmImage: '' matrix: [] dependsOn: [] + condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: diff --git a/build_tools/azure/windows.yml b/build_tools/azure/windows.yml index 24b542b227dd8..e449eb0f993d0 100644 --- a/build_tools/azure/windows.yml +++ b/build_tools/azure/windows.yml @@ -4,10 +4,12 @@ parameters: vmImage: '' matrix: [] dependsOn: [] + condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: From 0f0f29a6fb8b01d09dc3e1e28e5a11404e23cabb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 3 Mar 2020 15:14:15 -0500 Subject: [PATCH 429/448] MNT refactoring of sgd utilities (#16528) --- sklearn/linear_model/_sgd_fast.pyx | 215 +++---------------- sklearn/linear_model/_stochastic_gradient.py | 142 +++++------- 2 files changed, 83 insertions(+), 274 deletions(-) diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx index 545e3b6a99f1f..cc34400dbcfef 100644 --- a/sklearn/linear_model/_sgd_fast.pyx +++ b/sklearn/linear_model/_sgd_fast.pyx @@ -332,155 +332,39 @@ cdef class SquaredEpsilonInsensitive(Regression): return SquaredEpsilonInsensitive, (self.epsilon,) -def plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, - double intercept, - LossFunction loss, - int penalty_type, - double alpha, double C, - double l1_ratio, - SequentialDataset dataset, - np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask, - bint 
early_stopping, validation_score_cb, - int n_iter_no_change, - int max_iter, double tol, int fit_intercept, - int verbose, bint shuffle, np.uint32_t seed, - double weight_pos, double weight_neg, - int learning_rate, double eta0, - double power_t, - double t=1.0, - double intercept_decay=1.0): - """Plain SGD for generic loss functions and penalties. - - Parameters - ---------- - weights : ndarray[double, ndim=1] - The allocated coef_ vector. - intercept : double - The initial intercept. - loss : LossFunction - A concrete ``LossFunction`` object. - penalty_type : int - The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net. - alpha : float - The regularization parameter. - C : float - Maximum step size for passive aggressive. - l1_ratio : float - The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. - l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. - dataset : SequentialDataset - A concrete ``SequentialDataset`` object. - validation_mask : ndarray[unsigned char, ndim=1] - Equal to True on the validation set. - early_stopping : boolean - Whether to use a stopping criterion based on the validation set. - validation_score_cb : callable - A callable to compute a validation score given the current - coefficients and intercept values. - Used only if early_stopping is True. - n_iter_no_change : int - Number of iteration with no improvement to wait before stopping. - max_iter : int - The maximum number of iterations (epochs). - tol: double - The tolerance for the stopping criterion. - fit_intercept : int - Whether or not to fit the intercept (1 or 0). - verbose : int - Print verbose output; 0 for quite. - shuffle : boolean - Whether to shuffle the training data before each epoch. - weight_pos : float - The weight of the positive class. - weight_neg : float - The weight of the negative class. - seed : np.uint32_t - Seed of the pseudorandom number generator used to shuffle the data. - learning_rate : int - The learning rate: - (1) constant, eta = eta0 - (2) optimal, eta = 1.0/(alpha * t). - (3) inverse scaling, eta = eta0 / pow(t, power_t) - (4) adaptive decrease - (5) Passive Aggressive-I, eta = min(alpha, loss/norm(x)) - (6) Passive Aggressive-II, eta = 1.0 / (norm(x) + 0.5*alpha) - eta0 : double - The initial learning rate. - power_t : double - The exponent for inverse scaling learning rate. - t : double - Initial state of the learning rate. This value is equal to the - iteration count except when the learning rate is set to `optimal`. - Default: 1.0. - intercept_decay : double - The decay ratio of intercept, used in updating intercept. - - Returns - ------- - weights : array, shape=[n_features] - The fitted weight vector. - intercept : float - The fitted intercept term. - n_iter_ : int - The actual number of iter (epochs). 
- """ - standard_weights, standard_intercept,\ - _, _, n_iter_ = _plain_sgd(weights, - intercept, - None, - 0, - loss, - penalty_type, - alpha, C, - l1_ratio, - dataset, - validation_mask, - early_stopping, - validation_score_cb, - n_iter_no_change, - max_iter, tol, fit_intercept, - verbose, shuffle, seed, - weight_pos, weight_neg, - learning_rate, eta0, - power_t, - t, - intercept_decay, - 0) - return standard_weights, standard_intercept, n_iter_ - - -def average_sgd(np.ndarray[double, ndim=1, mode='c'] weights, - double intercept, - np.ndarray[double, ndim=1, mode='c'] average_weights, - double average_intercept, - LossFunction loss, - int penalty_type, - double alpha, double C, - double l1_ratio, - SequentialDataset dataset, - np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask, - bint early_stopping, validation_score_cb, - int n_iter_no_change, - int max_iter, double tol, int fit_intercept, - int verbose, bint shuffle, np.uint32_t seed, - double weight_pos, double weight_neg, - int learning_rate, double eta0, - double power_t, - double t=1.0, - double intercept_decay=1.0, - int average=1): - """Average SGD for generic loss functions and penalties. +def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, + double intercept, + np.ndarray[double, ndim=1, mode='c'] average_weights, + double average_intercept, + LossFunction loss, + int penalty_type, + double alpha, double C, + double l1_ratio, + SequentialDataset dataset, + np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask, + bint early_stopping, validation_score_cb, + int n_iter_no_change, + int max_iter, double tol, int fit_intercept, + int verbose, bint shuffle, np.uint32_t seed, + double weight_pos, double weight_neg, + int learning_rate, double eta0, + double power_t, + double t=1.0, + double intercept_decay=1.0, + int average=0): + """SGD for generic loss functions and penalties with optional averaging Parameters ---------- weights : ndarray[double, ndim=1] - The allocated coef_ vector. + The allocated vector of weights. intercept : double The initial intercept. average_weights : ndarray[double, ndim=1] - The average weights as computed for ASGD + The average weights as computed for ASGD. Should be None if average + is 0. average_intercept : double - The average intercept for ASGD + The average intercept for ASGD. Should be 0 if average is 0. loss : LossFunction A concrete ``LossFunction`` object. penalty_type : int @@ -549,55 +433,14 @@ def average_sgd(np.ndarray[double, ndim=1, mode='c'] weights, intercept : float The fitted intercept term. average_weights : array shape=[n_features] - The averaged weights across iterations + The averaged weights across iterations. Values are valid only if + average > 0. average_intercept : float - The averaged intercept across iterations + The averaged intercept across iterations. + Values are valid only if average > 0. n_iter_ : int The actual number of iter (epochs). 
""" - return _plain_sgd(weights, - intercept, - average_weights, - average_intercept, - loss, - penalty_type, - alpha, C, - l1_ratio, - dataset, - validation_mask, - early_stopping, - validation_score_cb, - n_iter_no_change, - max_iter, tol, fit_intercept, - verbose, shuffle, seed, - weight_pos, weight_neg, - learning_rate, eta0, - power_t, - t, - intercept_decay, - average) - - -def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, - double intercept, - np.ndarray[double, ndim=1, mode='c'] average_weights, - double average_intercept, - LossFunction loss, - int penalty_type, - double alpha, double C, - double l1_ratio, - SequentialDataset dataset, - np.ndarray[unsigned char, ndim=1, mode='c'] validation_mask, - bint early_stopping, validation_score_cb, - int n_iter_no_change, - int max_iter, double tol, int fit_intercept, - int verbose, bint shuffle, np.uint32_t seed, - double weight_pos, double weight_neg, - int learning_rate, double eta0, - double power_t, - double t=1.0, - double intercept_decay=1.0, - int average=0): # get the data information into easy vars cdef Py_ssize_t n_samples = dataset.n_samples diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index df2189b9400cc..04bdba8135620 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -22,7 +22,7 @@ from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit -from ._sgd_fast import plain_sgd, average_sgd +from ._sgd_fast import _plain_sgd from ..utils import compute_class_weight from ._sgd_fast import Hinge from ._sgd_fast import SquaredHinge @@ -422,39 +422,21 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, tol = est.tol if est.tol is not None else -np.inf - if not est.average: - result = plain_sgd(coef, intercept, est.loss_function_, - penalty_type, alpha, C, est.l1_ratio, - dataset, validation_mask, est.early_stopping, - validation_score_cb, int(est.n_iter_no_change), - max_iter, tol, int(est.fit_intercept), - int(est.verbose), int(est.shuffle), seed, - pos_weight, neg_weight, - learning_rate_type, est.eta0, - est.power_t, est.t_, intercept_decay) - - else: - standard_coef, standard_intercept, average_coef, average_intercept, \ - n_iter_ = average_sgd(coef, intercept, average_coef, - average_intercept, est.loss_function_, - penalty_type, alpha, C, est.l1_ratio, - dataset, validation_mask, est.early_stopping, - validation_score_cb, - int(est.n_iter_no_change), max_iter, tol, - int(est.fit_intercept), int(est.verbose), - int(est.shuffle), seed, pos_weight, - neg_weight, learning_rate_type, est.eta0, - est.power_t, est.t_, intercept_decay, - est.average) + coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd( + coef, intercept, average_coef, average_intercept, est.loss_function_, + penalty_type, alpha, C, est.l1_ratio, dataset, validation_mask, + est.early_stopping, validation_score_cb, int(est.n_iter_no_change), + max_iter, tol, int(est.fit_intercept), int(est.verbose), + int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, + est.eta0, est.power_t, est.t_, intercept_decay, est.average) + if est.average: if len(est.classes_) == 2: est._average_intercept[0] = average_intercept else: est._average_intercept[i] = average_intercept - result = standard_coef, standard_intercept, n_iter_ - - return result + return coef, intercept, n_iter_ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): @@ 
-1191,12 +1173,6 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, self.coef_ = None self.intercept_ = None - if self.average > 0: - self._standard_intercept = self.intercept_ - self._standard_coef = self.coef_ - self._average_coef = None - self._average_intercept = None - # Clear iteration count for multiple call to fit. self.t_ = 1.0 @@ -1300,66 +1276,56 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate, tol = self.tol if self.tol is not None else -np.inf + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = self.intercept_ + average_coef = None # Not used + average_intercept = [0] # Not used + + coef, intercept, average_coef, average_intercept, self.n_iter_ = \ + _plain_sgd(coef, + intercept[0], + average_coef, + average_intercept[0], + loss_function, + penalty_type, + alpha, C, + self.l1_ratio, + dataset, + validation_mask, self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + 1.0, 1.0, + learning_rate_type, + self.eta0, self.power_t, self.t_, + intercept_decay, self.average) + + self.t_ += self.n_iter_ * X.shape[0] + if self.average > 0: - self._standard_coef, self._standard_intercept, \ - self._average_coef, self._average_intercept, self.n_iter_ =\ - average_sgd(self._standard_coef, - self._standard_intercept[0], - self._average_coef, - self._average_intercept[0], - loss_function, - penalty_type, - alpha, C, - self.l1_ratio, - dataset, - validation_mask, self.early_stopping, - validation_score_cb, - int(self.n_iter_no_change), - max_iter, tol, - int(self.fit_intercept), - int(self.verbose), - int(self.shuffle), - seed, - 1.0, 1.0, - learning_rate_type, - self.eta0, self.power_t, self.t_, - intercept_decay, self.average) - - self._average_intercept = np.atleast_1d(self._average_intercept) - self._standard_intercept = np.atleast_1d(self._standard_intercept) - self.t_ += self.n_iter_ * X.shape[0] + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) if self.average <= self.t_ - 1.0: - self.coef_ = self._average_coef - self.intercept_ = self._average_intercept + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.intercept_ = np.atleast_1d(average_intercept) else: - self.coef_ = self._standard_coef - self.intercept_ = self._standard_intercept + self.coef_ = coef + self.intercept_ = np.atleast_1d(intercept) else: - self.coef_, self.intercept_, self.n_iter_ = \ - plain_sgd(self.coef_, - self.intercept_[0], - loss_function, - penalty_type, - alpha, C, - self.l1_ratio, - dataset, - validation_mask, self.early_stopping, - validation_score_cb, - int(self.n_iter_no_change), - max_iter, tol, - int(self.fit_intercept), - int(self.verbose), - int(self.shuffle), - seed, - 1.0, 1.0, - learning_rate_type, - self.eta0, self.power_t, self.t_, - intercept_decay) - - self.t_ += self.n_iter_ * X.shape[0] - self.intercept_ = np.atleast_1d(self.intercept_) + self.intercept_ = np.atleast_1d(intercept) class SGDRegressor(BaseSGDRegressor): From c573d80031ca9d563c259d5e05d33c9630ab2a7b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 3 Mar 2020 15:52:43 -0500 Subject: [PATCH 430/448] BLD Removes post from release on website (#16624) --- doc/conf.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 
deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e3ccf62c2166e..b7eb635b15f40 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -87,10 +87,14 @@ # # The short X.Y version. import sklearn -version = parse(sklearn.__version__).base_version -version = ".".join(version.split(".")[:2]) +parsed_version = parse(sklearn.__version__) +version = ".".join(parsed_version.base_version.split(".")[:2]) # The full version, including alpha/beta/rc tags. -release = sklearn.__version__ +# Removes post from release name +if parsed_version.is_postrelease: + release = parsed_version.base_version +else: + release = sklearn.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 77659049cc05856167829c240694a0e7378ef280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 3 Mar 2020 16:18:15 -0800 Subject: [PATCH 431/448] FIX overlapping titles in plot example --- examples/cluster/plot_agglomerative_clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py index 79d3841cecfdc..b5fbce9e49b0d 100644 --- a/examples/cluster/plot_agglomerative_clustering.py +++ b/examples/cluster/plot_agglomerative_clustering.py @@ -71,7 +71,7 @@ plt.axis('equal') plt.axis('off') - plt.subplots_adjust(bottom=0, top=.89, wspace=0, + plt.subplots_adjust(bottom=0, top=.83, wspace=0, left=0, right=1) plt.suptitle('n_cluster=%i, connectivity=%r' % (n_clusters, connectivity is not None), size=17) From 91d5ac882cda8fbdad8a65a8687bc8635dbfd4b9 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Wed, 4 Mar 2020 01:03:57 +0000 Subject: [PATCH 432/448] =?UTF-8?q?TST=20Fixes=20test=20so=20that=20whole?= =?UTF-8?q?=20test=20isn't=20skipped=20if=20pandas=20not=E2=80=A6=20(#1662?= =?UTF-8?q?7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sklearn/compose/tests/test_column_transformer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d26f6895427bb..ca1c185c91e06 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1040,7 +1040,7 @@ def test_column_transformer_no_estimators_set_params(): def test_column_transformer_callable_specifier(): - # assert that function gets the full array / dataframe + # assert that function gets the full array X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_res_first = np.array([[0, 1, 2]]).T @@ -1055,7 +1055,13 @@ def func(X): assert callable(ct.transformers[0][2]) assert ct.transformers_[0][2] == [0] + +def test_column_transformer_callable_specifier_dataframe(): + # assert that function gets the full dataframe pd = pytest.importorskip('pandas') + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_first = np.array([[0, 1, 2]]).T + X_df = pd.DataFrame(X_array, columns=['first', 'second']) def func(X): From dbd7937f7e63948a4f26e52b1c13757ed1825f25 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 4 Mar 2020 13:49:47 +0100 Subject: [PATCH 433/448] DOC improve diabetes dataset description (#16534) --- sklearn/datasets/descr/diabetes.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/descr/diabetes.rst b/sklearn/datasets/descr/diabetes.rst index 
f2adc8d192b6c..771b3e5fe282a 100644 --- a/sklearn/datasets/descr/diabetes.rst +++ b/sklearn/datasets/descr/diabetes.rst @@ -17,16 +17,16 @@ quantitative measure of disease progression one year after baseline. :Target: Column 11 is a quantitative measure of disease progression one year after baseline :Attribute Information: - - Age - - Sex - - Body mass index - - Average blood pressure - - S1 - - S2 - - S3 - - S4 - - S5 - - S6 + - age age in years + - sex + - bmi body mass index + - bp average blood pressure + - s1 tc, T-Cells (a type of white blood cells) + - s2 ldl, low-density lipoproteins + - s3 hdl, high-density lipoproteins + - s4 tch, thyroid stimulating hormone + - s5 ltg, lamotrigine + - s6 glu, blood sugar level Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1). From 60b8fb221b5ebb17a0454b36e1f1b4fc3e4c4ba7 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 4 Mar 2020 13:52:13 +0100 Subject: [PATCH 434/448] TST add test of fit attributes (#16286) --- sklearn/cluster/_mean_shift.py | 2 +- sklearn/covariance/_elliptic_envelope.py | 15 ++++ sklearn/cross_decomposition/_cca.py | 3 + sklearn/cross_decomposition/_pls.py | 3 + sklearn/decomposition/_dict_learning.py | 1 + sklearn/decomposition/_incremental_pca.py | 3 + sklearn/discriminant_analysis.py | 6 +- sklearn/ensemble/_gb.py | 15 ++++ sklearn/ensemble/_iforest.py | 3 + sklearn/isotonic.py | 3 + sklearn/linear_model/_ridge.py | 9 +- sklearn/linear_model/_stochastic_gradient.py | 5 +- sklearn/naive_bayes.py | 9 +- sklearn/tests/test_docstring_parameters.py | 94 ++++++++++++++++++-- sklearn/tests/test_naive_bayes.py | 2 +- sklearn/utils/_mocking.py | 1 - sklearn/utils/estimator_checks.py | 16 ++++ 17 files changed, 168 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index dc90967ebe5dc..32dd1d3ad4fe8 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -299,7 +299,7 @@ class MeanShift(ClusterMixin, BaseEstimator): cluster_centers_ : array, [n_clusters, n_features] Coordinates of cluster centers. - labels_ : + labels_ : array of shape (n_samples,) Labels of each point. n_iter_ : int diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 9b7c00efd53a5..801611943f350 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -67,6 +67,21 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. 
+ Examples -------- >>> import numpy as np diff --git a/sklearn/cross_decomposition/_cca.py b/sklearn/cross_decomposition/_cca.py index 80fa41bc44149..bd2e933339228 100644 --- a/sklearn/cross_decomposition/_cca.py +++ b/sklearn/cross_decomposition/_cca.py @@ -55,6 +55,9 @@ class CCA(_UnstableArchMixin, _PLS): y_rotations_ : array, [q, n_components] Y block to latents rotations. + coef_ : array of shape (p, q) + The coefficients of the linear model: ``Y = X coef_ + Err`` + n_iter_ : array-like Number of iterations of the NIPALS inner loop for each component. diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index af81ece6baf58..88951d18468d8 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -721,6 +721,9 @@ class PLSCanonical(_PLS): y_rotations_ : array, shape = [q, n_components] Y block to latents rotations. + coef_ : array of shape (p, q) + The coefficients of the linear model: ``Y = X coef_ + Err`` + n_iter_ : array-like Number of iterations of the NIPALS inner loop for each component. Not useful if the algorithm provided is "svd". diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 49b78a0916e7a..9b7ad28f9f235 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1447,6 +1447,7 @@ def fit(self, X, y=None): # some online fitting (partial_fit) self.inner_stats_ = (A, B) self.iter_offset_ = self.n_iter + self.random_state_ = random_state return self def partial_fit(self, X, y=None, iter_offset=None): diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 2a0d19d373dbb..ac535b58e7f5e 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -104,6 +104,9 @@ class IncrementalPCA(_BasePCA): The number of samples processed by the estimator. Will be reset on new calls to fit, but increments across ``partial_fit`` calls. + batch_size_ : int + Inferred batch size from ``batch_size``. + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index a954081b380cb..2bd3948f2e013 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -190,7 +190,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Intercept term. covariance_ : array-like of shape (n_features, n_features) - Covariance matrix (shared by all classes). + Covariance matrix (shared by all classes). Only available + `store_covariance` is True. explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. @@ -579,7 +580,8 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Attributes ---------- covariance_ : list of array-like of shape (n_features, n_features) - Covariance matrices of each class. + Covariance matrices of each class. Only available + `store_covariance` is True. means_ : array-like of shape (n_classes, n_features) Class means. diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 7e354cb720bbe..c9f0b69f57968 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1020,6 +1020,15 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_ : int + The number of data features. 
+ + n_classes_ : int + The number of classes. + + max_features_ : int + The inferred value of max_features. + Notes ----- The features are always randomly permuted at each split. Therefore, @@ -1513,6 +1522,12 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1) The collection of fitted sub-estimators. + n_features_ : int + The number of data features. + + max_features_ : int + The inferred value of max_features. + Notes ----- The features are always randomly permuted at each split. Therefore, diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 6aa4dac35a156..501f2425541e8 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -145,6 +145,9 @@ class IsolationForest(OutlierMixin, BaseBagging): is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. The diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 111f0512216f7..896044ae9cc6e 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -171,6 +171,9 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): f_ : function The stepwise interpolating function that covers the input domain ``X``. + increasing_ : bool + Inferred value for ``increasing``. + Notes ----- Ties are broken using the secondary method from Leeuw, 1977. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1a9cb661318e9..c40f641df4b5e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1705,10 +1705,11 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): ---------- cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional - Cross-validation values for each alpha (if ``store_cv_values=True``\ - and ``cv=None``). After ``fit()`` has been called, this attribute \ - will contain the mean squared errors (by default) or the values \ - of the ``{loss,score}_func`` function (if provided in the constructor). + Cross-validation values for each alpha (only available if \ + ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been \ + called, this attribute will contain the mean squared errors \ + (by default) or the values of the ``{loss,score}_func`` function \ + (if provided in the constructor). coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 04bdba8135620..bca1928ecf481 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1485,14 +1485,15 @@ class SGDRegressor(BaseSGDRegressor): The intercept term. average_coef_ : ndarray of shape (n_features,) - Averaged weights assigned to the features. + Averaged weights assigned to the features. Only available + if ``average=True``. .. deprecated:: 0.23 Attribute ``average_coef_`` was deprecated in version 0.23 and will be removed in 0.25. average_intercept_ : ndarray of shape (1,) - The averaged intercept term. + The averaged intercept term. Only available if ``average=True``. .. 
deprecated:: 0.23 Attribute ``average_intercept_`` was deprecated diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c23cb86644e1b..bcd9da1cb72fc 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1147,18 +1147,19 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): return super().partial_fit(X, y, classes, sample_weight=sample_weight) + def _more_tags(self): + return {'requires_positive_X': True} + def _check_X(self, X): X = check_array(X, dtype='int', accept_sparse=False, force_all_finite=True) - if np.any(X < 0): - raise ValueError("X must not contain negative values.") + check_non_negative(X, "CategoricalNB (input X)") return X def _check_X_y(self, X, y): X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, force_all_finite=True) - if np.any(X < 0): - raise ValueError("X must not contain negative values.") + check_non_negative(X, "CategoricalNB (input X)") return X, y def _init_counters(self, n_effective_classes, n_features): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index dc5cac756fd67..55af69ca6c10e 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -9,14 +9,20 @@ from pkgutil import walk_packages from inspect import signature +import numpy as np + import sklearn from sklearn.utils import IS_PYPY from sklearn.utils._testing import check_docstring_parameters from sklearn.utils._testing import _get_func_name from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import all_estimators +from sklearn.utils.estimator_checks import _safe_tags +from sklearn.utils.estimator_checks import _enforce_estimator_tags_y +from sklearn.utils.estimator_checks import _enforce_estimator_tags_x from sklearn.utils.deprecation import _is_deprecated from sklearn.externals._pep562 import Pep562 +from sklearn.datasets import make_classification import pytest @@ -161,13 +167,87 @@ def test_tabs(): % modname) -@pytest.mark.parametrize('name, Classifier', - all_estimators(type_filter='classifier')) -def test_classifier_docstring_attributes(name, Classifier): - docscrape = pytest.importorskip('numpydoc.docscrape') +@pytest.mark.parametrize('name, Estimator', + all_estimators()) +def test_fit_docstring_attributes(name, Estimator): + pytest.importorskip('numpydoc') from numpydoc import docscrape - doc = docscrape.ClassDoc(Classifier) + doc = docscrape.ClassDoc(Estimator) attributes = doc['Attributes'] - assert attributes - assert 'classes_' in [att.name for att in attributes] + + IGNORED = {'ClassifierChain', 'ColumnTransformer', 'CountVectorizer', + 'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection', + 'GridSearchCV', 'MultiOutputClassifier', 'MultiOutputRegressor', + 'NoSampleWeightWrapper', 'OneVsOneClassifier', + 'OneVsRestClassifier', 'OutputCodeClassifier', 'Pipeline', + 'RFE', 'RFECV', 'RandomizedSearchCV', 'RegressorChain', + 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', + 'SpectralBiclustering', 'StackingClassifier', + 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', + 'VotingRegressor'} + if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): + pytest.skip("Estimator cannot be fit easily to test fit attributes") + + est = Estimator() + + if Estimator.__name__ == 'SelectKBest': + est.k = 2 + + if Estimator.__name__ == 'DummyClassifier': + est.strategy = "stratified" + + X, y = make_classification(n_samples=20, n_features=3, + n_redundant=0, n_classes=2, + 
random_state=2) + + y = _enforce_estimator_tags_y(est, y) + X = _enforce_estimator_tags_x(est, X) + + if '1dlabels' in _safe_tags(est, 'X_types'): + est.fit(y) + elif '2dlabels' in _safe_tags(est, 'X_types'): + est.fit(np.c_[y, y]) + else: + est.fit(X, y) + + skipped_attributes = {'n_features_in_'} + + for attr in attributes: + if attr.name in skipped_attributes: + continue + desc = ' '.join(attr.desc).lower() + # As certain attributes are present "only" if a certain parameter is + # provided, this checks if the word "only" is present in the attribute + # description, and if not the attribute is required to be present. + if 'only ' not in desc: + assert hasattr(est, attr.name) + + IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', 'ElasticNet', + 'ElasticNetCV', 'GaussianProcessClassifier', + 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor', 'IsolationForest', + 'KNeighborsClassifier', 'KNeighborsRegressor', + 'KNeighborsTransformer', 'KernelCenterer', 'KernelDensity', + 'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC', + 'LatentDirichletAllocation', 'LocalOutlierFactor', 'MDS', + 'MiniBatchKMeans', 'MLPClassifier', 'MLPRegressor', + 'MultiTaskElasticNet', 'MultiTaskElasticNetCV', + 'MultiTaskLasso', 'MultiTaskLassoCV', 'NearestNeighbors', + 'NuSVR', 'OAS', 'OneClassSVM', 'OrthogonalMatchingPursuit', + 'PLSCanonical', 'PLSRegression', 'PLSSVD', + 'PassiveAggressiveClassifier', 'Perceptron', 'RBFSampler', + 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'RandomTreesEmbedding', 'SVR', + 'SkewedChi2Sampler'} + if Estimator.__name__ in IGNORED: + pytest.xfail( + reason="Classifier has too many undocumented attributes.") + + fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') + and not k.startswith('_')] + fit_attr_names = [attr.name for attr in attributes] + undocumented_attrs = set(fit_attr).difference(fit_attr_names) + undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) + assert not undocumented_attrs,\ + "Undocumented attributes: {}".format(undocumented_attrs) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index b4470a7ed49e5..1f0f9347a188c 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -663,7 +663,7 @@ def test_categoricalnb(): # Check error is raised for X with negative entries X = np.array([[0, -1]]) y = np.array([1]) - error_msg = "X must not contain negative values." 
+ error_msg = "Negative values in data passed to CategoricalNB (input X)" assert_raise_message(ValueError, error_msg, clf.predict, X) assert_raise_message(ValueError, error_msg, clf.fit, X, y) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 07c7b9a70ca05..25b60f7955b99 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -65,7 +65,6 @@ class CheckingClassifier(ClassifierMixin, BaseEstimator): Attributes ---------- classes_ - """ def __init__(self, check_y=None, check_X=None, foo_param=0, expected_fit_params=None): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 21060b3462520..2cfb06c7994db 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2638,6 +2638,22 @@ def _enforce_estimator_tags_y(estimator, y): return y +def _enforce_estimator_tags_x(estimator, X): + # Estimators with a `_pairwise` tag only accept + # X of shape (`n_samples`, `n_samples`) + if hasattr(estimator, '_pairwise'): + X = X.dot(X.T) + # Estimators with `1darray` in `X_types` tag only accept + # X of shape (`n_samples`,) + if '1darray' in _safe_tags(estimator, 'X_types'): + X = X[:, 0] + # Estimators with a `requires_positive_X` tag only accept + # strictly positive data + if _safe_tags(estimator, 'requires_positive_X'): + X -= X.min() + return X + + @ignore_warnings(category=FutureWarning) def check_non_transformer_estimators_n_iter(name, estimator_orig): # Test that estimators that are not transformers with a parameter From 69ea066412abc0c4a4c59c03e532ffe26b96c9c1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 4 Mar 2020 14:08:24 +0100 Subject: [PATCH 435/448] ENH Minimal Generalized linear models implementation (L2 + lbfgs) (#14300) Co-authored-by: Christian Lorentzen Co-authored-by: Olivier Grisel Co-authored-by: Nicolas Hug --- doc/modules/classes.rst | 15 + .../poisson_gamma_tweedie_distributions.png | Bin 0 -> 63830 bytes doc/modules/linear_model.rst | 149 ++++- doc/whats_new/v0.23.rst | 7 + ...plot_poisson_regression_non_normal_loss.py | 455 +++++++++++++ ...lot_tweedie_regression_insurance_claims.py | 596 +++++++++++++++++ sklearn/_loss/__init__.py | 0 sklearn/_loss/glm_distribution.py | 355 ++++++++++ sklearn/_loss/tests/__init__.py | 0 sklearn/_loss/tests/test_glm_distribution.py | 112 ++++ sklearn/linear_model/__init__.py | 8 +- sklearn/linear_model/_glm/__init__.py | 15 + sklearn/linear_model/_glm/glm.py | 615 ++++++++++++++++++ sklearn/linear_model/_glm/link.py | 110 ++++ sklearn/linear_model/_glm/tests/__init__.py | 1 + sklearn/linear_model/_glm/tests/test_glm.py | 431 ++++++++++++ sklearn/linear_model/_glm/tests/test_link.py | 45 ++ sklearn/linear_model/setup.py | 2 + sklearn/metrics/_regression.py | 56 +- sklearn/metrics/tests/test_regression.py | 8 +- sklearn/setup.py | 2 + 21 files changed, 2925 insertions(+), 57 deletions(-) create mode 100644 doc/modules/glm_data/poisson_gamma_tweedie_distributions.png create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py create mode 100644 sklearn/_loss/__init__.py create mode 100644 sklearn/_loss/glm_distribution.py create mode 100644 sklearn/_loss/tests/__init__.py create mode 100644 sklearn/_loss/tests/test_glm_distribution.py create mode 100644 sklearn/linear_model/_glm/__init__.py create mode 100644 sklearn/linear_model/_glm/glm.py create mode 100644 sklearn/linear_model/_glm/link.py create mode 100644 
sklearn/linear_model/_glm/tests/__init__.py create mode 100644 sklearn/linear_model/_glm/tests/test_glm.py create mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 752b41151fca0..c138f51f6c06f 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -837,6 +837,21 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. linear_model.RANSACRegressor linear_model.TheilSenRegressor +Generalized linear models (GLM) for regression +---------------------------------------------- + +These models allow for response variables to have error distributions other +than a normal distribution: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + linear_model.PoissonRegressor + linear_model.TweedieRegressor + linear_model.GammaRegressor + + Miscellaneous ------------- diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png new file mode 100644 index 0000000000000000000000000000000000000000..3b95b724a662389f0547e06049e65c729f1968e8 GIT binary patch literal 63830 zcmaHS2{@E*+rC}0Wyw;O?8$EIVoKKRMM$eT}h> zkuVrD|7Ys=e((1k|L^_gsDoKN_dVBrE$4Y&*Ar`Eq(guH(s>dR5_&yd%{wF{6jdZ7 zq_*d%fLGpHti1z%P(U^G%+3LikaJEkz;hZeT`MRF38Mq?KWQ_j_a!}_x=3Asuyc}- zTp`iZylEBy+nx&!fjMU%?G1T8H5&PBI3oN;;qlG;SKrjXxnpW?+Ic?i*;UdJ9h89HvPUXw{_8>Y$)uV50@FW_jaX(&{(l~_#iEfy z|9TSjj@dGm4WjS^0m}?knXCRS0#XGlRqW^YqvdM{MW+c6P>bT`aZywN(%=U^SE07( zZnYFsUkJqSsTaekrrA_EWE}P^w8I??x#tQQxLdUw(0leibBytKdsOH}s#8MHwZjQR zx}%IK$Bk*i4cS_$%qc3+fT6!Qf9lk-O*94SNJwysU8p_uSK@?=>s=r=BRLAQ6jDO6Zyg$P2svupXN?rk!MHX`2^4;=T`v4*>Yw8ln)mXvw31rBB-zr!R@*|flKVs;&HHP2nD?*rZ z;)-X6l_Mr#5P_Dh9U0-a6PB$FgdBeK>17(SPcl5oMdNg9JSY@MU%-{&zVL2{-#ci9 zOze&uXj9@>TBA$RxpuV`)rBWY%j-7q#{zH^ZlPeQ}qwpcPYyVJn?*&VVuH2HEb@ zGihmF%XmTs=mX=-{3W3 zNbr`zpgV$u?v{>2wf87X&;ae>i1xcJUDSj7hrEJFbmzJBSeIZahZi>7mD183p{g$6C4&j*BBm?hlVNEYP=s)p3u9r zRYTmg@TSJq-c94qA-yFskZLC)%pG$2Ot@_*Yl=GGB5p!t3UO_4n;lX=hq#Y$OV2#h z@v)Exsg59WF))q-Qm@Fk<>)TmV`EH7u zS=kG8*%Old&MGQ%N1>LNjM6(Te&Ptn0;%suFl0^ffr7gcc+r5J5qM>k7J+MKt{10d z(VPJweAbv^2#X_!Qy6>8f$@n}%3srP)ckoR)38z^#eUV$ntGf}=aH_Iyf zxT45)aC$w z60WHZB6w(!2ZoTtyi6Nb7jS+QbjJ_mDqpKOxGm<5?8_hAwzz3s=$Jw zJs}g7g^Lf4#6YT&okz=Royy9yYGGlD1HPQ(1Bmm8ZOHx#r#!Xq2*?!ml{#{WjM5Y} zu(nu-QL?a;s|Cy=jAP$nkw9`-nx4YkAY#utM*Ki6d}{}c6kF|}!qx$W;8+PXhb2T| z29a}qv!bhW*QG}s>RyF*1GNYQ7$;>AY)#VOxho;|6qsI*q%B#{sjW$oT*lNc!tDXs z%+6bAMLk$@``M7Go>Y7}#U+d)iI$)YfsT%Rtyj1bzOd+poLhp$e-4tX9T^N~4;NE1 zo_NRW=;`yjc`drvt5f;V0^;`_ahNgHFqmbJZ~kdgm00<;jF)G3BknzNmI)>j?g$Zh zcCzfZKS8I1u-j+->gL#nIBZc> z>d0kk#mixCzE>%2pX7qh=APT^;=UnZ^2ECb9yg5=2c3T6Zp&#=SUbRA9NL(YVe`ROZJj@BlI5DgXRa4W}G17#bJ34H;w?JGq{Cz6vw@M@ur(UiR#`Qbbh95 z@5Myhg@ya>l$g5rSk}ckcrs?tefQi$tNtGz>ObZ}3n^wAFN4#1PHg>J=m=pFG~;ay z;dx+iqiRaA=EL}N^F>9G{uDB9Wa#)0y;LZkdRSq|i zoRGy>`@{GKmd z1E)1O_IumrCYO}|{)r(LeMNuo-~>!6k~vl8CAPS@%%4J=;D?%xq)5~6kqdcvZ3yAF zL{MyBfo=VoS3#)^y3ym$*i*iSoSOu)FJG*$B!v-480qM8rMx;L>;N(R?2#5u`4bEHaE>|T?BsC>_epuN zZzZ{Hx81;J@^ligLhUOkx*~^R7%`vRkrpHo0{|MtOz?fFDlR*eB<(nFVu!X0rCmcP zwVhNVYQ4_Pt?eXy&xW!ixl$*1^R3MJ0Imh*gM~=3h};aTPf&CY&v?8@$|OHxn=B-o zz=roUeNMQ1mYkWr1e@Wi?eD^NjL^e5;i!l?uC|x2&7jDQ4?2~MIK6Sw-7dSB1nLTk z%{@EaWX7~phpJg+)2*~cW9F;~6B7j)OY}(ReoE*5R&dPHuze;d*m6yvQ+6Rnlpx=} ztRz#4AeE4DE}~wg4GIL~MJ>@Qrq12icu8s)uX&hMmGb8C4wm)}jHjMZJ-MUVR<9i3 z?x9#YcNw*N$oz`k96$CR*Cn&tn@UkXb=S?9+_*O|7jI!^#iQOA$icz0PcUIjh`WE- zb@MoS_>A-Wt>h0U@Nhogxz@wM#p{+F*9q4!Dq#jGLN~1>@dJy)#Roa@P}@G5e%qvL zEy_v@h6z)D78`*gL^FC^IllHjtmw!GJ4F-yw+msop&9>#W{MRNw7xWec5Uq;g(JyhclEytK8% 
zbaRUO+3I{IT%jetT-x$*OJ?`3fHYsJn}*4uli5^VXLAR&cqy*8bD_03{b*|feyFs% zv=T(i!Df!{dym_fAqYrD_HOmz=YnF-)NwAGvbaeo;o>$beNLR9AClhnIalQOcgHGl z#PH(&Z(Z1g_Hw21m*l+SiFrR{r~h3)c$2BK{SqVM$+1h7nl`+@FKn^Edno2}<*}Eg1oHYN=^n<-8K>{!uvxPMK~wl$#|9SY z>}AK594T-k;ZR)EjPxbvCoY z5|&UhGdOOOT=dW_1@Pz#lhgwp9QT+zR%%2iVgLqJeKsLQW$S}fW4G-*R$U;(fnyDJ zAT5Sisd|*P!9@MlrIrHT8hh9)CCQ%L2sGecr1m%5j?%y?CsQds_#d`j=xNMkqr zEyOQIWz?aLTDSssYG&b<=E<5Bj=fj8a~~R2;}W&dDe~+krE|t^+L1>HxAYf8|6mzc zxXyu|{X6cRcYEGA?k$0()o0GRUqot$Xwq6Z7DCZQOK=$PR%x#)GIC(gPB#2U zZM07Nm5)SGvxCSc9T@aT8Kr)1mNDeoqdE>(u(a%>$gBKnV)S7>t~UE}_!B`KRs&0k zdTeHSw&RgaZ2RO=c#>jJEoyJ-_bCio*ZDBKQ!&woE<|~KA9ocK zp{i2lAzwA3?B5uDrZaO%Wc&MS7ZlmC?zKPeJ(bfoKY2(qia7` zWvC}k+wN@n9E#TcUgz*Ibw=-=JO+XwoD;+zc_DX>+BAaCf@5?SS|d6Fy(%)*qKpkf zIdcu(@y?1Xv4Yv^Tb(Kg-+J+IV-ar9HCaa?1~dZBN6U<^|=WCsJEl`fDNc%dq(JhWT@0 zMg|X6`_6Y5@8NmEd&6_D^Mq5Yiz^_#?p#aRZ#ux-cMBbNfOs9W6Kz2YE!6IKU{fT!-bMFoKOL>=5gmCko1?`yZ*jY4?vgMSgehGr#^%w88g49NE6)B^7mo; z61EC!nM3e&*{vfe+v#WRA-lfFa+*{vUJzJ%N2pWwh{TTOq{@G0p^17f@#x1$I?8bw zHp4v|n;q}T@-F`9LLUSc6RVR$J|Wb8*L3YM>~@v$L>C>1wWU%zd~;h zKCiU+u&0+fr2&fMrt@geVMEoi!x^W!UB;sK9JfLlb2ZhURVwCO!9R!WsVDcrhMKMG zd)+B~`_6~7HJ%GcKkA-vw?hk$hb_T^ClNW_{^4QZFr~=K%qg90Xt=HysB}P3=nXs- zU;hGYcilKCsLlGOe>*P;Si1H#=Hj45%)9<+<)iEBAObxoiyC(sU(rAGY{$+X(&c#H zLFh9<(2dlHmI|4u-YNY1qw1q;ryQ7!%omvQ8vGALJNK+IdRMhZgvs{Ud60jH@^-6}5;E9q5HCT}v+~ptL0viepj-POPNZVmNH@3S2Pg1Hp z=-z?@(gK&yVDi|i=S80mRZ_5|^`xSnTg2P}S9vnk?Z)^YIZBm=d0BcUnt~MsGIbY- zi!#b~+kMI2GWCcwJ{!6acP-f;Y22WXiOLOEI^pt!#QHEYhUcoMHM_CngMRjQezp3v zSDw&CQ-&`a=;?mE@g}ZW9H;JUkv`V@m{T|^p$9mOLnW`Y%xrE+Y4F8U)SgH|<5|^U znGzDqd3kT}1?@9uZOU3YZCb@%M&nzjO6s7FE089s6=RRsd9C%q6}S>EVvS<&;!1L@ z4jahb^VKBHP(7=95~dGySTJXio6M=Wtr9|}dU7_YJ2CYQ&TG7#o#gZ|mI z_mu?pJ%2bp6|SJ^WUm(QS^kSMDM^)GlEBp-uUT-N<7ho_krtvL?B^)s%(&94kifmm z01}J8bc*uE!Ivg9+j8_rm4wx>^mtsm5>j2`=21%HGc-ypBAMFJ@%coS5t-xdZslCw zFO0H0YVtwk>U(CRGl$HqB@53kU$KKXWCW{RoqrwY)fkkB<9M zKu9_k+`WOHvJl%lFlv>_CsF4PsXUhzP2f$e7^3o_6W=#V3Y5~OPz=pWr(KdF;SOtz zn4ExkeRi#g`HgPDc8=`IfcFE-Cbym^dGvRTE<_$gHFRG}2p-!Cyr!!@Mq;6p{MI>a zZ6Dpk%OLfdMYTN*>Y*5R13TGVHsjrx`X+w&kTZ!79MDc}d66~RGfiv4S@~lDGXlEwJvxzqKRa=(`>t!GbO3eh^Odw=I~r`?rpv}5ZrS7>B2UBY-z3=b|j^NfP<;w%@l zB5#CtP{TOdosotY1%bn!{MKJrQLw}gU;$Zx<@+Y=)Yr3jEU^pgWF1&F$GFscuUUq} zi~}{x13G0~JXsvT-wWa9kV;86a3g3;9MOc_L10D(kJ5_(CPw+y7|lhRBU5{dS@Aey zX-Bszw%)Moh0B6D2(617bXR&2?VQ&a)?u{9x_2&Y@9W{VV-|bMjx#gWShrg!z@2ce z#U!Pbyligqecrwv(bRGn)RU@B3eRon2_d{BRCr=t69cg_mm8=DcxA!uD`0BHr~ZqT zw89?+@U}R`_JKu5h@T=J=S5$Y)o}DO21OM@TZejfQco8ix_3>@zl4XuO|f=iabZ9J zyT9ueB9&2P4u1>ryWHNcz6=W%-|O0XD1ujc$RD=-No)@_I}}X~_eZup2{ZU<{cNi+ z6w9Ny_lv#n>!CT0KKU)=5Q;lWkXr6#w;t|YNilmWnsaE9{NZ?U-K@|>xN4gQ`?%_8 z<@8nuNr*HgGgi~kHLfGLnU3%Rb1qCF0Tr37Qx0MQ&wOXY&8?~6Ui6n`xTljeEQyue z!^Q3eg_%MK}9NndSahXrbobvSNr!+*e>kJ3J;F`5p@XvlP2pu z^39$esE~T+Q^Cm|S9o#+3rj%qe6+Df-QsJP+}H+QTv;hV$D}&LtpL6pxpxge;O#Lg z+l0-uJ6e<~dl&WVb(Ev^1(6(#>yO^EEBKdRoI+fSwQ*W-#em?mdy%{&WlZ40(QK@W zoL|;l_dO2r(}Rp1WY5B)67%&OGDzJ?Un?|qndD(7AC zVR?r!oZQLX%ODFA^=XJPEh__l`pYzAZX%&FSyEt4MbJ|6={7*t7P=SgRajv&t`S=g z$2ngDWi`ACzm}7vx>mRExS)u0iwdVt@&#dDC~f6NRJy#|JJ|Kd^wmwe`?1ex=2a-> zr-k$4wZjC|y$^EXqHSlP5Aiu|c(vag8PSyC=ZKAol+U2r*L&(wk6{@Ch6< zYIb*obD{ct^9>Ku_9a+LOT(&`rsmfXjz9ei*+)t!+g`?J;kTL{Ij>AC?w%~XM&AH> z9xs{e;LOBJ-jsNK-##`5fY(4aj``<7_e4|=(BlAEKO=KT4&(w!7p_7cL+YoALU9z( z_@Mj{7xV4P$7W5zU_fH=LT28#>#M$F!_P14^Rbk2TpE{f-cHxa0hIMTm+JQrR#e`z z*Vyp7Kb)VHDW%iM7g&!=Yz%0s+%oG2QVodu!5S?1bF?#_7jqLe``FZ(kql7Dcy(`P z09TADkp%YX2rQ=Q^oX;}(LS3VCsVlH_x^#VrXW+PzNY3+OX5d31)q*^-%z|3*_=_M 
z@_rN%2ZI_`i)C1mnq0X1fNJ}4#sw|Smt{+Eqx9pE@1?{);F!9T9I?sFloFZhVrAZxhC5mj|jhHMKz=iZsX-)&?itHl>f>alP)B&l9@+VXb!@OfNlr<R)otRrmaWmHP9-*wtYiCHJx^k&=i&RK|Zew{hPXA`j$=bB>a)9rV`neYZBv-$wog6)W zvSn}CY0u56tyw@=HxMEA0;BG`x{75fxaZZ@g62MGDxr$Y%A98E-BV-(P;%>ph?QN5ym+3IvWf5hwCWg{zGltKzfDHjN6M(lI#+gOWo3g5 zX@~Eh={C2we|}35c=P5B-MfsFTP`jls{UxFi3%Gtb93hhPd%6a>czj2OhlImiS zc7OaY=o>h5N?9HS5W(L_iq$3&4LsJpg*$wbN!cB@HJQif=Zzd41y*_z^BWtL=9V;{ zx5%-vu|dqpXzdXhGTXCFuMZFX2IJ(<$*m?soJJr@jph1*rnU^!kMc`=F`o@snK*TC z6Vn7;#IxB8$(5hTH2{mR^P3bTe_7ElQMKNDzOa?VkljKR)r0$iPtTyZ7a;s)%@0U2 zDci5C8sdSJLlv9S9r%VKYSUw8;mz73&6n!F&idSK6H)qi1tZ6Ng}-=_eJU}h5ohXS z_`4JdD8IDqTtxZlr4g<~U}^Kjz&#dAtw8*p3Oz^rSlu)ez=)bo5e6KqES67*J=)N* zH+JZ7 z%@#^N=N~pkH*ZM_8^LrP9QdGXh^20JyKV1dA^kLahz*ZOZphQPytnx8sn&4=q9j~g z(@dfhK(0-72QCJrlq*YYmO$5y|7__)G|$5yd0OJOtr=sVEI^?rK5NMA0rXr;@Ppma zyT;DIX{Pi-ht$&ZVm{hsbjGZl?V7IGb7#M(0&$zh#uUK*I0w045M3G4C>^ku;8;#D zTKU(nUynRIlHRz_w{n6U2ht?;Ja_A7JXYShRyg2s#|GIdZ&wG>?>=~dP&=9(SqW-t zYBIZbPn&0TbyaWIdU)V%(ou(IVwCwCUb^K9HelmIK-P=^@_*qa#k^M+lEx2>{CGV+ zJk@k)zQxS>3kH3hX~Vw9f##jtSAxuLEz4G9Rj9Yt8Zm!NIz0!oP50l*iF-NoLJB-veA2GvMpGVo15Fr z%*-dSJTLFrgk5;sj4uk2+0}M5Kk!EME-?fZ=X_XDIl+=!Q6%n}W6d_lSxy+0qY zp^t!{II#n61$grJb&cM6Hpi`Y?KPOR%G^?n?)kCNQEAWlR!L{q@iK>=1kMzcv$OM? zw{Ph!>DX<2On)AY7=yEwp=%~KHiKP^V1x_!bc%GQ^0yTG<49k*$cvm`ZHTVV{MYs0 z|I!Fpx#Biv@SOiF@2XVvM7zAA`>Bzcjt(`jwP0)7GQg*x$E)c&IU*x3q(wxco12>@ zE62vhOs%Xky&eeYBsu;1+2=M{#kb_^bYb1$!cn0D{jwzg)JUvbJ$_$vy%fkMMaRuY@_Z}|68%CYDb-PHp^H)KE&Xl^LUw6nPv4eeW^Cq2qeE6Ot;ds6aJPVRJkouUD?^L9lo$n54>1#XAY0y=T(_V6 zBoX`;&fF&~FE39r8;Hslf6(}f{<1=q*OCUweQRrh0ilr>UO>b=2t62s1+Hbne8;V3 z!%lbP;Ax+bnT16y6lKIFWouw$#9(3LYaY&B zTJQyPg=u=8BJtFIa{Zh5bHJv$^?#EE$y@dlHp`h&GJU~Mqrj2*qaVx%ZAlawX7!Ag z+-=$@%xVO}zi}l=wUK~7eLcVS*21)11t1qcS`usz@ zA(&3b>0CN~0bvT7vK?s;;TKC*=Vk!grfgrh?N0*OJG1O6yY@vCX_fihIjx$W={na; z1^4^SU9SveAGQ(#;X|4nJj73+O2adpi45vMd_Qm~Qhqv73CSIWdnTPV47|=%ls7lu-mtT4|be2Ub zNBlJRAAreeZ~iAU_dXfbB<#c4u9VV#00KA_kkp9jl6WohDlH(J$LQK6!q&sl3uH*s z43yOByJ?ltWn;~nXMfY6_$--g77?ftKSsjwS^mbYP$xDG)cd+yD}XuI?*1(W=5LRI z(>+rB@uQ^WCqMr%brYqD3X%|JuU|mRcLVx~xb*QqOJD4w{We7NV}4^Y@VzJy!!h4X zN7aQp=9_@BKAQhr&ncQ+?~{Df!=Xos==MRqKkCA|dEK$*sW z;*z!ak{Vq^IWbU}Qqy+RnKmMqH2PV(4+}wR0dZm1U}(*PZf3U+jIXQmPpD!hS!VC!*TV{Ns){u znh087KbnR9TwVP!JIf_H(;5hZ^rT5xr_Rr7vDApw((j{& zTsB8b#O>N)%gYbZO{koLBdI9OQG@3~DOTr*5r6*?twCzo3{9B!09ad&qfD>Z-M)k&FB zV{-K?CHn(7kPS~i8*)3ydjo{(39Wa_FXI6(!?qn^Llp@E>AS9lvh?3<>yDm z#l=akUc7j*6@Ri7I$cOdmM?sgd#VU38};XNGh^3a>LdG87z%X99xeQh1g953IbIvg zn)v88_)Z=`aRR?u%gPdJYHG})g>PAzv42^93WQ+F_J@BglFTrqSxk|po4b1PG|gmV z+W(n~ii%}}$K1?)`*Pd1cLP!#*MEiltCqXB5o@`*xe&XUiLI>>kX(IzeQo-;Ha7vJ zYY-GHtE=hD$93<*XZY}cJKL0NiIGB2w-KqEJ?WBL6O}~({Lnw@cq?X6a%GT7~aKh5ogk7{wSED}rzfkD+IYn}R-)5NWYKe_>Bds8;!alcW-bUNpc&-6;>#LM3M z=d$li3NwLv(fQx%g}EOYi4eUtDR8X(Q+yw3fa#rQneGa;5KwR@0=emY49q9~nH6(~ z9Nq$n8JH6}U!#+T+8rC>wPbDF-6a6WxouB>C(|Td`b3lDJfr8|0ulYjmrt%%EOI8o zHIh9-iq!bs6f0^Ltu!!1?m?+ej=PvaH%qHzZyNW74Uj4L?{21){&*K_A~<4x@803` znDMZ{-OW@l+A>^J7U?6mo%gjPOy{kpHJF(=AC#7?cg1 z01V9ltrI`wnIdNQMzX&ykh&e6HP!Bz7XmS&d&ybTMkCWR@}f#>cE~izBe%PXK;XF% zr4TE1X2O z4dCNB>UK@}!VbQb(n6Iv^b)<$s_INrEoDz@kE{LT`=U;})mx3V`;D)iOBzpIAuAEw z@D>zx7#r}nqfoi+DiLuRzs|p}&+-M@oV&17D-7-;=rR@VQ)Z~hBlP7RHe5u4Ls#Su zG3-Q%QpqA9mE;6%ib}R(Gysy7Y8zfW>S9#16em=MACR554a zbMD?76?o4oi!f3De5GOI^{N>LE()4n}JL zaVsn?j@d`5?z-IF#sAo*?`DJU6;S z7pSP*Bj7L96-4Q&R==J{(P^QYam>+BOO1J3M95d~V4a!x4R)xlUv4i5GEE&GpC9o) zs!U2^KqQ!4)x7$q|2R5`4XwbDB=P#W1|LEkz2I||;K{h!EQoGi-6eT{ZJlXqSot$S zea~s@n6>6;5)imPRQA3rWwTt|)xZrSY{_iD8e6H-VJRar=cs{><=h2v0!P(1#JJbz zCUiq2FtGRT;Vn(UQzrwpJM#BIOmQ&ipFK2cHoAcw}^RQGvx^pvFc?Rl;~eA 
ziD-3rK7A>}zD0RaZkWvtqgfa=|*Qhy&xHvMFlY{xz{QrQp`HqJr#x9@Y`zI{6Ye~fCeLH%d( zp~x}yW+7roQ|ZU192G#GTi-AM43wYW);!?K(I1rR^vE0eV|4yaS}6M}n*)SK%I@yn zF3}>GhJ%Ac0TI?DT#}F5+YW|bbCrmN*v}}18w>Ng53p>NB*G! zK)4kX(+|kHTWIj1u<&IRHRG?W1>DO|mF@33aH~>t!qtLZ^pyny{v66X^mLxnDd
      nN|Ff`=yZ<#@^1{Zfx%o1R}ve5!j zmZa5(vY;zCD&xy<^E7(Vcy^CjZK*>J*{o+tDOJr}ZhH}@gndkGve z&xkS+ATp-XO0`PY7=(tZT~_o+xZd%rUu>$@g;8-nK%lO!j))OO0EFmH-ZJq0two?L ziO5U8r;SaCKZGbBC+PxX34i&Zu(@;l*jQTVA4YgGL&SAz!3v5CA<*<+lQ-vH@9pOj zS|^+VY)F(An%_uRR^70n0$Cz6`bc1pRF#3bp%^FWI;o3HOnR?3YaB@2=88&&v}6bW zjS$^T{K&YUb2uc*p(_pj<@ou@wf`_#IhrBOM&C++OFE@aPu~Z82pFOn{U%m`>>>MJ z?GpMf3dp5OM3IO;Rd_5viekLIM^1;I|E#2@h8gt$&?>}hK%QhP_g$+`fcsQ{c2k7u zAMh(j(*U1dnC$tmRxhU`03_zIoM+caejR!0LIr`hFoMa#HI9xJd4e`luDCV{N}f)Dr&XdvbmOq(EaqMTJ|H@zJ6Ua9s1>)+xqw4mRku z0fqdpH)uoH9>1uwB5#V1(dA7LN4iACZ)!|VrOZbFCW-iK5{2qn#-HuZl|7xzDQF3+ z8VbwLsOTkejxY73GFY}599 zo2nL*{md zdiEeDYn=6S>oc>m^a17dr>U*&{K~KYEF7rs*ijYD!Y*{tJ`LvX1 z;TdP`aLumhCkuz}ZW^f$zn%LKBy8$q8Bi-WMv8QD?1Fdd9(Z?ORP%6|RRSv=!IYtm zNAtm%vd-sQ{C1WLvpkk(p+n9) zpjT4{=J*UJZbkN=c)2Ij`U+_ONbF?0EiABdj@oFxtp86U#PfRpA6o2MVc@ghBSftU zkfyKWFkK;swYfXF@14J0GJVsVm3@=!6W(zBQ53B8HduOor?czu@^EdZ4oEUU13*_L^qPOoiMVs z1XT@U;^4Flum3OAhT7iS_5Q&X5jKribNNWT6BSeS&U##z{f$^+7$ zuuHZtTf;(_Dm^_tA{X4z!C+2oz_qt5R^arwdQ~R#y^)%NwSO{$>^Xx=J1mx2ax5Qo zwTl5v0>onL!ML?SmO^65-J+)>hKk;Ez7s&r^7kT~9f`CJ?#whm$ILCN{da8vMZ8NH z1?$=$0uN=#IP(8bRYNWK&}$`8@s73i0Dx}*O+#c}p{j*0R3JWt%+=LH-X+2lP!>{e z{yBI~3ZgDwTV~Ue1{6E!{-C12XfpSx8YiPV*1mE;+^Xg|(1!C7wXU*{Y^H-mGTkI| z6<`U81eCWFy8OUqVDNz|QV)3X6t4}aM8tH-^{LKvLe_oyvrjail^~#-)+N~se+l)x zX$`$30Hi9rKdFk2%rTnX1eSZND9ZhGEYZa2>ZkJZj};Y0G~#+COj^ezZcliqwDRG; z8fTXuC*20x!2j4SgL5n&_`Dzs(DhmdMsP+;HW;*Oz7;#PQ;y?%6gZ%9Qf5RE;mY4- zm|yqDlTH>8VYgt3TZU((@B5NOWHqqkWFY;fD*v^DpzzCo(z|9VsB}`K3KrGFBy#+v z*T*OWThx#nr;4<0Az{@0h#eco~beSK0BYOV7wpNaDU47$O<=epj;(+nuq zV-0J!tX_RB4HZgcijDc5J8OnUAlgYnE*4XaliB$o+a|ky2}JIa@KW|iNWjy9yRaC2 z%32r{_SbGYPIN*_`}e7W{DBf5@xd(lw^pE@x)5a(*W<@7Gws?&r5rivSDM3;Qc{c{ z$DtfGz(J@$`u@5^MR0g}`Zj>EKbWk|viKpZU50oN39YRN90RP+S5GOjuJQxrwW7}U zh7hu?uSW&SvcP~ydJRQdrVY(3)dSjAk`S7JVML)4%CFZ-oj2h8Bz*|oB&iuWHI>SE zG`{}r<*=jk*^<$`gfmW$<8`?Tn2r4RmWxa&Jaiv}3Mvr^v8&y{G-Hh60X=YVBtWbs z@Pcyyr>_m%U-OA(S%T%4l*H!c<=uR3wj~AW!S1c}_4VobUp=LITP3P% zdyL9vz_0pmDS7_lNV;q)_q~OkDFsa*8}!rCkV`GWSC!$hutv&NfPTPyhV=kZ=b+sy z4*0Q$01bJ~2cn!G=znZ&ZA~z+rIc-!RGudLbkQhhjdtvXiKeV3pdoZ-$6LWAmSj;6 ziZaiblmInLovw@mxS2PZBshXPf`Ecuu8O>=Fs7)PYT>|*tZr&I1)M2Gj`Q}xw`vuY zS1mxe#iqS}4JuB(23Y2EL&IH5%M5RtGK07DzIsztWLF+VnsW+P9hODcL%xX_I6Cq3 z_ez>eRP{%4y_YO(Wl(C$@--rvr^Q|NlEE-6_TJ#}y1V@aa*u>n^CCTUxW9O2i|>jK z>jl|n!DTr|J^0XbX?CXYq|5T)5bFuzFxDx3l6i5 z{yk5_<6k3wu{I%NN+~3K&tDmJPy1~+JC+(X3sc%^C1oXh{l#Aq;l_k9N6mVBr`eH- zCrN}aMp9pulr*|F|JJwXmgfsEjzuiCzzA6~>L$)F`&(W?LDqBrdIS@)>DES)h-pc& z&a;X1@5a7*cJ9h#4Q;9Vk8@ULTo{_OPWe+;iSpKzX?klI9NV`5RjT;5cfiTrx;1Fc z3H>QQZWfwWo=u=fi{)xVP!C)Y zzZw>)o#-wzxB>dsg7M?WkMDHa(uGvuFQlIZ2u05%_-89e#TzGEgPmdj7QVieJz=O^ zK~}81+*Uhm3qC4TecpasOO5QzGX2H^euzP_0XicVzHE}^jSQA7`21-7EF7|h((f%_ zK~mjn#=vVCc9_q>&zq^Bx|=HcLrB@JCP_V?xww>eem56fQH)L{Tj+y$-V%Fb{1O)ONST7VzCc+8$f&PNWvWQLkP`V>YfZ?e{gpT6 z6!Xe(hlBN+%oWSJ1CkD#w(Ox?e~ki~s+gF9d6P1o6aHZ`*9Ib$IFkS}VU5EvG1=SQ z`c!G$V-PuQ0o?8V)hAknf7qpDTSH0ECgeUPKwIXERlhWI-+Gb<@J!CUBtJfY%xh|< zvY@pak94IqEmk-1sEWGBaX`%qzZy9fzw{!ol`wjnB&;PU7${jxDQ;fNmbvih=E{`% zB&2N`8xxcA6Cb6>u?q3vM123K9U7~$cg z+wbda*qMO=qLdw9J=r8Ni2`8gqMo6*J~VNIK}XZc#6&GML#B(Cq!`Tg8GgzCC7}sg zNA;9Fg|B=Ds5U6T3uR zfdAIZ&D3_97vfK@_z!9VLlXK2y1#m;Xv*fIqpU{7j9!>X%AsFG%?xv7#(b-e&4SC; zYGL~w)G<1XcW`vvuOn+CvoPdvLPMsS14QJb;Iv#H4QKk`@A9g!mfJ)+%lx7F=j5OM zwO^^u3B=$rA4v-%wavNx^A6i)gj1yaSgOPUmqZlWWqx^R%fGhrgS@O#UQCRSzHgsQ4AAfc&}wQMP|rJmuvtpz(Dn{K zpH2@YbKTS#X8mW&Pmwlnzx4z6i`fTuXv^suR&XFcujpXxugP032u4NqUu|@4Gx^CS^ah#K?oH%*5Ny(or@6(nkvw z^C@dU=jJ8Vm{6#{T;y29$om%l)Q@J#f}U>QUw93rq~#yz{5E)AdRNmaX7e%Imp%9= z$YG}$C1^4zou{pENV-EB=L 
z-9B_s0b|b_phxe{ophGyesf9m)m;(^YKH~u#|cK71-n7F?b|M6`7o}a)6C4b1y>7i zesZ%^-v!3MzuTkv`Dyy+n+x(2XLk*kLo3ff%E=PXCBhA#902|ABwj0uN5TLwd@^O7 z{M$6lBgV8l@CMMwZaOvzd`GYWIQq8>WA0=FfeYkF5+h1mnOZGKz4S>+jV^U8a5mJ7 zDe|h1cGnGnt~`H~^}Pw9Qw4_|!#AHtnS+i=vE5o2xX9x7isy$=M;08x*W|Weo=Qa-4jnefvh*^L+y7z9 zYy;ClKRFU;Ap8kFe9!Y~%4}Re)UFU}{TKGk$57Hoc zGuV9G&%d+yKn(BJJbLGyQ3jojDf~Us+y>}VSzaB7 zDARe-K69UbB2avOVCRH75NwZ4&e!>EKL5o7W)l%s^kI^-GvSXWowSd+7b6x$pq)sJ zw4CXgUyAz{aw=Ml$4`R2F(O=X*X{bD?v0)3t<59Wgl5`%3>yz+$A!d!g z`Oie z{8Or&?q=mtL1Xt@V`zgk^b~;}F?1MV9 z??q_v)#gV_HkL4a0l8c5lbjp{pVu2)q1o|!!yZ*$@ppmfB(qcS#ZhsJgTEBjX!1M> zd4*`%6=<*Mqu+#=YXr)DSX_yf^y-ATwFn9C1Wpo@U@(`{M;`LgSy{ei1#3l`CL5eh)w3jopZ5m=8+ zwrHH~NH0163c3gK1j@rtMjyqFXiRjndc`ccwpX`3 z7f%2C61I&KbaZIZz4H5ij?RPF!W4l`z)9ZeCX^HcxDYJz%Bc-iDKeDU!u3>2(s1Bn zDADJhlzhWt+o(U*s{+5_+Kw7|TK&!u!35@i*N{tqOP2T`e`^G3<%R^ShdORg?Q&B{ z=J!iSS<>DZRyDM(M=D(65@3*RK)fO_}5=S09Nue57oI8>jr6a=y>w zy_mAGYZr*#`43#ZBx%$yM9}CYcqt>TLYlAviDEHv^QUtt>J*Z?hUhQTET~3Z6`1UMl_Z1t#}y{77hNxpEhSU7 zZ?-i0Br*Sg82!E%JJvA0X=h2!lzvjaLQWE+3A7^_#wq_yr^BP&hEPp=ciSL~8H6tU zeTLAH_sx~>1o3yDtIbK2h(%*Wka6EMNPm-!>B)LA z?mHn}n^=|Uv!S%ce^Cbs#MA%Y+Th%GPyS$m=mkd9UCtce?>!v`BE@3#f3H|f=I7*$ zMn6W>5K#OXJPYkG`)B2LsXS#7QwLl9ABunOF9XWjl>U~9mqh(RV(6psH~G6vfOkXyENK@< zp#%Jrc_JPsKD>S8mp{5h2pVg3$GXW$$$BB~g#i&+GGzoQJy~598)Oz-7V_g?z}q_u zQ>ri*BMC@pc#3H42^(g^%u!^nN33EuRNQ9q4EPn0`9HAB1JP|n&Ltj_!eQS&@%MlD z^X}ktcM=S}@A1YJl&WU)>mM*=smLz=Qp}Pn{3%t4AytU31-JDFH+)Qu^3mG`_ZPd< z|G<9OW(-I2{qN`c9qGO$dBOjZ+Yvndu@U7>wisc9OU7@({O$$jw-kY9Z6-+SQiT_B ziU+>h`bg0bo3weVw0ZacD23EmO4Z#FlogeJi zd%G$?j;LN1hgc^bUD~OypiPK}(!PH5ANETy>}AqM`LeWkoDi4@AhvfLVzSBNu3`$; z!}U{=V3FiTwGYr(<@(~FqkQ5AsXPAh8bt=4Vm_+@L)`~*!gnn>zvYklyo2nd_Y~Wa zF*&M4nV?web|se=;aX$Y-R3Vg7G*VfibY_fNcaP|09=SMuayP$a9F877DO-PZv)ZW zH%L==NZ#asZ6(k6_VF-RIpw$~ebMHm!tgHC4Y@g;aP7WszC9oJiG=R;5hxZz zH-Dn;;^yL&N8-kDJ~!lMp{Z`8{!GG3Axx-rJh3=dZ(8J>7FlxpDsSO@{BsSs;1L1t?1 z!WKVv+>*dK=Z8ciFVu6@J9tb=Z5O_`7i0X(EZxB;%d5kW0 zwdedX?kxgWjs~*xNA_3E0EQZXYsO!kKs7pK33P0KuJ%KCJG^A4~WK4|Q&?5Q;qH2sf2BTH=(jw? zj71l&7I%axCh`@uNaj$Qtd{n5Yp;uX6iu#*E}* zYXsQ9=wDWJE9h*m`U2B$zl-{&P?vd6bsQv%T@K+sJvt?$Y4`v>%1%pu=q7`vx4adb ztj6(${Z`?m&_;ybXTf z9pElgi}1y0zI>Ux)n!qB#Z~E+?0HipQiaNS%6WlDAVp4@gz6pK{ejnSH+8y$g1!X7Clex>_PT!Wd zvTjrq6he0*oU+7fX^TGo^c*=M+TM)@?q=X&O{jMy$Ai1A!&w``EE>c~#wS|rz! 
z@tem%b8nANJbd+XLmN zAGc~GM9~1DQ0s=?w{%`Ysh(F%; z-fSr-DEeg-=sfQ(6jzlksPs8C%{m1<=r$GUmhqoUJUwtZF_@20&OxKa`qlk0u}_ful}TF(5Y$1sqWXrAybJqU&|HftHm9ho=J zaLAKvAN6Rj^vD%J>L!ED#bYO-scT>;qkCbYNvSl4k_{z&!TMdDu)R|mPmgF{vjauo z)31aC$6@wah&N`*s)xI-nuDZnxzDN8cI(;>jR`mSl5{GbSdXw>0~`JD#9i0vE?hkH z6o1Q7i?%WBPP7$u;BMaUa*kyXXoT5fS#sSc6fkd%8BP(>KQi6Q7Q=>-R zKn$B3cMdH8Wi!t9vF%p@e~3Uviyv7u*lB|N`1G_uECO~u3BEkCf_8eZ)p4qdJL{vO z>)B2haUs&$9(Tz;j=4*1y^R|A?2+y)%CW~$Zxor!H+bjq5k4q}XlX*L8d7p=X`U2A z8{@eS*X*!bOlBMMH&?q;H|A(APdD#pKh7SZF20ZQbp)XG$o;UBY}O zcXWQ%(*cT19`_I_zVD2s6fk|u|26%rQypi*T#@fDv?)joQ+_hYO*RKl`Y~wr>^KY8 z5s1vVKQQ;^$?@Or5t8#&AgL2(t+__6I9zt&vT}AZY&!aEnyc5D7AEAPb+kofzB?`r z3+kBa*3tbua27mvK4S1J;TWY{C2Yv;7_<50+5A3jS`-5e?g>x*#7V4cMZs&CDm+VvGld80S)QV+$# zXJubJERPVVi6I?8jfBtp8em;eAoTTAnP==>_T-Nr-we#NNC*KM6wE3YNp#?1zkYWUnQsG+`yD-K5I;ld)b}e7Da&#fut1QKRZU{hq*kuOu>^1GtVvb0jJu(6mYh1BUn#sgz!7E`--V?%76*)kFu1%ikafn5E0 z=ifhW--V*VLln3ax0|Z*&-Qgi8RrBrm3)_K^!WZf)poJWD5u zR(qRtL5r;WCm2>;jWEUX{W3iCy_39~1$VfCEzud7*zn6nzUlr5e!xbz>Kh z9yb=K11rdJrV|x5<2PeuJ1tOIUhV6X=6XwSMF*g@$H(?9#_)2Ud)7kJp%)@+`eKOaI4F0ARVe5|Bl-&pKSkoS8}mDCyQCt2BJWurZOpDtL2try_mL@VUqs zJhflEyOk7@O{=)T^VgRe=v-l_oA(ZKFP|E`Ddi8HZHg- zHKt3`O!x-Jf{-w6k7$ZU#|_|r9lN`CbIL~&#qjM(FVjmIkt@%f{`XOl57*J!TZ^-C zBR|_NF1ysWew+kk7L^Ky9xi2!H%`JwKJM#)bN_LBrhO_ek3GFB93fzX?~}+; z;}802%wQ|~`qKSIseI<^O8{oN77i;82T{&hg#$y+#U63mj%l{LKnEl%a&T2vLhJ4h9NRK4 zNW{rQ{IpkOnB`r>(?i4Bnsy@Q@iPCGr{9@zbrk55^=FD15@4Af;o5Wh=UWf`X?-76 z`$1~;=ByJLFOQS23$WCI=)=)a@mRYg$Gu~vkW-rMtxLbAZ$F8YPNmSysJ5e_&2b8W zRFWGuQ$Q{|38>*OIUP!S+7Y&+S5Eg$BjYCL0iTZK8(`0*&&W{{dj>t%-_Vt7rUM=0lobybtnUsuOp8B^rOQ#$k<#3c?TgVwp z#I5T-?H>KS>q$Y!qzwXyVq7)j{8vyj@o`YqRVjz4Pbjy6BKEM4=MB)WTJV}o@HguX zc=}j4ZhfZaxY;TrxorjNP->fAXihAdsigq3~xuqKA#QSHyf! zj7rF8m^^gbhwZyh$SlQ?^#FH#v|tZ!m*P>C(omsu6z*c{^jhf~*cFBkd9|;`e>%R2 z+DuZBMB2zYisL?#T8eH+bAa>z`2T{9h+szcB;Z%H!{Z{f*fP9she9;+Lu;)gpyCv% zlJzU*5sN9wPxiH>SRI^-CxwaC1GT`npDA;Fw>~rM;)#8a3Ia>y=N5(^huNV$Xos9c z_d0K8G}Z6bxIF%VG=s1E1{NibuD6}_g8K^xdT~_EuZ#TPZ_~E!hdl1tPSn5yyDVN> z`9Sg`76P98YsJPyK0xOCqLtnkxTAJVpaK4S9uh8&_ckNqbrA?vNf)7coy?++E9ltJxaqt)mBd68~f z$5I%J|Eu0J*LM#!b6t(K!|*&F$6${>0cSCArTe<169>rg7D$3Sp7FmY*%U7Zi3AM} zNziA=1}IJ@@&C^(z=gqvl8<7$V0ohmA?-vvom{JUPr!&K& zTM5q57HGh|xe{Kl4T#^U4hQ~*xoGjlLof!;>hz@~KUL@03Iw6252yNp6=?yu!w!36 zh;I9Q>Y|K%ti2CZES}L(^sijtpa}NMj(P8D6d3w8iCtEamH*UUULZ8bHdrVl6OpD4 zw6NYo-D5|0R_<@hCqGv&&%WEio4ZdZ$p$5Ttyt^^vByjycQb)~84jtP6Itw8cSZLB zjUWiD-E1sAyFBIhUe|`V`zuYtyZFF5)ua*s0_`3-5I{5^+QWeUyWPz29H%mSwj8{< zEq{sU#ACA^pzD>3}ivZ#2?M&mFS7b4p?aIOBNdF!zaub)dgw^aCtdXMY0y( z8s^*fW46V9vvK zkv&3b@4QtG9niv`Sun;AZxmh8b!MDW{m1p+vzw{c2?1)19j8(y*i^p*6hVW1%JLT?F7G$%FEDtErWr!7IEqp= z-ae<>QOp(H8%%kJ9Exgozk24Vq-V;T)y7bTvi4mx&+9 zlX@ZLmWGg!wZq&56khGjXdKLWS#fGP)`AgyYDrPa&A7CGgEu94wrPu~H>lJ-jKQC> z0`M_80@v2M5Uvody^4k56R0W6Q&T(a{@~3#oY7*d)iNOtvO<^$6xo*fU?6HHy3+tJ zshNNEA}dB2zUqKFMVty#hbYSUHT@d+!-4niV}cWCzOwz+sbX^{Z%+@PjC=K8>PmRF zB-Sv}7tV<%ab-8^1fSz)U0UQkRUiA;fA#CXZO8O&JEsOe`!k~&`shXtwT?NSn@-vm ztF}q=J(+`wTH8YFoLfY{8hl+?#}%%BN8q;wWr7x(D__qTav^nR>$%_WuNn*caV&*4 zzc|SFrlTfIi1y(P;0c|yGR3bTuPJ0KOOkG4I0+ZkK*G@m10#7l_#*c)iUIt|$M7(e zHGMe*qS#1Q7m0+K3^TTlJ|csw6^b<-dWquJTh`J+uJz6h ze0-MZ2{;D~K5{5;!PQ#nsPlOS3hkq}FdATZvKg6i z%VSR^_I8`5T;*?IN9y$2L4_h_;225(l*%ELPEvw)^|94Jmh5n?fb-kS-WT-?G<;9d zlzcys>~_{zUqv+VGX{EUuREad&$QGZ(E22Ni*?-fmM4vVnVgLty=?YQkwGfZ>FNpp}o@H!a5Gz?INp3DD$kd%?SC9~O@`XRWhJ?!e} zaFzYaS|mx+>3!;rS%PdJ}Nw%{#J)Jo2=!t zeifdKn>xU2C}r$*j(G@BB7p7hh9_1|cT{|~ z?bg=heDo*+y+H5;4H3-;>1GL}(v+=r$rgUh(7*QQX53#$Sq@zEszsKEr}*4Lc4OlB z(8~+jae1mr@3!L|z%Pfof0O5UvBUa>z#34pSCbjj<<<#Z0R4k}`+b5H*?*;!Ij_xo 
zGDjuHd346<&9@AQ8A8+&pY);x-QnON6jyxydY`E$`qY|e?~W%P?sZ4nJbVJ~nA!Z6 zp1K6j!N<8f{5U_Ay(0C~B`8w$8wia&4UQ-gcmpB;_7q|P>+(t-(5K7y*aMoZh()8{ z@R`V}d2XLc%QdZu^8k-mQzaIIL@`WfySxr8E3H0R(j!B|+Hl_+%qim<3q^YeODW?c zwX%F}OCwXGWB1D42BrW@$K%smxaLxZyYNy?kRRDN!fO+rA`n0iA`{y#A&3;z_Ymn_*bzogtfCY~D(Djf!)%ZVVWGxQj-+4W+3VbAa$ z#L$-6bkh|3mNIQocs&&KW;VA619BO-ORyinbHJUKsg@YEWN2QW)B3T*7pz`BW@KUy zxe_Xr-$?f@+ARD><6-i!mG0{d&Y~tg6D~<^%6y_VVXCADU0I)KAjKH90{_`S3&Wm1 z58b$(`Q!aK|M9kfKICC$sdrd#Nk-G24h|MzlN5NY-I%rUzi&Jon~Rg>>xiwFBn@ps zPA@XEAyoPPyHn@zl9xe7>?CZV&lKD31>3{Qk7L43*%t}HV!~qXZ^cg3phZfdwznY2 zpAX0QT6S#r&ArMsd0EUw=MhQ6e2`Hh=pN`-?+{tHW9V?H(S>IX#*iyXZmxt-g?0h)@=p-{bn$%~I9bL^h5iJL=5<76QMA z5J4KRl`icd%hIsbkcMMZv=z4%9i1j5rxCfdq6I|!$>L96n!R+0pWb>h(?)n>v^IM! z!JD9jev{xsA(K!gHtRaME{yUYhn`qpYj@!Y1#Q(pRsu4j(7J93=P&P@SVJ}WybZU- zjMl32;vP|nS8#@?aitM9yukZS zE6*d2yp<;uN<`2K{pNXV0xOS2^>FFhH-vu(o=Law2?=41SI_LD95pfq56_nYmq)6C z6=g2bqX9=X8IGi$B@UZ}bhB6sy|FEo{5qUvs5~>ZX3kOTsi@DHcxtMC14Zxi#$6sR ztnMfP2cR|+;MYnIAs0`K{~pV!Qw)}%QusJ3n0_|RF3OhF#P2N?T-?_8evtwtmf~5( ziz0gIC7}M`$OY1RYPgN)M8+rNB(`lQsD#lDRL$9sO_2w%y1hDGC!Ldn^=iWU6*>5wtC!3h!Hr>7|S16X{16>UIGiF z<~S4p@S}2?+F`G}#0*!q{Nv+lYSWG5k^W>1=~B`d7CMh8CLXzcCpl`!Hsym`al6YY zK!UC(oJBjK@zBU(~k(zlC) z9%;Phz-}|{3sFKbgEWar+rYm|`n_~!8TzC8>1mhrRqi9}KO*F#HB$$!1hq#a0&F(X zL|*KU*%2uY!P10b=qt-+iHWX zPXw-p^F*KT=W0!*x8M1IS$Z0+-3i{g(Q7U8gCdR`{TRz>8m0;wMcAh8uf0s5eqSq(aRMF6fzf96Ngs z2{dzDu|x3_C7GC=fr*ta`6!@m1#6>F&C)=mt$;J@TW>AXrClTQ`la1$M?H#%)dyLz z(hY?AAjCtc1GTCw*@bAGLKHu!(5e*1&4VULWk-zSK5pKWd!Dkv(`@K=q>Gwb_@R`t zFLgt=if6^g7AI-~6MSM^4CA+8Rsv$&r$}Mq)UxjR2 zUmsBiHL)CV5DD2F+wutRDn|Q7Z2C?{?2BZ%DN_g%n&a?W){2rPjF9__T?4(Fgu6AkN|$br!;LC`+w<& zwQ0mNXg!Oiwv0_^79V)H0TC-l0|R!fB|dLjNvS4~KFlR+LhM#ikw*An`yRmQj>aX@ z4!3>pCVEvwbT9anaTi#OQtHbef;`z@=orEqqBTq6IbGWNQ3h?u8)*KGXCbV`A96z- z{NILcJqZn*jn$#P>}2UQ+mWp7q_cUlGkQG%|FXGr*qo>~b0rbRGpP}tN~M{WSZpuJ z2d=+0(q&r_I<^XyCB~q4)yi#PtgB1-;TZ?wZ8PPr@v&&2OJGZ^K5NpbnUOCCopATR4`-F}oo0^;$+16dq zCx`|vvYs!3ixAEb*S7i@FCKGu&Z=%JZf8#K`+vL1e%{nM z?9Gs@3S)mgTp>sc__yE`l$1bgb0NfyBXu&}@n{%+k*APao{_&Vr6|g7Z%l$dU#X@a zK+f{Yi@?jj2QrHqO(=TEMZt4#6Q_(IibvkteX42{MkU@;MT^I;myW52jV&n4iyXTg z-jg?+o(`v20=$<5;&Q}m%QZKcMCgOV0XEy6HhAfpPq`7=dnZk-4F~^EfBNx{V}CS_ zUgr}%F`eKDZ%V=>dvj>>IiJIzBkgbc%;Nm;%4C)S%fZ5$nRIP8sjPDRkGI#m0eUtl ztsw%PqYA>VeGr{3T?jXb)ScA+5+@Qeu(6XmbuE2Pph*p_)~c&E^gi;DJ4KMqf3z;I z>YF^rllzGgz@28QR!AZiYKg6O^~caN z@cAZLC8b+!OYf)QFr!D^qH5!I7AXD{cq`U-5ixZ?>Jk;$7wBf z-HKE~i_^cc4a<%M7NqXyJ@V4ZPVD(HnBni=sNZbA-}!y;P0YsZq_DQ2!ZHn)Zs^PLN4svRW^hE5Cx5gy8P(?)KrK zj(Z#Ioc4acGtHOJi2P?|b8#10Zf8h1`fT^T)GwXB#Sxi|*fS&O0avkT8gmz;51Nup zUu-Dc(We6UouRua=AY8gKX)(Nz26uU8jfT3F$MT9X){P4<6@FAA>8k8Hse3lG4Y$Y z7l|7+tgpD3+RYxo-n3kA^hROf;J`3kcXYkZC~zf=zTaZsdbdecF5t+Y;>Z!J<80GL zC#z(}-Vq)enHOgJW^H*b&^36~^18eCFre-qQB_BHaPV`6n2OFX0pX%|VdqybnF>Wz zbqnwgEl~YMW8(HI(<aQ&gKiGqVGsylta8jGd|ePJEd)EV0Po6QHMRs z*K!+E{3W%ek>%&hJHfzy>Yy(eIj)xbT1Sl(SuAx1_b2ph*iLOP2qlB3CQ&YzPIY3M z_JmQ^?%^%HH&9F^3v8sMS4^MMKS7wO(y^PHE0b^uIY}Y1(-tZhGCbp$+1VmLhxcwF=>^iPMASFcn+j3&>j~FQB_=tI{noh0rlQiyxy}~!{wW!3XKHM z^#uNnhSSAH3C6LCal(q9FfDIN^9I^N|8{;Juo#^CSr&IT*FO%2qGog9Z1+O2fy71*f_j%_BuHo>ihrTSLR%kU2)%g6+=Hi-+bW^qlk_be;=R4l@cK+tiPxda}(n)r6HMtNmSCu z48Ap+xqF5`^cfZXj5#gf>^Bdbb(?ap)wK4}_1+^9+nssH+4DU=`72@7FNkQRKGAh9 zZNt_gwua78|BjA-f(c3be#(diAgP9-eq7(F&Y&&hXaC4F&zao6M8!($j@i{2vSxVA zz0o|l!FT&G0ZqT0KHds7b4s+Dz)1d)w6zJV^aVDcL#W!8ZM=0n9Dz(k-+|oGWatwC zPQPgXp z>J zs|2+x4(z+op`r1?A-Ctq+tTxU80`InmcBVo?CfH`y=8OX$OjW&X+X@h8cB5&*tB$V znj#gHlQp4Y*U*?UGWp`g7O~$Y{+q`W-_ng+<;M@@_f(3PM=tKj>FKU0gn??>!~%xB zt2lTe9g~x_J`BQh-&@4={aUq;;G_zzg`n&>SE1?OC`ft18(Ab@D?HtxLC8bSLKFgj 
zj=1I}=R4^SZIw;7K6>#HzR-Yb{)zTCEoo;&4(V5us81yxe&-ts&X1dQQdONXO%%?A zZ|X-rba>+m+7P*?==Sma#;a5eQ*twV5_$BvX=`xmD;YZGCW>7fB72%oC2o$1`90j< zu~&2r({?>%>31k7mOYrSL_r$Rsf4`q%$b3(4ztb7D zJ#z%TR$l~{>%Eh4!;cqyLzf;1dC|dJ z?-)=U&^Vze<$a&DPUf9yHA(fmL=-YzsHO~Vp;4uZQS1vb(FhqXpZv=%u2xY^G7BMd zNQfaSTF9EIT-Ov^#C$RR2P~|@#k!18ou=QnTX@CY-Ep=HVt2LH#6CMDaovh~Lxf?Y zxAv~6gn<^_W?U4heoNr1rt`f@;}AvUA%3PkCeS)VDkMkXDk4i(g@fs5 zP-(YVwLIGbMBhR7#q#!W)zq{61^U&ku3cS^4|pxn>;*f-GO$?Gt&cH>ZTxbaMZczQ zo<+cU?3k{1Th2eK)>&mMw>R{4-%|65|fr3Igsw@ z^<8gIRq@<9ANlGtZSjjxx6it3zmX>MXC6TY{q4Lg(ErzRda;r@iw%FU@JH#1-dkp$ z=ZB;YHRP37lLO{lbb}vC5>X%hu#xl#ZtbSk+SU={WLF!Ul3_jzyUB+D5jbfPSBlR~ z4IW@0O9{7@MshQD(0T(B=}wfSn-R8ZxseO5TI7EF@bY+F=SUO<;2*9~+b>0vUu)mX zPX1nOO6VLFEO6apsSXW|ec*#dZnwP;x|K@~+-dhyCR^y(AtI${qE89LsXJ}@)V3kt z^$Zs173Semv_s?Li8(oEDJ-H<3l@1i?7u?OnrP%DGOaF5EF2waSDKT~hf}z(2k7uc zM9$JYA-$FIS6OX{VevH0gzCxucbLokt7~b#dF<$YYY2(3gSkiK(OSiAufzu_D1GPA ze|&8B6!Zg8zZTSDotsl_U5qRAKhMVN(Sy+|#yHfLtB$R%3Z*g+3c4 zyWT*Cdft(*T#EnCFFi;MOZ)OIeJTpCfNw{F$mj(xo&^prxg6+y4yt?!>FI5`2^ zz3DJtH^(z@Aa_(c|0tI!xC`#6-jE>Xb4Q)zN~PV2FnF%jh99HjjP)5h8)~r7q%H1i z<3)IWNDLyAY=4v#X%?z~RLZ8%tW+m8K6-|>YRil#gnc0;y$vS{IhmrpmCcBxrRgg@ zLmbC!!Y%*2^BahWV4Zo|(TY}Xp(@NL;%aqhD03iiw8_jVzu%7}NVYx@kr$q=%dKAdr~2{e;MzFU*hV;A<7r%OWdwwIU=xR|~4 z2i2V3j4$ey)UuRE^9M{$KRQhhxjcI!J3NNehcMP6e~81cDVzx#r?-K9+vHYt`OHJDRx0`+~^ehP6zLq`_;j z6vnibh1#7Lm)+#!()i%=TDcW9bp}8)qL{BS<|^r+zo?IwdzxvCG8)K@tW7&spj($a z3btr{^B+a^6jS_y`HA$?BKM;lL@DvVhy))wBQVverHF{XP~^oa53wsi3I;+JmS-Pm zo((9~Jv9YJBa4lVQ*Y0uBVTTyEvl}jJ%4rkuM{V}0cX{Rai#!hc=TFdNy20eD&XMn zUhC}AZn?BL#&?Em6Fj*=9aNJljD;y#wJZ}YD$Vy?;E+-+$^SO}!J@K~MyD!xg59#u z?VJ`^tJ`P9>M>uo3m<GVdop+grC%I#6+uDXKww#XVxnm5T9KvX62dHXE-$jzd zm1;}&hyFqy9m+N6smEziaB;t{_;Bj;VBFbMYx6+kde*Fq(d@G}jQa+lKF^eVNxIKAq3X_1NJ zwTUW;PWS%hG`2EsIgLF=7$zRZG0sRhDQhCYYyhc)G^$_(BqaOK+vVpiYlvs!8&fDg zC5G@|iYC=A_9N9aQd;9psRg7teIBlttivnSPYDD` zGtM@~N!=3v{OyV>)fGJh-||@V0%3f17T4Vy`*e$SD}x{PQvB~$hYKYqBO^xy&*rri z0`@`2a5}eigggiWF%$IKw`>hxdVt^*L3(mvupTeja@)2Qay$E{3pDZaw!WxC{5L`f z(RttH6t?!1q>u?bj!^`gx@Avl#M)v%+1Wi%57F-lsjaiLT zIFuTE1gHD^m8q$fZubf@vD2YpSNWak1vYxtNdS-QZ8+9MpqMY1(Hs9sWk5vN8c-VJXNEN9kRx@sSq_l4=OdgsybPOW_&WB3{lh zcifzyi>CBZKy`M$9OS&)e0OR%5LGOp^+{Khl#J{)F$&F72JmCRw@g5dxkRZ1IuS+S zExX%Jf(uO{LzcWkMl6yoTm8|a`g0w7n$wTox!5m%S4&B5wO9AhF7&1a4XxaWMw;Gy z?Qv$hzn=V4MEiIyeZJxz)@XPKk=WFW?yw>IM%#fIHtKqX4-v>()3jyh))RGn8K-B@{VQWkf4awXe>i$wjy`TgVZWSGs)V3^sdXkXww(QW zQ4h0Xu7uFMiRnotBb1bzOMWQsJ+Ivk2p=Er3w~jDA;eHr+UQfxv7noHE-h#H!3gNn zNeZa;Oz{7+ZfiQmLzwkJc}YTO@K~){*%kd*rN&$Kx7Gp)iB_j2IOZ1}ZCI3eQ88b` zC_cLH$!7npE*HuQ0+t*H^`TOH*SrCh^Gz{jr2-|j#Fy1Fb+Q+Hu$EY4*~D#@scl+r zA7tz`UH|-NGV<#lsiOmq@QNhXSsELLZ)5h`0ZdGqmFA$z;gmu-sceTA?+-U81*=at z9Y&p&y9%B?;T@)n>U<%n-TD>Qu@V%YcckfMHf_&*3|G#)Ai^UE2fGYO^G$t^^6kW7 zT$xLJxCD7e{6icB}gUgnn@;!F$;(4J)ni~6HqSrYMAjQKX2!gt0i zqA)WEM$rld%c;CjK#QTdZijz);ezxj;DNPnR-0kp2_s84iz7nvQP5+DWM*pVX6?a9 z7%GrI?)_?~$_**kemg&JYK>BpR>{evL5+NZ&b+F1My*XrQ+pE$`8G%S+Y#sW11?-u#^1k#Aa`--}=n z*Hc8K;pm1U7S5H-d;VoMYWHa7!7tL#oXXTaO3btAg;4N~C`WO?-1?WUF)fDcXWf9R=Ta z+7}XHcVdDDk^P5;D7Em|dukr-$_Ora;T z7<3V1naSE-&@PHQJx=`4K2z9F0m`CoI^Mz}I3Ha(W5@c!#HN~k3>dq-_u*QQZI>dL zHDo8S*SD7 zE}r&IxZ}Tte_0SeGe|%q*yS#l8qpx2X=%1gg+V5N!pPEJkk+YI>lU>~{VmQF0mdw8I_)<2DGZk$V`$RO}(;3%Vm4(|JY7`9^g;Y z;(k?nHLyKgaX5=M=!PncTeXFRf+sqX9K)*E8p^W3 zC5Mhq0^&ogZ^GxxeGP??~0^$Q(78UAU#P{Tt1NK)B?CsjTvQ^8>6K>euKUatA4g zIlQK*quDw~fhWG+m`Hp2J$zgTr1VSA7z5odvz;l_9||=(rrkI`;oWXiP*srViFyD3 z@$`-1ku_brgV?rhn-klZ*yhCcgcIAw#I|kQ6WjK;^IY#azxrqI?p?LktyN`It^w-K z>o4ecc6(6evlVzv6Hy{eB%VBD0JVHPc68+}RQ72?t)RY6gZ_eE(dGnQO${sxLgPy-RylTPDes 
zBWRhgyD?VoYT;bCB3oNz)5kv$o_Dbue@YhoU1=Pw@!_X=z8Bhh{?dPZ5fC>5N#Bev zOGOSMkliWf)53l_QHkzt{M5gf-el)Ly(-4M?7~D}?+uyI<_tsQPmuk~K?q1Jk3U|K z`O!1Q0#}Pc2?K~p#VR&zoS z8Z729YrOtO?`75XUqtTz-Z@g<^bmVd$5*}IlMtyZY;S{Sn{G9fFT{$=4vc{gkeqdP zrFss84--?Sa5%3h&h4-jxLqUd27|BIP;mJB18Y&!Mft$(B8~L2AMD^FupIVYR-S6y zMzs7HAECPRuOsKl>*961+=h-!&15G%iE(P&XQu9>f6=@-Y_v9e})B`M;) z{TL7Gy2O{x3g`^p`9o#2s5@PdFOPu~Mn?)Nq)+Jf@=o&9{?N3LM}DPL6w_`33%qa$ z6B9MK!>Cd8tl7sU1s>J%KIiLQF|^8eIENI@%6hK|U&ODsR!hk30+Tg=MltXh(%Y*& z7~ktX^k{z|fQ+TGbUR-gttHYFN(@2ru-kNPl{d<}oP=d7D<)8y*}nA^ytQ3I#WD_Z|vgV=IhF^^XyWzI@|s$6AvYA~7`hdakAwALsRX_aKn#}J1u{?R>JkjSZUv{mD`5@Wx*%P`jmGMt z8mlWRo+@5GY60WUKNx3}vurh8EkkEs8pQ~(v?8*9&F?O4&KEt-&ko|&*3(5raZjBe zh`hfL?34;)TMfyBW@b{^e&)jwL#fsOK8>~N;JAc+BL~!i^&p?x}jILbOE zWOWijFk$YOYZ`vc{Ar>GoaQR|wVxGz7LQdP>}(hzS?@f{C+zZxNk88P!b0ZQ>1@UB zrRak)Ga^rubbh&U7Z?|K{(Al1!Sh6FJirv>2D{_(v3XTp{9{FjG&@=%d1f5hHw#|u zr&anFc{PONJ2FuRY;maI4BK_IF-we4f}8*c@O`JW!R{+?^(o7*yjb@?h{QRPyd&iV z`MPcM@NMzk(|RF;r!gG?cmrHjE1sawf~D?mL-}PVHN02-?ZHX?Vz>NVMqZp0M9kEl zgeh-3NIak6o?V_>aD_C#b)&4#j@b5suL2%dbmJbEAV%+H&B4_gx-0h~U>>ysZiS;x ze`hL!Xa1J*i(q|D_$q5;UZ|MmQcbYf(Ow`>j#>B|qk}f-1=mZ-A-;*^BNmghTn=6SN6Xg1;K#LDa>CI$HXbrdcr_8|&kL&&sj%H{)Sp&xte-AxyXXz27 z+>w;z15ir&Qg=%y-1;Vi*Ty@R*0M@YFRE_)NCZ58OX(MGW8)z$t*Q(z)F{?cbI(%> zSiU!u(GIVeL4{C(3Mj9? zcUEl!b~k){tQ}b>@qMI?qB2Q3?$AJk;SJ|bEV~pDW4I>v&Z_K?@j|mUaJFw}67|(@+bZXu?D^3$^4>y*-^QA}KZ^`Ud3>GEp87U67z> zyk|hiZ2Bv&o93)DKHE;9ZFNgKb%{m5bgG))hQAWeHQ_z&`NnuDN!UnqZe$2Jk`OT_ z48filn5Qz8mrUSUylw{Hm>gH_@1rY>uX^I9%bDLy6Swz!v6tE><-HTuQD#zTq>Qr2@|sk4`YCD z%-37HlYouCrh9t{FfWD}trZs~%yHPwwaj!k2kU6nBuF@$X%Rc8%M@uW-GAp}J5A{! z>D#6Av+#>`>9Ti^>r+`Zg!Ad_9-)8y=yH?`Nsvl8+#U<;5oVEu<-hv~mFS4GpfSGm zHXD=4u>6V!s$J@ef~IFRhn@Lac@*sR;TMvD%4edYZ)-CdkEXQ5ZAk&96aDg*NL7%NQ{#a zB9fd&tdEBHB3iYzTxE*tI)SY$Sho=8da(X9*G_NH2&(d zYOZA4IJjONS91$flSof)`D7p8Ft?Q8~qI3;TAXA)}TQtBC$nV}%;5=$Do;-*8KWj&BXp!Ia!x<+GjV zCM;NtC@AuYDH7^Bc*f0c@q2p)6d&oqIe!rhB`AYtkirWzK7$LAQjCf5$+=3s3M}kdBrw`xtR6udNwtRKlr*25|3Wzf0 ze$|M~mYeNP9<|IO*9D;Q?xyS9VX3Cs*j^iZ6J{-YmZ^ds32~-bNSq3(Fsh3?qEL?n z2|7EWRSMMX2gu+)d|jzR5wX`W5|ht6J_gLt`kL9}-OuF4rw2|+Ye7m1x6*Y@m>bN2 zHsqV8c(+iX;_G|dI`2DYXp-+^JqSfhRv%8|{=r+HHZ=y|A~(?e@**P1d{)`k{2tq1 zsxaB%#)lBMJ%x7hguj*sEqPiqXZ`l{?>5WU46yNBJ`rAwa2dOOU*oH~>A2W`KDDYzq$RgJ(+Fn|q&B|U=V;?X_EKe| zbT~+zfgnZF+Mg6U*Lot-F`gZ*N6*mas=c0(^yd74E>e_Zv>D{cw7F+t&P?_z=&31` zc@;C7m_YIHAa31h?I_|spU&HTNBr@gBNVR_NpNQmQViYEllzmNg;-2)xd`50P$A$a zA(0|<^^M=$D2{Rp<7#&#Qt0o*f`V5ylVELH)XIO(C<{GZ;%HzxhVoLT`B5{?_p%^g zgFL^xu@l(im!*A~eNZB+6-$c_p=122ZO`;v$Kq&iemFZjT~v@KCpW03bpvWO|6?(& zauvgufgoJ`=;e@RoUzv7hxLWOgB|0M=>(BWmDu_2Z@>(?wYL|vu+}ckJqKPoo-P%Z zm(!o>Gp#3HnoQ;^&aqXyP}0eUN`M0VNcnIv8cgr6U_e?34?OtZeVmm+welMzcEc+v z*5TmmR5(uPEI2+v{ofT%Rh5~w3Qo)5GUK=TJ5QfykBNdT!`H|wp4nGS9h)2b@+`|& zWF}3`(?Xek=VO$x$#hc79Z`FiGeLA@p%s77e6I)GM6VYEUgxr6OcmPUA;e;J;I*eR zVnxvKwI%})5T*;_0&(n&4ZT@dhxWjyLyTAzCu+|?5f8kwaJo?)6d*#oCYQHdYFqG> z=z4b)aY=pG4FvYOTO*B}aFrclfmymwy=@|scxF4>3a(9ykXOfOlEty;nw>-Pl{ubH z*VNy3zN;fw2k|m`xEA$t!`xwMn3B`@YegDwcBtK-{QA0u_=5o+E% zi9>bY*n-I!a(42dZGyH9!YXz0Sg4S!6}juS7N?AN^R&5P0`9vb2^ET?%~-_C%c3>Z z38&Qv>zUqY6iN}t;>F`l3SF3)K?*RDm`R6>3SXwn(Hr?Td8%}x1WNn=&g!?_XqWb? 
zX@uoeAqeSzH4Y5KfP#|YM@P}4pb<_>i{K;2HZR6=!yIx5BP7q{xe>)>JGf7m9_|NYOyTDZ@#b}@F1UgjsUnj99)Ar!bX-9O=#LX~_^?LR; z*vJ^foqJrj#3qrENN>v(gEc$_?Rl%)dv^fIq&K{fHy@Iel-9mi_5`ilxov|-93mfaG=x(lQ#1j@-<5iijS^DR6+VGQrteQa zuL8q?(7P9_V(%SZLs(6EyzVdyd18dN@{m5dL{4Z5T_vK9V7LHjuBLUz{sK_f&=k3` zvITGzZfMfDR`<)Bm=hYP@V-iO(7dWIV=N4I)HK)->Wlh!X2}iEUe(j_?hVfC{7^p| zqq(^kDYuR*S*pysT)a2!c#b%9e|rg7>Ck*|-s7`rulBuUY!($;&!)0f3l3RXRb@QV zG$3acuuz2oWY50w;zw4Z=zWWmQKe>Z&nh-h$IOD@#mxg&X z|Kv$wC(QWHZRVW?mt7AN(VV{c{Cv5ojp%uEz#7zGu_~ZQ0Rr6o-Q!^XdN_$|(-{{&HF%EZg zcJ#aeh1~=_br@9<(Kgt7*Awe1+Al&HV8-g|7*1S{!^4NMaUb`|ND0Sq0w?E5J-xWa z#_M+te}J9_7cNT7pYa0eTP-udXMa0StHCnGFgqD6A`s(=@hfH`;`CRQvg(p-#u(C@ z5_WVY1_L;Vrfx;@*8X!E&Rb2u=zFjI^f`ihKZ##^B?)RPFJu_#t!e9JaOA_z_K2 zTc`hH;v))xv>2TZJjkiC|K=Mi4GOAC%#o?XX9mFsT{imxdQ1VsCt&mA%afmTDcCZNL`(~!`eUx8P~g(&8! zir9(MK9Tm8wpOp>vIEwSPA6}Fh4lH(J&%xr*4A1Zn(xFZ>l;Lmb^>y%;U(qdcG)CR za&Drb*`rzN0x0?YOKcXJqKoGo&6tzntihDZ=uLdayc2_I58RiXUjbF|}mvi!! zE%S0rRE-tK0Iqjnm_eU*2qk`RH=4zgEl?~fl`Vq<5rFu|?SH59^x?VSFUD3v>R%n- z$?1%={w=RhGWB?X*w%UE_4pfEcbBO%r}Mkw1OGLlj}HNW6qHb~B)6vXQ1E!efM*;Z ziUB01fSJ+K!g17MGN5l@iPKA(noLO()etcxrQE1gRszy^>%eJsqdZe}EqEv1AAJDa zc%^bEk9^5v*N9Q!2rJ9@igD-mCAq$dQtPhxqh~$?5R@}*?|6jTgRbw2E0+O@akKJu zqGQSHj%e-y&0jeTXeMg6rvmEUd~O~OvDj4++n2wfqVS6@AMHpiSQ7)l^?B_{_Rpw~ z^hJg9BeUbFPIfoj6v_751;&$`&sjdWQTj17Y0(zx6dkG3b1#40vfHfHMiC_CS*@Tj z>*^Zm=%f#iWgjq48tk7sUbsw^^U~mdD@K>TLna4L&UHIs*k^J8^MD(0T^0}K!(B?M ziaJwR4V{QEu|3tfA2{!y7=!bn_X%>rkws4X`rt_)6dP+VAe9tv6t>Xvq@EV7M`f0k zAY!MA)^9qoW*9KHffv~NV(nL|KpelJL$S7g^7dO6z=homKojQmsJ&O2wc{wE>W+b> zFhc6UTMR<{6Lod4r@Rm}z+5dw%4%e9ZHrXB5big`uYbcLsWCx#HQqQ)N=B5Nvc=?s zm%7zUecY>~;o20C`mLfiCK?W{6~UGM{~ZCZm=EM}UgXT>dv#Qmm0H^3 zV`HCcxXF@yot@pT)w1LSzNSn>VInQD7LWK)T%;Q8#5P&vdP;sDj zkkIv0oKAXRVJs0K53@5J#xiOBf7;vCaFGU=f8#`onoN}r=TYB1k4ZBn(Gq*CU$7Hb zfQ(t)Ej14H8${y@UTmJ=GbK(6@gKOKJMlrihm6Ohxpm&WJgz(~ z)P*Jl&CMVI`u`_pwf5v`{<=D}-TJ_KlPn~J)%(A zqCD3J8>4&Y2lUHR?A~4L!spQ8BAGkntaU9@;_COXypfm#4ZguTwArR~rxe_2?j*xkIKPVqkQsRMsoSw~ZY>-)cM?%ivLpTop`!=0@!?rp z;jE-(jWRJ)jt*sW5C?3}7F}#8P_aU4>gWLGg}kdr&h5SE^pI5e3KF8**xxXKH9WFa z8Qgc7c(!o+14MszU_Mo~B4IcE@)S<)1MY+@x|Td7{v)&dFFTQoL+@Si1*7Fcnf-*) z;4gX3OY%uPq0Pou%}+0C2pgE_6jdV7zJp1EOo$)dDg3l}InzdT=Paddnl{E5Ye zmg`dddxI(A;EOnT(&vDa+${d`13>t+f z-}$|)L}+~J=(Lx3te7o{7iw^NMEB!ra20#=l*^pg*=ihpFf)g}Tx zrdiiystr`El_u9ZyXwR3o#}xAMR#7Xv%8bRo*PC+GeZLk2&fiI{*<^lO3BgD)!U6B z=z>Oc#g||oIs^a+t}CnD&S&s!)*|XxtDknJkAypIoqCp&W8I7(%7@#mON`?QI3F^0 zg{)}G^d?Zptb91Q^bGp3!C9*@yTbZi`O$E-hcvZu@Ik+YvnDNGing+2{Ij#2v(*-j z9v;D#IZ{*_ek1lxr`4PyC5f)}{?2*Ao^3Waw9A3dSEjQd?4boTdnyB;HSc|#lz@fo zf_g7Y@56`h4JP%YwXce{^b5ZS5b=^^emRn@d!6E&@q3w&rF7B~zdXD^{A~Bw?BI;l zg7RsDAp9d>4bHn+Tq0GByasiLyJp6)`P-*AGnv<_tsy32k?k9SK+l?u1LnG`Bdn%| z8w3*Mv22FC%kl56fQMmt7!r?Ou&~~X5I6>mrmOB?j5~Lr*Ap=^mvRP(2;(6GJGGbT_aehs zu!Q(!ey?Wtsj|bAR;b#i)zsU#wy_&EW671162iunF~4<^tQ5;@#}hibGaH#KL zm6Iu8ch?B}S5H@M4v)AROZ9pK$jpb!Ri^DoHD7jRWtTc%k(}u}ekiD6aHcgTTiguv zMR)unJc35gJzm2czuZ!T^F%7{f8$mUn}Y?qJCquBbs_*9`$A&)Y`oL}VR^r1GPW09^_eDj8_<_{%6#wofj@eeyXz z>L8}!A2nc;3+-G)wP>>c=2E@|^tuMaxg!cFc=k|L0f7g{9wj#Guoe%Ox%F1S?FK@5 znPu~}&#S~Fdnh;knbc$W3d81?vK@sBhoz!|s&&PAXSHcvM;jD*&;AKLVE_xl*kblC zi>3Lt^KBWsHA#uGh$GEAvG&MR{g?e$+zc+JztaTIEe~EsCN+4~zr`3eRkf^G%s9lmKZcwRlqag{(Ohgqe z2FQ@kua)c!k=m27q;<((kcBE48uWflsq?!Lm` zzGpfs)3&YV z_CoQpaLdcp9~h0_upTPcZsr65VfM2J?xaPMms3RW8-5tco#FwVhMa{oekxv`5Ax~v zz3kc1!A6w#`5P7Ox>m__w#6!%%4&lPEqi=oZbWDf3scz4Ukat4eSaw&AtB?9O_fm4 zL>1JQv|n7+HDYhYzL>~hZKBqbx&ycBS%B+P>Z{;+Ig!(1Tc~m5&{megnmUQ&;5k!e zUOYu-Y=?{5t5%Zjq}k?_!5GgLHoaf+Lb!r1r)y;eIHveRTR1pnDf4kZ$`C#Y2l 
zoJm@D>3$2t0@_;nS_}jdjA*a+sMLt0nwJ|Z(A1f_48JIO7EO)hsKBbLKMiGM*ro7C zVdIaphN`h!>X{FO=RA(7Upt=>*BF^aP_`&jn3BA=^!rHv-EJEfg zI5wHU8lxIJUJ#9M=$GO+h#YQJ^GoCS#0u=zLPBU|d069qs%Rt$v`ENl)xL{IxI1PW+b~Hkmkl^`> z*#5RXH|Uk>Tw-J65-D)!A=an5#@Pz;$?R3lfvyf?aWv)w#(prnjtC&RZ<_V~(`Q=v zX1YQOh!dTaJBQ!=E$Ih6gE1?ZH`t}$!A z*1Kmb#>WOto}B8VNyIKL?BlFKFapc@)f|g|`$N8|l13Me??imJ{Y7<$!W0_*w7$n#+reyV>hgT%knpuRo&K#Qm& zRO2r21Kn0Ipy8v#mUlq?Y;3KcsMGKKLbcWK)51z~oUPgSnl$KMWnlD^&C*I&%H~i`#FdjLFsMHxs_nI_XnA+zZ z3ma2qf$=W^I{}b>a67Uy7<4AOUxYi~{>akz46XzNRpantI;)UPrt~&P)4;}ilS{LJ zlM_dyJrzz?_Asw9NzD3ac4lg`$?tst7Agv_-4egK^hCS99t8jg?QM8!%0NiV?Nah~NZpB@Jo) zriWkgiU4WTAcgSvbt_d(ZnOxT^{A!faF1Wh;HKnoh0yloN#XyH-k;>pyS(q9G=W7s zz^eb`e*~0)lx;|lPzsz1fA=iVo@w*{NF}ARo#dG5Sx?LjP>I?shj)z=b%+ul-ch8n z_g$F(Acvw1yjqN?L~(wz97yFkyXQ-AKy-@#j760Tup?a~|EhJQmV-VzT8S%F z|EccehT3ZA?7(%#QZy^OV{$amw~vf@#Eg{CX5&%0_g@@N7tx4`#@%r+SIh+lf3o1; z5F$`U3V|dDKxK_+)CGZ15%B$DJNzJ}ghGA54O3C1ExShc-@XmK5r5GZ6nwHTu4hgB z4sO6|1W^KahOb`Ng3R0;ABeodz~ioZe_G5 z+CU|Xj53}OmIjNF{hwiJ8^)CIZ5Xⅆ$u-kSc6_;XPr*1myn71D^qjJz?9$ja%mv z8S(?!gL|Y6o7;>HKw)TtTDjH)~;P>ma(|p`thpb3v|V>(6(QdArl` zvy^!|v{_9VQ|&%3@6Ky(z`z~jXUyt1MPr_JAp+t8LV0bCb<$n}@)TA7rP|&YP|#w1 z4iSJ@rM&AS#LD5}b;Z$$$+)z^LKnG0$Q_;~D8LiHR+wlTi~fU|37h@J;$kH6LCR8Z ztubrD%hG=^!RWkMiNUZ@J7G70L-TGu3JouPe&V6r zbY`xB5-`@|*;m@TFMfKP?#xF3=HjK>(!r;JaV0Z&<5*i%N(<9g7)_DzzldZ!DvIJS zf{-=sRtg?GX>(x^b40{^uQ%BJvlV^TIeeflL@zH?L}mfDLTFA?GLY1?Z?Gw#AgOEf z$QCZ>vF`%P@pIn!Oo3!7L7n|PgNqt8A4JoTfBR3fl*NA^h}Vjw)(T2Kp~nZ8YsF&mzl^I^}26>+VjC50Os=YKpBr%3sH>nhXM zq=UtiuI*goe}5-Y#9)A5;}Yr`Eu<*Ra_isCa)jfB!I>NXa*`9wA6RAlkzTS zVF{i5iQk|9fc_Ra1{KYViG@c0)IHjGJs40>;wzP$&cp8x71ZD`>!isJ^^_pSckD!bU?j^Sg4sXG-VYZ13QVf1+uAV$#J7R)&2fkFC= z)T7wL5$pQ9H#As+QhG~#8!?gu?`w9kjFNlveQuGZoW^ov$ut-hqYqKdTua*8j@a9aXTIsUl~j-CODuKVksUl-_&g%BVt=E@4U|zLd}6Um zhA1cR`bXdWx&bChlpR}|e8?RGIZjr^ZbDy5R|%ZbgW4c$qc63Kw!+!4bF?ovJK2tJ zFY$JcKv!Sa=e{crgrr!;S-*WB-mFdPsbZLYM8OoP z<_+SkvpF0*$S-U=HQQbM9r_JzAq7DSz&)IR>$EVU0<3Tu=skNBIXDA#M@l)}^{6?s zd})No>T!(yRLj4P>ay0hC3e@VKg6SaosjkHzfjC2con(_N>*$inaDAB47i_{QmF^&^=|*ljx-HGqmVew>RKZB4F9%WV-f?ArArlW3@dPuo@`1tB?8ku2ZD-paY=zSPYE_ln#u~}Fsq^rUi_ySh8AG<+hQSn5P2V_RR5^I`KFA8 z>r}KK?ydZJh`2ybK{iLKMS^qS+u`|HTWQbwLe;7D2nyW{ikrbb1N1{6O&!)+;1Lmp z?N2~ZTPtK@8r(9v9Gc{p|eq3Z!wyaWMfB zt-zP}tJc2M+@Tcb^Kaf6&+#vulr&IeiLZk@&E5nonLDtcN-K}@?d?()fEl1qN98OsvS z9SYkdU+=Idq@@p}ah4dl78FyHzP?;gGiwTDH9cBiSa+X-R=lJgR1K248J$M0TD;ST~UY)EL*ZCS%be1Q|_9-uPOn) zUzNs&+G?%@$ak@=mgsJ+&Ux+L8Lw|I!<(J>p&Aj!AKq$DFL~!Hs9utiZi6z-ns4|( zDM2G$tcU?bHGq64iD^PYfvN_8=V0=JsSLtFAW1|lo0276ug1fLYamHmnIO*oOgCDG z)|RI(J**bAf$K#Rh!d!KV-;@&>3kHpc*tHoG055IB!KuPW0gA9R|q-T=$;A09sxme z2dE>mYa9YVcL7I+vuEf=8K3*H?pp!dGE*<`v!%(g*h#k82c;o5mog$u7#mj`uU#cE zK`IR>A2ZXpub8Ask!_tv&!hO9L;0Kl2Ik@6%x^HzJ7N0~l+s zEGfC?Fcg^0$M^Oen9wwAJpOlI^;yXgifGF0kHM+A&3MIROwf}7ftxLc!<)f|!tDUy zfvs^@XBMhy%|*`FhJ>kqe!_9{0k_YNuvu%OT$uzAKtg?A%mM zF8^}+{>d#bor4==AXwS`ToEKIbcliQ5U_Lo(=uV2Te%yQ$P9Tcl$*(GVrno#zhLe! 
z7$s{aUHCAurG4GRHL?FGr*pB25mAB!N;bHEd1?YUYPt8lg_KiQq)PfOGIydVb1E8~ zb=xN!x^c?UeSnQ-6|pAiXv2b^^_YAy={c!1o@pRB3%Qia%SV|CB_{_6Bz@ikFp`y8 zgtm)x_AVEqS)Ks^8y{Y8z!fOyQB(enjqq|}+ccIHe7981YTYtq7%5On>a&?;RH-Xx z=j!@o9~cb2AjcbMvI}9vR!{!cJv|bxGRw{AY|e0ir5v+32CtkstfZk18oCRhO(W zQzq^fmHXi}u2e-RcyA~|DH+`9yl**+G)j^!hhLt=a8x44Tn)hJ>)idmLxMjNAH(~{ z`E#b&$Hq~TKhfMWgRx@u0aGP)c4kT^2LlOyePb}A9o}m9cfEfXm4H*fK?T4mIP5{J zrC~($^-%%?yMO8uJJ@vt0U;XqJK^;LXr%gNJ7oosysM3#^}DYE2tXQRZB47J4DY>L zs%W;_>SiP6fg|}B!dJg#TvGt_8th?fO8|@#Dhl^Ajs_JFFq)LsXc>G~ zR}AD;JT@bhv1JEtd^G9x82p8O*`HbW9X;WKD+|8smB-$Qb`D6NZu@Czgt8Y6{fHCm zGh=w85tq)qcaEdJ!1?IO=(Kf1Md7QUyRLr%*3(F82x~q{A#ke;73Mx~I1oU*oLO6k zYUt2<=+tMypy2aJY|9{H9F2~1m}Fa+D)~vKFGBPnSu(gHH2E+z8R%NHAEg#0PrdkNE8W?mLV;!a9Y4_yF;nY!?g28KfeeS54dvKFuVxI_ROU2OZr zR^I)NN}Oa!Cv}+GT{m|I8WF^`Itl#(#w<*aru6ki@|YAD9_rgn$6t0$wV-OifUP%6 zFF$*ji|;{c5V78g`pYj)B~GP#p6jd+jI@BUV@Wfwn!w|_DX&((2uuwC;Chz=R16MK zOoOWHZto?VacuM^E~l_@Sb>5`prw`RfF}*mAR?&FPjF5D~=) z6)xQ-C*5Xho%dz2>ol&NPf0BMp(KfkWuQrsXtF$!fJVY-JG-wNg_spHIj|0n$=?U> z;vw|~PFrTDb{!FjEEZI|E|;b-H(>&SODb{_CU(dU%daj@0w{XNbeE?(S3CS)?#>2s zm=7Vsp^f$m60}CzlsnpzEa?j$WpjXi3?lH?W$}#w1q3|qa_zQLn|^37ng0r;gf0X4 z5&i(ZhX{M#$v`H@gBMeQQtL;jk7;*DZEk&t7aJN$@M?0zBiRPZqfXgc~s7-Ep zB>^pP=W<)AxlLii;cn_YzMut^9S(HaN~X86I{!>tp@W|m`9mIFO+|%hg-D4=qOKMk zK$s6P3EgFqEy5b>6Xu=atWFQ)z&b;>hladHZX~REq|9 zjV4m?@&ci;yk9P2=jRHQN=W^mtHS2%3DD+Dz{dNuOeW5!TdUbCIrTqeukYW6BjEYE zqO+prCL)72z5ld0kcZW|^DSd`U+5P;w~cF;JziYylVtMiJUyW1O)DdxOv2|djMt4d z*Hf-fwN@5G+rTRQ96$<~{sBDoSB0;Drlatennw4)d~f)lXaF!uhCm;&_ylv|qH_te(0ylM-R6)u@U0A}mI>txeDler`7y3&rn{-j5={l;zW z!1%6;E$hM@Acy^pGL+Czz$#LdwWFH94I7SVHpK>a0LQtDgfZjvb)Gl_8dEtsD^Pcd zjWIFF7*ARxrl=wqL$zA^glB~xZ2=Klp{Y6N^76;lR^-|`ZeZ(C?bvX!s+h_<_BO}$ zrE-z4F$-2omBTkgbwp`P&WOg<&bJWNpLl?DtH}S7o-PB#J(CwNTQAP-=1yDVd(vqS zhUsPqA*N=ByLyh^s_V=AhaP9XDM@Z!iDb7^Dz`cX;n}7f(=KF3tDq0YB6&dqBXHH+ zChP#Mm)CISz8+ei!R2S?2ZkeEd$u6+eVFMM`%k>(f!EUV=n5tG8_+EU6~rNoeTE@o z)HL(X=O9&HoRb?ZqttIqobFRD1rR{XI^|>h3@mq50cWhqv!+tP>~m5^XF=zq#`G>X z;tw$}ylDRp(WRyZ<2yDCxhjENjmM8?j+)=Pn{=q=PLZk`)t^0&jR1nNzL22)J#D+p zP1x4O28qVXjXUttTa6=SMrcK@JT~{w-BXa;pQSam1>rJ%%{ey75y^86aspt~LTi8p zQ%Pt+g&v6iH~@1;WQbBkl$CzKe>vUXuK-A|`eqO!Ir@!CEjC2eb;9$-$j1{I3Qk8_ z2Vu-%$Wg(0H6UevyzcT%*4)5sK(>@DTY3C&6&4T1>Tn!`?R<$nY$YKb>B03{SX};o zf&n3Wjg^Hdmm9b8l!KW` zJK7g4|nOK8r#6i4El4`^SctZFoDtn4zfgK&Bm#L>Cl>d;X1o3 zvz?u3QqngsNut6a7<(7?xa__Cu(lj(`^BKbhalJ(jk$9y%KxnzR3Ik_$P>b(O4Tl7 zI#y#A4bBEYGVXJ70?3dcfSl^MehmiZA0E>zhbbubX0;Jt?FQmg_w{oAm4B7N-0VhG zH%??|7SSgIOflJas77IQG2OW6c?0DLmgJHXLkYhI-vCJPPqPtun0yM;#wET^VPy)4 zLwlclJK7!jYk0qS|6Htm(PzMdiPvcr-3mp3@(-YdfECG|->8qsR@C1pWG%VJmCjW? 
zS-2|+N~F+x3lzhp0OpllfwK$uwo#Lh`A@NH;Z5ruP}QD){Z9;c0HBLMT)f9e{CJun zyET<+0GJBJ`jLPwPUy0TXZ;1$>O?9T;LHpl@cgnR}o zLf1RL%Y;IwMnO+dNESQ@+@iD|PwL^?caC)i2T6N1XFJSMdfHCY3?Tg0uI({9T!37G zR^cR>?4}lmjKtAmj?6n4(fexKf`A1JCP|z}0R@}GFsB@SRe3}*lLq=*C8IU_d|Wz5d% z(q zA3Jsh3kyTqG6snfOBNwL^Ye9O0^dnY=CklT$p2eINxlW_hnVeNER4;Jp9+2DbEkQ8gf*4PE8$;7D@}ZT0o;TH5l9jgl~B{&MxV zfo+)vRD(9!_T@$w-S(#QV1Z9?I|%oxr9GX=D+bJlisZjNweyq0Ma14)J zPwjL*%<0T)w^z=t-ky%ZFi=Y&F!~bEQBaA7hf+w)iA&#->7dY~%Xhm5U?gEgK-qkD27&p;JO(2)+2J@T?IL&aqUbU2@xd`a|1x%~sIsm-`EbLT<#4PFWIY|7vWhki z+D2QPn&_Io91OrVP4MG;$c*P&Pg6sdL7HGwI5(W{8p_Gx;U5GqzD8RYtnOF;X3ir< z1j}*@I>+bzkzGMoy?L`gO4;;evWBkBLXUovqQjtTd%7nTg>Udstn($T7WcK%a^Z4g z991nGGqLD{<(B1#_t_%>B4l^W;8s0;*YF4CdX^@rrcICxeo$HS`j)Pw-?oTV(!RoS3Lk94HF8s}fZ`?wD&MdyGTgXK4WFGg?`9hY2R8(zr1gX512VFP3+)@8+ zBY&Hb*`e6+@xE=M7~ae5RBlnpgZRs{tZoSYIe7*aSX&B~-z-fp`|Yj4NvyIl5e z+drI{aBb)=c~1f_q>08|9{ysxmLyrhZgT zTJ<~Vq$J5{Rr{Z?C62dM*wMXuU9LRn4?Ya6{b)?C{b$p>+}54Bi`Fj1n)usL|Gw^G z>#6X-E$1lE?t9mF;A@M+RjxC~FSc^8|NI@R(3)ZoJQ!8ZdKi*%{K#Af{`l@jQ}&p| z1qpsjBjc^Ls;`R!upI0*ef4(43o|n!R~~U73m^{Weoo`PiwM}W8MO1XRXx_$zhyI- zifkrE3O}}Vr*HoGkhNq`v}UBARFZ!X`-B?yhdp^+s82Xv@4mC0yImv5@R|UYD-(TV zrx$KgMf!mNF?UqYIezy>mLS19>10*6+Xd6$NJ)pIs@Fo2MRxK;yw3rK=dUwx&N+vW zmI*E*_gCEQlG74lCQHqu22f^#L#y#Y(+VNN*O&YLmzs;NP8WJE9J~iC4=2|1KjRDP z=Vz35yA$BoK_$CFF2wetTYl=HigVEoZk?1-soWZz~x!Xs?Ia^VWU^5^I0!&GS1qF zt07?DU~x|QhH}n@K0jhMh#!gHB`Q@YO4?M)Mzs4&cpTJIT4RA{Hy@OdNHX%=nC=!} z&S8z6!;1R{#ma&I>sc-S-`7L@y?NWCuT9(Ca3H?VBmyPfdqkreqvaV2CyplGoXd<3Ig@R&P+90_43jQS2lx(PKc4hwjpt7CMwW;UMaCmF$r3H=G<-)dFE7-wG{ZeTu`kJdoY*(sYg`wX1(?~|2BMfjwqTMj= zTGesTE{dbSo#72dV9hG>tgXiN^-${(*|p6x!Ggbke%pr@U2?a0Jh(33FuRXWdbe-N zYmUk;#^JOp)#|M?Jkg>iY092C3?8zDw+6t4T?Q z)HSBT$LH=s>k@oS^0Uxi_|W}VkATslO!vf#i&r!;uOhX~)^s_YM~RA(4k}JiZM}ha zAKbIst0?$+yQ=wD2ankj!H&ns8#V3N?ZmIsARR3;*#_UF%d0YwMh zP*zP3v7OPrVocZb{>kR9htp=>ispD_|qNG}?O7MCpo*4b8?#XbX%D#fo4t zC9M!?GHcm;gA0k!^_2ryPY`owy`4V%-N1MZR>rNy-miaZ^&`?|4_A_N{4I`3JHqC+QCz}bkHsm*U-KUsvR}rRB z*KxQJ!iHF-IgdAgN}p~MN{4R7yge(->rW%V6M_kFI+@KKdV?NP*M&#eGwu*lFev`5 zeN2E01)SP!j!!gXoLa391DpK+T6^oTsG_e?nC@;#rKGz-kS=L~p}V`g1VKPRkQ7O! zq`P5&fssbR0VHMs$)S<1@A%&Py}$SS>-*#0dCoKE>=XN}y?U*^&)yq_#vHIe^)_L< zfqY!u? zj`1+O8#1isxF|1?c0kfKY(pKFao*=)R{FElbg#M>81@8?BuvhMgo?ITHjjipnw-SJ z8dvS+E`RBK3b&6_(A(nt!d}6izVPh?rh5Q%ky~WU8JXdqoTXx-DgFLE@zuzUF1^iY zu+}5a%#3`gv82RLI@iXU^E*^5VXb<05{e`b;$cp(r@qsi_S~?i2%-O$i8F89&KeO!Dr$6s8rCq`Rvy; zVXJzS4p#HBfTgq1fihYi1xJh-byJ#9Itp?3)l6DRds<%&Tg%EF<-&Ck$2M;gh{+|w zA?8fd@rC8*b!81C(9_KYX6+S!#1DI$@T`_8?W1Dwu@aE0A zJNL@SMbm*h*J=x5L_{6xBzrs|{JHOZ8T)nXpRG^owZBd2xs9#sMaj)!O-&})l(KRB|IuP7{5&Wj#5al#C9rznw*B?7 zcusxsH|ss5pT6=N??37bx55$+HInJy0Y|&x;-KM_t8#$|)(nqM2W5QzFMY%)H3=}! 
zXBvrt(T9(p=_N)JAIJ&f%KP7&5OQx8`f@dKSAPE{FuzF2Gj6J)9EfRoyU(555G;4J zGeeRNg9VS*iZL@VC`m~t`ul%c0*<=6D|(WN1t?OQ?ym$tNcc`=)A)PMnli?IXmz17b^1To!F-~A2r=3r^sL2xjV7w z9J;YkFnF9tMk*7{*YIy9i7J0K-YR&6t6QU_zVKjgN|k*fbu680=g7Nc8>m+wA^h4M z9odkR zjz07R@Thb&ri0g5FjDE+X{>Za%V9V}2V!28^_9a$7Ad_=cwvTYZQrK5|bKgQO1B9GHO?oNWK ztwQfe5=^nLPRSFlPO2+94m{jC1BYaY1-2(*4x%uZJYzfzDMTh7>TYbJP_Jq5)D-2mLiucLa^ru-zpby+^#O=}H6v=9} z)Du4*u3QBjF$U3Z25x}*!p-%^ExXae3^(5(;e}P&7e(|rJ!mt|+JB4G3=*m0KPeDD zUTm8!1T-at70OH;8uZ@ztFp*N)h4=$XZp@waPd{~g4IUAJqC?Z*7x0$4joJTw!u4s zl~Igcse_uTYi0GZlj!4c`>9B?u(x^3V;@c?sL+yaO2*8Tvx35nS!+dm2Ue97koNs~ zb5fGt8CxZe%#AoHu_+=%V%j0*^(&eHS2`35X}l=Q?9${s$z*LyURDOBlm%(E+B?{o zwBkX(?RHP>IcGZS5U-`j!E!N-TdGJWFtR}?g>K=g)cIBrS5W8u`jSCL4{c-sHpAsg zPA+K7$WnXs!0IWI*GqkuCP9Of?;9_o^f%gJm+XWA_V3{R6e`QQqw>1WqOPsTzKc0h zjUJ*pkj2Dib@D==OjIO0g-_^YV}n@Vbk7st(p++*Lxpp`=hZ z0=;BJ#a5On=KdCUA1=2WtYSoU@r}#f&~Sm9yV20lw!UVtt%lxA9e1oSj?vhTr?;}_ zWt_AW*(ewMa0MfcK5tHR#?<_M7?R4|tbKj8i4yuP>#yAD_Z=Egy!@g?5MPJMgyWu$ z5!LCE8UM!G%4GP#E{n*;)fGhNvZ0neZ3!O9Nd0=s5AIM_d_(5fd0`oR|9lI4)cf29 z8Z=q9rPXlm2)flitGjuL{}%Wp4WYnMSqMM1|L2&_|FVPzw9cc>0JB%9IUWFdb;+A9 zOlpE143F6824#r~s`V(GG^Q|Zl(&lOA-F&pvBIehS=su5^93ItpuNUAhx$}!x3~oW zF8Wppc+Lqi=xA{Yq~2a!Z!cmthS(494kF>^0?ocu;NL=RlWhfr|$S(1P-*jpi z%GFMMntr}P{@mr+!K&7Le?-n&jwRXzohzm|{M7(2(~x)BC2j6Tlg|cB!*khGeO~uN z1yRe%-5SwDa8Le4%+vmbUl9I4QB+;C9UC((*`=KA<}#_YJ+#YZGB2ZqMbjXMyIXTB z7LgttLVj=}wM_>(t(9PoQcQL6okBA*bQH_4z~Gs`u6 z0sPr~akIJV2C}uHYvt_1CarGNeJkBKMW4dkr!6f~+>lhuW^L=MZIoup!s{)WyZM%J%y&x?XT)~q zI4t-e!TYZ|&65wy1{e<1rN|=1T^iFTl|hk1HDLSb*CKi_(^;8?iPSa*z-1Wa=B=Nh z-p@a66&q5?tC%ov8soUkG}z-Tlc|lkCKWovM6SY@_{Cz~~B2na)etiE@U*sqtA%(v8`=@)|itZStThd|&&vQwG@**uBtb^O) zhs&*n=acm*OqUoFCWOf_GVVoijAv|sS0^d5Nef|m7du{q-vYq%!JeI@Uo=(gF;2#0+%->Ylb;y9hxsg=O4+_?*UsG8{gHh5x>v0x=N?b zg}O*wYp?f{i#GENrgb-J*Ght9<7TBE?eO+M?@OpHi*IiqV;;}vCkw>rLc~|q)}G6z zwhBl#>rc-Iu#u!TohcDdGwmX>Zah<4h_&4fUbfAS-T3m;jrlEr(uaVfUzm*U*&!WO z-$ebH63T;3VVK%DBu=dB;T zymxJNS8Jzlqp)UK!xkfeT#+a11scKj{wRpmpi_cwD7p!MVzVhq zpu)(jTDbe0_<=x9NuV=4?MGix3X0ry-gWaT9)~$DX7$7qB7CDyZeGJZlSXWgP!GRm zE-%XJ)ABRCa+VQL}H$E{6gHSVr2V2mQRc;M^0v-q2ElL%o>#8~M3-sbQTDe}=& zGL`=HbBJTc_>-vp?<`A&FS-_*Mw^^?M-SiIUNFjD2uyzt#j0-dJ=(mS3OnUmUX_2W zkJ5q<5kK!!C)_dy~oUF=FP7|cvYd?}v^au&>9VQAHpQOn!wriUks}-4-%tG*e$ToAb z7UX#yR$XWMQ^$_~}=HU%#L;5RrSjJd@BD&nwAPQ7D^(qdnqeqIsdTyVeF3uALuYA7Z&i z+kv-RU^-umD}e~s``0|Vi`$T?{+mZ|%n!bpunQ8p+z+fRi{2i~xoV(JF^@r^@zP7Zom&dY&QZw3s(I%_^xbBWUyr`YjO9Ujp|8%%F z_oMuI4K|Y{&Us|Ho9+kWU%m>CiLa2Ai_BqxzOrY_2IPcJ%R6LzAdh=sgC0PbV5a=5 zv{ZBlj#w!COd?*$`H5>uK!<1}CkqMT%suj$eai6$V;&ad={WgqD@>`o zYv=J`!$17I+J$HB4#ea06}bdQEfZMBRUG0Wuz+6^t+w}xE_(GlcODq6_P1tSdewfN zre4pBJ&^HhC@|jZeuATPwGty18D#B`az(gl$yYhy;Nc?1o!ksE$<#`f9u4BJ4Xj*n zL^AS~>m6o>Afv1F|=7BK~6E{&2<`iY180 zcA@>em=o0gODp>YKH$n{^SXs+aQE3SHhC{CchnyKIA8F{r^^>iIMUUG*e0^ghSVO2 z_TBB}pkB?D>ooRR@Ad!4OZs~u^!IlAN%)NmkR0L^{Lr6XptV{Q5#2)^MPuf3($izy z^{2zbyt6Kjdp^%j_w5(K)K=og{<0$UirYoF5V!<}8eYOdSTH2wPc$@eGQ_`7i$&SV zm%Z6p`Q5LPXI|MPHM!JX)pj&!sY1tBviAgawVq~$8dqVP)ETbRvLVhMafE_GW38?v z|61Sx`#~))g67vO%8%ka#GaJU`AJP(u#j8c6FC{>EvagWzRc)~g;)NglOHk4C_<)W z03G7aRVd@u%fB!D(PW7CGozH&_=iqUCeMl?xO`51^1D>ly8BtSPL&0M{Oicn9?0#M z^x0_R<_v|qLc!}lUA`y3dpGKk+R1);=FM69ptG|{lR5EM5n4aj1kzK91{oOq8Rdw? 
zS0y$s)lJZVmBCOEvWN{Z5#lJOB2(Y31;${|^PrFDk6wy~qlu%Yag~X=)jm&S7fTC$ zKyJ6d1E{l1{Z7N+*wvKG8~N$n?R1xKssDLK^MF~Jl9Ll0c`p$}$L)Jn;Rf@bjoQ!s zyL?2fbJMKWV3rcB?-p)bEwQAYJ}<8x4u2AxFpQNiOTZr)(4zSaB9Ah5RUWX`1*Z~rP&nE<(v8Qy zyC}(-h-hYGQqO9-v`OJUG=Xw5L}Q4ziG|;%(XSpIybkCK++__^lxq=7t^VN$ZnINd ztu#F45@b?7w!UuSwYZ}87f%m`(~)g}FsD1e@0z{u@vlj?s0a=DQk51NmHQ$kV&8=x z&eqAXB=1oItpP!}X*N3@F~2MzA$IK(N7V9V6Ne0)h7hBktO7;42~U*tXFD=@E6`$O z?&908$Xv+|FA-_DFIgYb@!oOI_qYi8L=XwHC0#oh8UZ2i>)%td zNK+9i2LTZ^uZEmHYo^{9%bVEyt&Nvfwoz8aX#+FGopIRY2GCn(*>F-IklgKODsant zKGK%%k@M^A>>j-Z&&TN^0@fQOerA&xh|#q>T^z@gRKP~F@Ke$%K9e#V{F)x(eerEd;t0oJBvHqE~DnuiE;t2m6A( zC1oRGcrH%Vd83HC9zrN+Qxq)z`hX%*uBDP!UNy17+D>j$uIz?{y0fjq9>|e)MO!-l z1oyR3@f4}|0Yl&DK(*OE$B^pJ?RU0fH9b2ua>9WPtGkEsw}XA0W?m`oL03{EZ1pP6 zBKp>qOt@ zGc-yb*!m_TX1s~&xVdiMo+ke3rqbKbUFp5P>D+o_3 zc*BE@$tWG-DQ%+7y-38(m=JK+Hk1Ul8E|PvG|)=aQ0*IS>~HvVo)+&h`#s^N6*O}= z7fmsGc|)t(j%_*-Om>m?G3{@m%%xT4h&ZO$tw=T1 zcaa?}t;Vw`+GXn}HPJU4{>nE>VM1f4Px%33Dz2J`?pn%K+)mu6-GZI(c2C~E8}aWU z^1Bd>84=1{q+KLy_I~`4^WFEi_Y!X~z2)*nZSX$zMxcqey_xqMO$QI@>0rSh)!!%a z(%}#WuwLcEtX+gnx6z_aj!egv^X`o~Y8BY}h4DI*edf3-=Ws2QQyZ z{kXDSP(g~)_B^QgQCWBUy^4T3jubSeQNJXhUD6L?Ku%qmMnQ_y9M7>P99Rc_9UN^r zZ|KQfGsd$FZPab+F|d&(|C99RGkznNxwzdnP#YiX_FT7wUxJ5?E4EM4A2Tl*07tA0kYY()KCu_I~W_CFuSfY}F;rOn1o)ZPAs%$LdYwAXhm zw~4MP%A9?^L-8(9Fkajje{D6_lodwYrz=C-kqHf%p04i$wMR6ZzeKOQtCqg8)x%@y zYopn+mBp&Iv8F-1euh`NLb2Dy60J!a5p7@eEgA{FUi2`bcg> zBJ|`-Rr*2t{KsIK(@@oUd9kZNR;R>D5 z%!(fuIO@VoBzdawfdiEpBawvl(*l#hE`#FO3?4onx}V1YICs7AZTfs(nw_)u3@{_y z;CPWVL9+Ytia#X%3h6|tiB%Z+X*TN8(#P0gjzUMyVDAl z#+fp$VN8vF_Py#z?m?oUf>vkk^Wh@F>6H5y&O5sY&*b>wub3p zk!_-cbsbdB$eG8v?A`Eu=pa%QOVHtE_>05NtBdae5_PCvhNN|at&Qj;vZop^hjGYQidE`%?b^R29-!zkyVMQ6 zqBvKT{F|uQ@DkgHAA<)Bxn5d|X`Q`$wJTCz)YPA%KZYtXvOk8kv%C9@o~X!Ao@LcN zzD3N5f!2nGz{!bxzfiF)G#w{l=nCk{?Cv(8<*EC2-mq5BU)(vbqg&^)x zERsS_yFSIZ%8Z%Ty~;|e-$)}wFVepv9BvRgrb~(zqA}X?JD`f)NFB8ZZz|P$NoqiX z$*e(IP~Jp_Idz{TW|G4jctOqkNt>0efd0fDX=*$)W}>ZkjA6y#Sf9j$Lhr^ z>cz(b2c+R^90T6&=9+7|;dy(ezx}ge1KU$XnUI{_Cy-;rP<43l6Lw8ZA2Ns{Nrlin zA6m!ylJ=Y6*Cf>P^l=SU9D#!p@2pwgD=IfY+YgkyNk)VYlk2f;U-i(Ym|IXdrOIhe z&5g2i1Hg#@3VAzys8Uh#sfsM9DL1_{0-C5@VYXef>AJrD)JG!v&){NMk4^ zFmNDKAM(y>no8}xh7)}ru~G2^u7fr{MW>(}|7`NGNS`-F6vtRJ|92+K$gFc}cXkSN zpsdT0Bu6p8l$0mPNyb7UIKQU$-daROpfpj*7GooAh-E3AW6k+}Ol9(FWwLY22Q_#0 zQG%2d9LBp2!|&$gklHsLxVeo=d4dpa9BdnUt~eG(x&1hwZ{9Lf5Ne0ITBQb(VGV+6 zA*9sSM!J+PLG`D-Za3DxS8dLAx*1;BO3_b~iuL2>zSH_G(R6#=UF^u>aK++~7$ymj z7>(7mTf?jd@t8T7@%$3K60(@39HrkdQ{-Cc`*WVhbQ5;itxs)^=S)xkn4g|Tykl+_ zeZTl1JKHK<^i7I&YyPQSe{!Tm1g3gLcm^XEb=eL5)7o(ih3r z-0Ec&3b(>}#-6Xoqi%>+jo-WmhM2u$1N-|CwND9L_It>*mFmr$nle3VBHNkrl-ZOtx&2KDf13navT(Nj4cw%>K-~VYb3DXp zD`a}7D>nlp8z%KN^EaV6t_)srrqFs?zvyIIFIm(BU~B7AkBatQuwRyZ8nP%6L}w=A zYkRo&d$!HHUBD@HzDzwN(qo{mOrO}F2FVC{`tLOEpC0O>VH(XpM?B3_bV8TQ09+S6 zPOMoLI7jG=uQd+P0&TTTkftVsCe{5>SoZ(A$zn>nd19|8 z3nrYWU;wN{MgqJE_$dVAF|z;ntw0$x!x|%s3FP9q z%Cx!4&@`$TG^!4E`3#yk%RCJT7YH_h;#wat1_ojc2%P zR5@r=6>*exW$8zj&B>O{!TfW$^D_0R%H~!ys>o0L>Wnoa^f=3YTC`n%&FnJgJkQnt z#`bV|cZfU9T{dT-Q3bm?S~Dq|`{th&R5jKDV)lV#s{3HDW~=c4;}+b>G+DF1cJqv~ z``GV}kmf#un1KPXnfgp9hePzFQNqVq$C>)yBx*4=`^3@VSbSJ@Snl9i4Ne_X%`63A zL0>Q&C196N6KKk5v>ciZ!UG%vc8Xc`&*sLI#STZiqG;fIxGIV^n>DFqJ+`*&ehB=u z{fK)d4Wvwz!2(+WQ+c1(c_W_CYO;B{A|ANZ?e?T<~CC%LiF*lQF7ia!P?I~}4 zczxH1DuN21Gll2F2FpI^f5X{9S*>Y(Q0yB22;hsyn)p~95olcxyE&RQ`}RjqV+H>% z5q6^mz{$l~7Kz{DRx2mw2p@{Ja=YK&yJieNcA0e_X%yjSfVx zm?DcRY8y1)LC6wWcupkH{(+^&qb`=nMy3f`Z&X@W+k%kh7J-;~8%A_B7Grq9%`CSq zu*Rzm&7X#KBBP1*VLwfi=B~w#J7719&3(Qu!PE^v&PcZ>U;GyC>j(F7sT6^pze`GxAmcB%fnlnAp2@1&oILaw^vZ=kSn}_pt 
z@}m2>EP^|C`=MT=>I3X%pe389^ANKKY*Y4{;_UGrA82%WJBZVe)=DR!fLnPY;e8q# znW|7cfCQk~8LGC4kc?MPeOUcAa3+|u)zCCwZ7zYaCU}lB$}_)V^r4#9wi12qmf^HV z>284)rO8EFaJHg0AD!qEQHi55p?RVap-`h;a+jHA2Y;Yo{sj;o>tDrjYV1IxiXfzk zEW*qdI8Y>7>C4f$d$yNmKzl+ps=QE-P*zJ@J+*^E^sVuA3HJ3glr!~7Ck9v^`5qQB zR@d3IxV^mx`xIXq(vD?lSfL;OGJQ06>(z0)w?iPo9xi;wTq)+`LKjjwXipga@!P9^ z{6?lJ^j9i}`5$HN6?oDmGCF-Q97#0+iH(OzVeDf83xra8b%nAWYjlle;hpSq_ha^8 zUm4V%C;I1bDi@-Q**f;BW?O)DV(?pGKSZ*5^B!u9`!AL^{|ADdIv|Nc_5WDwnJ|FebiF((Rcoq7r7jZfK&Inj zeOKxR@~g0kDyFDRm_8f4T`*t3l9+(2Oy%bDg(KP7iEKpeFQH8K~bKiPamAjS>dH?7O;BH8(2D$Z4Q zi0x^t!7xNXA7nR$a_YRN*=_G&3!sE7=h^>PU+|m*TErd~N~)&7^TTvp$OcvRZQePd zZEAFsUe|c$8$Z}A50iS(;?LvmVAKw|l})vQkUsPUYz%??y$8epx?^gDp5QM90(&op zAN9gN$D?riEXn1ei0PeNeRQ|LQ80#p(*vmB(3iiJZT25ZeEO5$zhgP^%K|YsD%q<0 z%3=Q_-A~JG)}jhXIvcI(`nFVkqtbAc+W*kfn4GgWAiV!p)9;j3{(rPI+yATJ0ka~a zZjJk&M=`bj&mLus5qeRb8S0v}EnAhjXP{yD^^5yI;{`P1e;Y5525StP5}9+YsJF0p zy8fP*$^~~>Hu@vM@;89#jjsXFc)|fOAyJsff{-<4a-1ER;G`1(S_GpH^E2^hj6(FA z7MS?gX7hjb691=Oynpp(aF#J*`bMip^P?eKVFkOqS%4KAR{Cc%-*skyn5m?J=BC?g zRM{d>J`nd9yWKl~N@fL2S$aB%8Q9gaX1^QO9s!dvj{+%fa6fti>px8A68`5s`~TkK zskjB>#h8m|CX)H;M4yP}s`f ZK59Gu%GLS1D1rj~)RncB>J+Ts{6FKc$3Fl7 literal 0 HcmV?d00001 diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 19205385f311b..fc5f254035a53 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -556,13 +556,13 @@ orthogonal matching pursuit can approximate the optimum solution vector with a fixed number of non-zero elements: .. math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||y - X\gamma||_2^2 \text{ subject to } ||\gamma||_0 \leq n_{\text{nonzero\_coefs}} + \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero\_coefs}} Alternatively, orthogonal matching pursuit can target a specific error instead of a specific number of non-zero coefficients. This can be expressed as: .. math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||\gamma||_0 \text{ subject to } ||y-X\gamma||_2^2 \leq \text{tol} + \underset{w}{\operatorname{arg\,min\,}} ||w||_0 \text{ subject to } ||y-Xw||_2^2 \leq \text{tol} OMP is based on a greedy algorithm that includes at each step the atom most @@ -906,7 +906,7 @@ with 'log' loss, which might be even faster but requires more tuning. It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels package ` natively supports this. - Within sklearn, one could use bootstrapping instead as well. + Within sklearn, one could use bootstrapping instead as well. :class:`LogisticRegressionCV` implements Logistic Regression with built-in @@ -928,6 +928,149 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +Generalized Linear Models (GLM) extend linear models in two ways +[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, X) = h(Xw). + +Secondly, the squared loss function is replaced by the unit deviance +:math:`d` of a distribution in the exponential family (or more precisely, a +reproductive exponential dispersion model (EDM) [11]_). + +The minimization problem becomes: + +.. 
+ +.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2^2,
+ +where :math:`\alpha` is the L2 regularization penalty. When sample weights are +provided, the average becomes a weighted average.
+ +The following table lists some specific EDMs and their unit deviance (all of +these are instances of the Tweedie family):
+
+================= =============================== ============================================
+Distribution      Target Domain                   Unit Deviance :math:`d(y, \hat{y})`
+================= =============================== ============================================
+Normal            :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2`
+Poisson           :math:`y \in [0, \infty)`       :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})`
+Gamma             :math:`y \in (0, \infty)`       :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)`
+Inverse Gaussian  :math:`y \in (0, \infty)`       :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}`
+================= =============================== ============================================
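As a quick sanity check (an illustrative aside, not taken from the patched documentation text), these unit deviances match what :func:`sklearn.metrics.mean_tweedie_deviance` returns for a single sample::

    >>> from sklearn.metrics import mean_tweedie_deviance
    >>> y_true, y_pred = [1.0], [1.5]
    >>> mean_tweedie_deviance(y_true, y_pred, power=0)  # Normal: (y - y_pred)**2
    0.25
    >>> mean_tweedie_deviance(y_true, y_pred, power=1)  # Poisson
    0.189...
    >>> mean_tweedie_deviance(y_true, y_pred, power=2)  # Gamma
    0.144...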
+ +The Probability Density Functions (PDF) of these distributions are illustrated +in the following figure,
+ +.. figure:: ./glm_data/poisson_gamma_tweedie_distributions.png + :align: center + :scale: 100% + + PDF of a random variable Y following Poisson, Tweedie (power=1.5) and Gamma + distributions with different mean values (:math:`\mu`). Observe the point + mass at :math:`Y=0` for the Poisson distribution and the Tweedie (power=1.5) + distribution, but not for the Gamma distribution which has a strictly + positive target domain.
+ +The choice of the distribution depends on the problem at hand:
+ +* If the target values :math:`y` are counts (non-negative integer valued) or + relative frequencies (non-negative), you might use a Poisson deviance + with log-link.
+* If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link.
+* If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family).
+ + +Examples of use cases include:
+ +* Agriculture / weather modeling: number of rain events per year (Poisson), + amount of rainfall per event (Gamma), total rainfall per year (Tweedie / + Compound Poisson Gamma).
+* Risk modeling / insurance policy pricing: number of claim events / + policyholder per year (Poisson), cost per event (Gamma), total cost per + policyholder per year (Tweedie / Compound Poisson Gamma).
+* Predictive maintenance: number of production interruption events per year: + Poisson, duration of interruption: Gamma, total interruption time per year + (Tweedie / Compound Poisson Gamma).
+ + +.. topic:: References:
+ + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5.
+ + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_
+ +Usage +-----
+ +:class:`TweedieRegressor` implements a generalized linear model for the +Tweedie distribution, which allows modeling any of the above mentioned +distributions using the appropriate ``power`` parameter. In particular:
+ +- ``power = 0``: Normal distribution. Specific estimators such as + :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in + this case.
+- ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed + for convenience. However, it is strictly equivalent to + `TweedieRegressor(power=1, link='log')`.
+- ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for + convenience. However, it is strictly equivalent to + `TweedieRegressor(power=2, link='log')`.
+- ``power = 3``: Inverse Gaussian distribution.
+ +The link function is determined by the `link` parameter.
+ +Usage example::
+ + >>> from sklearn.linear_model import TweedieRegressor + >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + TweedieRegressor(alpha=0.5, link='log', power=1) + >>> reg.coef_ + array([0.2463..., 0.4337...]) + >>> reg.intercept_ + -0.7638...
+ + +.. topic:: Examples:
+ + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`
+ +Practical considerations +------------------------
+ +The feature matrix `X` should be standardized before fitting. This ensures +that the penalty treats features equally.
+ +Since the linear predictor :math:`Xw` can be negative and Poisson, +Gamma and Inverse Gaussian distributions don't support negative values, it +is necessary to apply an inverse link function that guarantees the +non-negativity. For example with `link='log'`, the inverse link function +becomes :math:`h(Xw)=\exp(Xw)`.
+ +If you want to model a relative frequency, i.e. counts per exposure (time, +volume, ...) you can do so by using a Poisson distribution and passing +:math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values +together with :math:`\mathrm{exposure}` as sample weights. For a concrete +example see e.g. +:ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`.
+ +When performing cross-validation for the `power` parameter of +`TweedieRegressor`, it is advisable to specify an explicit `scoring` function, +because the default scorer :meth:`TweedieRegressor.score` is a function of +`power` itself.
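One possible setup for such a search (a minimal sketch; the candidate grid, the fixed scoring power of 1.5 and the 3-fold CV are arbitrary illustrative choices) is to build a scorer with a fixed deviance power via :func:`~sklearn.metrics.make_scorer`::

    >>> import numpy as np
    >>> from sklearn.linear_model import TweedieRegressor
    >>> from sklearn.model_selection import GridSearchCV
    >>> from sklearn.metrics import mean_tweedie_deviance, make_scorer
    >>> # the deviance is a loss, hence greater_is_better=False
    >>> scorer = make_scorer(mean_tweedie_deviance, power=1.5,
    ...                      greater_is_better=False)
    >>> search = GridSearchCV(
    ...     TweedieRegressor(link='log'),
    ...     param_grid={'power': np.linspace(1.1, 1.9, 5)},
    ...     scoring=scorer, cv=3)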
Stochastic Gradient Descent - SGD =================================
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 96702dae01235..04bf06c3cc6ee 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst
@@ -221,6 +221,13 @@ Changelog :mod:`sklearn.linear_model` ...........................
+- |MajorFeature| Added generalized linear models (GLM) with non-normal error + distributions, including :class:`linear_model.PoissonRegressor`, + :class:`linear_model.GammaRegressor` and :class:`linear_model.TweedieRegressor` + which use Poisson, Gamma and Tweedie distributions respectively. + :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_, + and `Olivier Grisel`_.
+ - |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and :class:`linear_model:Lasso` for dense feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen `.
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..ee863dd4198ba --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -0,0 +1,455 @@
+""" +====================================== +Poisson regression and non-normal loss +======================================
+ +This example illustrates the use of log-linear Poisson regression +on the `French Motor Third-Party Liability Claims dataset +`_ from [1]_ and compares +it with models learned with least squares error. In this dataset, each sample +corresponds to an insurance policy, i.e. a contract within an insurance +company and an individual (policyholder). Available features include driver +age, vehicle age, vehicle power, etc.
+ +A few definitions: a *claim* is the request made by a policyholder to the +insurer to compensate for a loss covered by the insurance. The *exposure* is +the duration of the insurance coverage of a given policy, in years.
+ +Our goal is to predict the expected number of insurance claims (or frequency) +following car accidents for a policyholder given the historical data over a +population of policyholders.
+ +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_
+ +""" +print(__doc__)
+ +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause
+import warnings
+ +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd
+ +from sklearn.datasets import fetch_openml +from sklearn.dummy import DummyRegressor +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import Ridge, PoissonRegressor +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import RandomForestRegressor +from sklearn.utils import gen_even_slices +from sklearn.metrics import auc
+ +from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.metrics import mean_poisson_deviance
+ + +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset.
+ + Parameters + ---------- + n_samples: int or None, default=100000 + Number of samples to select (for faster run time). If None, the full + dataset with 678013 samples is returned. + """
+ + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df = fetch_openml(data_id=41214, as_frame=True)['data']
+ + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + if n_samples is not None: + return df.iloc[:n_samples] + return df
+ + +############################################################################## +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicity. +# +# We also subsample the data for the sake of computational cost and running +# time. Using the full dataset would lead to similar conclusions.
+ +df = load_mtpl2(n_samples=300000)
+ +# Correct for unreasonable observations (that might be data error) +df["Exposure"] = df["Exposure"].clip(upper=1)
+ +############################################################################## +# The remaining columns can be used to predict the frequency of claim events. +# Those columns are very heterogeneous with a mix of categorical and numeric +# variables with different scales, possibly very unevenly distributed.
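# A quick, illustrative way to see this heterogeneity (any subset of columns
# would do) is to print the dtypes and a numeric summary:

print(df.dtypes)
print(df[["Exposure", "DrivAge", "VehAge", "BonusMalus", "Density"]].describe())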
+# +# In order to fit linear models with those predictors it is therefore +# necessary to perform standard feature transformations as follows: + +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + +linear_model_preprocessor = ColumnTransformer( + [ + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ], + remainder="drop", +) + +############################################################################## +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as ``sample_weight``. + +df["Frequency"] = df["ClaimNb"] / df["Exposure"] + +print( + pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() +) + +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) + +print("Percentage of zero claims = {0:%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + +############################################################################## +# It is worth noting that 92 % of policyholders have zero claims, and if we +# were to convert this problem into a binary classification task, it would be +# significantly imbalanced. +# +# To evaluate the pertinence of the used metrics, we will consider as a +# baseline a "dummy" estimator that constantly predicts the mean frequency of +# the training sample. + +df_train, df_test = train_test_split(df, random_state=0) + +dummy = make_pipeline( + linear_model_preprocessor, + DummyRegressor(strategy='mean') +) +dummy.fit(df_train, df_train["Frequency"], + dummyregressor__sample_weight=df_train["Exposure"]) + + +def score_estimator(estimator, df_test): + """Score an estimator on the test set.""" + + y_pred = estimator.predict(df_test) + + print("MSE: %.3f" % + mean_squared_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + print("MAE: %.3f" % + mean_absolute_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + + # ignore non-positive predictions, as they are invalid for + # the Poisson deviance + mask = y_pred > 0 + if (~mask).any(): + warnings.warn("Estimator yields non-positive predictions for {} " + "samples out of {}. 
These will be ignored while " + "computing the Poisson deviance" + .format((~mask).sum(), mask.shape[0])) + + print("mean Poisson deviance: %.3f" % + mean_poisson_deviance(df_test["Frequency"][mask], + y_pred[mask], + df_test["Exposure"][mask]))
+ + +print("Constant mean frequency evaluation:") +score_estimator(dummy, df_test)
+ +############################################################################## +# We start by modeling the target variable with the least squares linear +# regression model,
+ +ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) +ridge.fit(df_train, df_train["Frequency"], + ridge__sample_weight=df_train["Exposure"])
+ +############################################################################## +# The Poisson deviance cannot be computed on non-positive values predicted by +# the model. For models that do return a few non-positive predictions +# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# meaning that the obtained Poisson deviance is approximate. An alternative +# approach could be to use :class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to a strictly positive domain.
+ +print("Ridge evaluation:") +score_estimator(ridge, df_test)
+ +############################################################################## +# Next we fit the Poisson regressor on the target variable. We set the +# regularization strength ``alpha`` to 1 over number of samples in order to +# mimic the Ridge regressor whose L2 penalty term scales differently with the +# number of samples.
+ +poisson = make_pipeline( + linear_model_preprocessor, + PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) +) +poisson.fit(df_train, df_train["Frequency"], + poissonregressor__sample_weight=df_train["Exposure"])
+ +print("PoissonRegressor evaluation:") +score_estimator(poisson, df_test)
+ +############################################################################## +# Finally, we will consider a non-linear model, namely a random forest. Random +# forests do not require the categorical data to be one-hot encoded: instead, +# we can encode each category label with an arbitrary integer using +# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will +# treat the categorical features as ordered features, which might not always +# be a desired behavior. However this effect is limited for deep enough trees +# which are able to recover the categorical nature of the features. The main +# advantage of the :class:`preprocessing.OrdinalEncoder` over the +# :class:`preprocessing.OneHotEncoder` is that it will make training faster.
+ +rf_preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("numeric", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ], + remainder="drop", +) +rf = make_pipeline( + rf_preprocessor, + RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) +) +rf.fit(df_train, df_train["Frequency"].values, + randomforestregressor__sample_weight=df_train["Exposure"].values)
+ + +print("RandomForestRegressor evaluation:") +score_estimator(rf, df_test)
+ + +############################################################################## +# Like the Ridge regression above, the random forest model minimizes the +# conditional squared error, too. However, because of a higher predictive +# power, it also results in a smaller Poisson deviance than the Poisson +# regression model.
+# +# Evaluating models with a single train / test split is prone to random +# fluctuations. If computing resources allow, it should be verified that +# cross-validated performance metrics would lead to similar conclusions. +# +# The qualitative difference between these models can also be visualized by +# comparing the histogram of observed target values with that of predicted +# values:
+ +fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig.subplots_adjust(bottom=0.2) +n_bins = 20 +for row_idx, label, df in zip(range(2), + ["train", "test"], + [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), + ax=axes[row_idx, 0]) + + axes[row_idx, 0].set_title("Data") + axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_xlabel("y (observed Frequency)") + axes[row_idx, 0].set_ylim([1e1, 5e5]) + axes[row_idx, 0].set_ylabel(label + " samples") + + for idx, model in enumerate([ridge, poisson, rf]): + y_pred = model.predict(df) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), + ax=axes[row_idx, idx+1]) + axes[row_idx, idx + 1].set( + title=model[-1].__class__.__name__, + yscale='log', + xlabel="y_pred (predicted expected Frequency)" + ) +plt.tight_layout()
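##############################################################################
# The metrics above were computed on a single train / test split. As noted
# earlier, a cross-validated check is advisable when resources allow; a
# minimal sketch (3 folds chosen arbitrarily, sample weights omitted for
# brevity) could look as follows:

from sklearn.metrics import make_scorer, mean_poisson_deviance
from sklearn.model_selection import cross_validate

neg_poisson_deviance = make_scorer(mean_poisson_deviance,
                                   greater_is_better=False)
cv_result = cross_validate(poisson, df_train, df_train["Frequency"],
                           cv=3, scoring=neg_poisson_deviance)
print("CV mean Poisson deviance: %.3f +/- %.3f"
      % (-cv_result["test_score"].mean(), cv_result["test_score"].std()))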
+ +############################################################################## +# The experimental data presents a long tail distribution for ``y``. In all +# models we predict a mean expected value, so we will necessarily have fewer +# extreme values. Additionally, the normal distribution used in ``Ridge`` and +# ``RandomForestRegressor`` has a constant variance, while for the Poisson +# distribution used in ``PoissonRegressor``, the variance is proportional to +# the mean predicted value. +# +# Thus, among the considered estimators, ``PoissonRegressor`` is better suited +# for modeling the long tail distribution of the data as compared to the +# ``Ridge`` and ``RandomForestRegressor`` estimators. +# +# To ensure that estimators yield reasonable predictions for different +# policyholder types, we can bin test samples according to ``y_pred`` returned +# by each model. Then for each bin, we compare the mean predicted ``y_pred`` +# with the mean observed target:
+ + +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, + n_bins=100): + """Compare predictions and observations for bins ordered by y_pred. + + We order the samples by ``y_pred`` and split it in bins. + In each bin the observed mean is compared with the predicted mean. + + Parameters + ---------- + y_true: array-like of shape (n_samples,) + Ground truth (correct) target values. + y_pred: array-like of shape (n_samples,) + Estimated target values. + sample_weight : array-like of shape (n_samples,) + Sample weights. + n_bins: int + Number of bins to use.
+ + Returns + ------- + bin_centers: ndarray of shape (n_bins,) + bin centers + y_true_bin: ndarray of shape (n_bins,) + average y_true for each bin + y_pred_bin: ndarray of shape (n_bins,) + average y_pred for each bin + """ + idx_sort = np.argsort(y_pred) + bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins + y_pred_bin = np.zeros(n_bins) + y_true_bin = np.zeros(n_bins) + + for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): + weights = sample_weight[idx_sort][sl] + y_pred_bin[n] = np.average( + y_pred[idx_sort][sl], weights=weights + ) + y_true_bin[n] = np.average( + y_true[idx_sort][sl], + weights=weights + ) + return bin_centers, y_true_bin, y_pred_bin
+ + +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +plt.subplots_adjust(wspace=0.3)
+ +for axi, model in zip(ax, [ridge, poisson, rf]): + y_pred = model.predict(df_test) + + q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( + df_test["Frequency"].values, + y_pred, + sample_weight=df_test["Exposure"].values, + n_bins=10) + + axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") + axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") + axi.set_xlim(0, 1.0) + axi.set_ylim(0, 0.6) + axi.set( + title=model[-1].__class__.__name__, + xlabel='Fraction of samples sorted by y_pred', + ylabel='Mean Frequency (y_pred)' + ) + axi.legend() +plt.tight_layout()
+ +############################################################################## +# The ``Ridge`` regression model can predict very low expected frequencies +# that do not match the data. It can therefore severely under-estimate the risk +# for some policyholders. +# +# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency +# between predicted and observed targets, especially for low predicted target +# values. +# +# However, for some business applications, we are not necessarily interested +# in the ability of the model to predict the expected frequency value, but +# instead to predict which policyholder groups are the riskiest and which are +# the safest. In this case, the model evaluation would cast the problem as a +# ranking problem rather than a regression problem.
+# +# To compare the 3 models within this perspective, one can plot the fraction of +# the number of claims vs the fraction of exposure for test samples ordered by +# the model predictions, from riskiest to safest according to each model: + + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) + +for model in [ridge, poisson, rf]: + y_pred = model.predict(df_test) + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label = "{} (area under curve: {:.3f})".format( + model[-1].__class__.__name__, area) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Cumulated number of claims by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of number of claims' +) +ax.legend(loc="lower right") + +############################################################################## +# This plot reveals that the random forest model is slightly better at ranking +# policyholders by risk profiles even if the absolute value of the predicted +# expected frequencies are less well calibrated than for the linear Poisson +# model. +# +# All three models are significantly better than chance but also very far from +# making perfect predictions. +# +# This last point is expected due to the nature of the problem: the occurrence +# of accidents is mostly dominated by circumstantial causes that are not +# captured in the columns of the dataset or that are indeed random. + +plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..ccd18c8efff99 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,596 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrates the use of Poisson, Gamma and Tweedie regression on +the `French Motor Third-Party Liability Claims dataset +`_, and is inspired by an R tutorial [1]_. + +In this dataset, each sample corresponds to an insurance policy, i.e. a +contract within an insurance company and an individual (policyholder). +Available features include driver age, vehicle age, vehicle power, etc. + +A few definitions: a *claim* is the request made by a policyholder to the +insurer to compensate for a loss covered by the insurance. The *claim amount* +is the amount of money that the insurer must pay. 
The *exposure* is the +duration of the insurance coverage of a given policy, in years.
+ +Here our goal is to predict the expected +value, i.e. the mean, of the total claim amount per exposure unit also +referred to as the pure premium.
+ +There are several possibilities to do that, two of which are:
+ +1. Model the number of claims with a Poisson distribution, and the average + claim amount per claim, also known as severity, as a Gamma distribution + and multiply the predictions of both in order to get the total claim + amount.
+2. Model the total claim amount per exposure directly, typically with a Tweedie + distribution of Tweedie power :math:`p \\in (1, 2)`.
+ +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results.
+ +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764 + `_
+ +""" +print(__doc__)
+ +# Authors: Christian Lorentzen +# Roman Yurchak +# Olivier Grisel +# License: BSD 3 clause
+from functools import partial
+ +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd
+ +from sklearn.datasets import fetch_openml +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import PoissonRegressor, GammaRegressor +from sklearn.linear_model import TweedieRegressor +from sklearn.metrics import mean_tweedie_deviance +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+ +from sklearn.metrics import mean_absolute_error, mean_squared_error, auc
+ + +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset.
+ + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). Full dataset has + 678013 samples. + """
+ # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True)
+ + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data']
+ + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby('IDpol').sum()
+ + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True)
+ + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples]
+ + +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, + title=None, ax=None, fill_legend=False): + """Plot observed and predicted - aggregated per feature level.
+ + Parameters + ---------- + df : DataFrame + input data + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure + observed : str + a column name of df with the observed target + predicted : DataFrame + a dataframe, with the same index as df, with the predicted target + fill_legend : bool, default=False + whether to show fill_between legend + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + p2 = ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + if fill_legend: + ax.legend([p2], ["{} distribution".format(feature)]) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights, + tweedie_powers=None, +): + """Evaluate an estimator on train and test sets with different metrics""" + + metrics = [ + ("D² explained", None), # Use default scorer if it exists + ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), + ] + if tweedie_powers: + metrics += [( + "mean Tweedie dev p={:.4f}".format(power), + partial(mean_tweedie_deviance, power=power) + ) for power in tweedie_powers] + + res = [] + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + for score_label, metric in metrics: + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models. + est_freq, est_sev = estimator + y_pred = est_freq.predict(X) * est_sev.predict(X) + else: + y_pred = estimator.predict(X) + + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(4) + .loc[:, ['train', 'test']] + ) + return res + + +############################################################################## +# Loading datasets, basic feature extraction and target definitions +# ----------------------------------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). + +df = load_mtpl2(n_samples=60000) + +# Note: filter out claims with zero amount, as the severity model +# requires strictly positive target values. 
+df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
+ +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
+ +log_scale_transformer = make_pipeline( + FunctionTransformer(func=np.log), + StandardScaler() +)
+ +column_trans = ColumnTransformer( + [ + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ], + remainder="drop", +) +X = column_trans.fit_transform(df)
+ +# Insurance companies are interested in modeling the Pure Premium, that is +# the expected total claim amount per unit of exposure for each policyholder +# in their portfolio: +df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]
+ +# This can be indirectly approximated by a 2-step modeling: the product of the +# Frequency times the average claim amount per claim: +df["Frequency"] = df["ClaimNb"] / df["Exposure"] +df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
+ +with pd.option_context("display.max_columns", 15): + print(df[df.ClaimAmount > 0].head())
+ +############################################################################## +# +# Frequency model -- Poisson distribution +# --------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer (0 included). +# Thus, this target can be modelled by a Poisson distribution. +# It is then assumed to be the number of discrete events occurring with a +# constant rate in a given time interval (``Exposure``, in units of years). +# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a +# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)
+ +# The parameters of the model are estimated by minimizing the Poisson deviance +# on the training set via a quasi-Newton solver: l-BFGS. Some of the features +# are collinear, so we use a weak penalization to avoid numerical issues. +glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400) +glm_freq.fit(X_train, df_train["Frequency"], + sample_weight=df_train["Exposure"])
+ +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print("Evaluation of PoissonRegressor on target Frequency") +print(scores)
+ +############################################################################## +# We can visually compare observed and predicted values, aggregated by the +# drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/malus (``BonusMalus``).
+ +fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df=df_train, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df=df_test, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], + fill_legend=True +) + +plot_obs_pred( + df=df_test, + feature="VehAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], + fill_legend=True +) + +plot_obs_pred( + df=df_test, + feature="BonusMalus", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], + fill_legend=True +) + + +############################################################################## +# According to the observed data, the frequency of accidents is higher for +# drivers younger than 30 years old, and is positively correlated with the +# `BonusMalus` variable. Our model is able to mostly correctly model this +# behaviour. +# +# Severity Model - Gamma distribution +# ------------------------------------ +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. +# +# Note: +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain +# more than one claim. + +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GammaRegressor(alpha=10., max_iter=10000) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print("Evaluation of GammaRegressor on target AvgClaimAmount") +print(scores) + +############################################################################## +# Here, the scores for the test data call for caution as they are +# significantly worse than for the training data indicating an overfit despite +# the strong regularization. +# +# Note that the resulting model is the average claim amount per claim. As +# such, it is conditional on having at least one claim, and cannot be used to +# predict the average claim amount per policy in general. + +print("Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean()) +print("Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) +print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean()) + + +############################################################################## +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``DrivAge``). 
+ +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) + +plot_obs_pred( + df=df_train.loc[mask_train], + feature="DrivAge", + weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df=df_test.loc[mask_test], + feature="DrivAge", + weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), + y_label="Average Claim Severity", + title="test data", + ax=ax[1], + fill_legend=True +) +plt.tight_layout() + +############################################################################## +# Overall, the drivers age (``DrivAge``) has a weak impact on the claim +# severity, both in observed and predicted data. +# +# Pure Premium Modeling via a Product Model vs single TweedieRegressor +# -------------------------------------------------------------------- +# As mentioned in the introduction, the total claim amount per unit of +# exposure can be modeled as the product of the prediction of the +# frequency model by the prediction of the severity model. +# +# Alternatively, one can directly model the total loss with a unique +# Compound Poisson Gamma generalized linear model (with a log link function). +# This model is a special case of the Tweedie GLM with a "power" parameter +# :math:`p \in (1, 2)`. Here, we fix apriori the `power` parameter of the +# Tweedie model to some arbitrary value (1.9) in the valid range. Ideally one +# would select this value via grid-search by minimizing the negative +# log-likelihood of the Tweedie model, but unfortunately the current +# implementation does not allow for this (yet). +# +# We will compare the performance of both approaches. +# To quantify the performance of both models, one can compute +# the mean deviance of the train and test data assuming a Compound +# Poisson-Gamma distribution of the total claim amount. This is equivalent to +# a Tweedie distribution with a `power` parameter between 1 and 2. +# +# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power` +# parameter. As we do not know the true value of the `power` parameter, we here +# compute the mean deviances for a grid of possible values, and compare the +# models side by side, i.e. we compare them at identical values of `power`. +# Ideally, we hope that one model will be consistently better than the other, +# regardless of `power`. 
+
+glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000)
+glm_pure_premium.fit(X_train, df_train["PurePremium"],
+                     sample_weight=df_train["Exposure"])
+
+tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]
+
+scores_product_model = score_estimator(
+    (glm_freq, glm_sev),
+    X_train,
+    X_test,
+    df_train,
+    df_test,
+    target="PurePremium",
+    weights="Exposure",
+    tweedie_powers=tweedie_powers,
+)
+
+scores_glm_pure_premium = score_estimator(
+    glm_pure_premium,
+    X_train,
+    X_test,
+    df_train,
+    df_test,
+    target="PurePremium",
+    weights="Exposure",
+    tweedie_powers=tweedie_powers
+)
+
+scores = pd.concat([scores_product_model, scores_glm_pure_premium],
+                   axis=1, sort=True,
+                   keys=('Product Model', 'TweedieRegressor'))
+print("Evaluation of the Product Model and the Tweedie Regressor "
+      "on target PurePremium")
+with pd.option_context('display.expand_frame_repr', False):
+    print(scores)
+
+##############################################################################
+# In this example, both modeling approaches yield comparable performance
+# metrics. For implementation reasons, the percentage of explained variance
+# :math:`D^2` is not available for the product model.
+#
+# We can additionally validate these models by comparing observed and
+# predicted total claim amounts over the test and train subsets. We see that,
+# on average, both models tend to underestimate the total claim amount (but
+# this behavior depends on the amount of regularization).
+
+res = []
+for subset_label, X, df in [
+    ("train", X_train, df_train),
+    ("test", X_test, df_test),
+]:
+    exposure = df["Exposure"].values
+    res.append(
+        {
+            "subset": subset_label,
+            "observed": df["ClaimAmount"].values.sum(),
+            "predicted, frequency*severity model": np.sum(
+                exposure * glm_freq.predict(X) * glm_sev.predict(X)
+            ),
+            "predicted, tweedie, power=%.2f"
+            % glm_pure_premium.power: np.sum(
+                exposure * glm_pure_premium.predict(X)),
+        }
+    )
+
+print(pd.DataFrame(res).set_index("subset").T)
+
+##############################################################################
+# Finally, we can compare the two models using a plot of cumulated claims: for
+# each model, the policyholders are ranked from safest to riskiest and the
+# fraction of observed total cumulated claims is plotted on the y axis. This
+# plot is often called the ordered Lorenz curve of the model.
+#
+# The Gini coefficient (based on the area under the curve) can be used as a
+# model selection metric to quantify the ability of the model to rank
+# policyholders. Note that this metric does not reflect the ability of the
+# models to make accurate predictions in terms of absolute value of total
+# claim amounts but only in terms of relative amounts as a ranking metric.
+#
+# Both models are able to rank policyholders by riskiness significantly
+# better than chance, although they are also both far from perfect due to the
+# natural difficulty of the prediction problem from only a few features.
+#
+# Note that the Gini index only characterizes the ranking performance of the
+# model but not its calibration: any monotonic transformation of the
+# predictions leaves the Gini index of the model unchanged.
+#
+# Lastly, one should highlight that the Compound Poisson Gamma model, which
+# is directly fit on the pure premium, is operationally simpler to develop
+# and maintain as it consists of a single scikit-learn estimator instead of a
+# pair of models, each with its own set of hyperparameters.
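+
+##############################################################################
+# As a minimal, self-contained sketch of how such a Gini index can be derived
+# from an ordered Lorenz curve, consider a toy example with three
+# policyholders (the actual helper used for the plot is defined just below
+# and additionally weights each policy by its exposure; the ``*_toy`` names
+# are purely illustrative):
+
+from sklearn.metrics import auc
+
+y_true_toy = np.array([0., 100., 200.])  # observed claim amounts
+y_pred_toy = np.array([10., 50., 80.])   # predicted risk, here well ordered
+ranking_toy = np.argsort(y_pred_toy)     # rank from safest to riskiest
+cum_claims_toy = np.cumsum(y_true_toy[ranking_toy]) / y_true_toy.sum()
+cum_samples_toy = np.linspace(0, 1, len(cum_claims_toy))
+gini_toy = 1 - 2 * auc(cum_samples_toy, cum_claims_toy)
+print("Gini index of the toy ranking: %.3f" % gini_toy)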
+ + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_pure_premium = y_true[ranking] + cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure) + cumulated_claim_amount /= cumulated_claim_amount[-1] + cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount)) + return cumulated_samples, cumulated_claim_amount + + +fig, ax = plt.subplots(figsize=(8, 8)) + +y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) +y_pred_total = glm_pure_premium.predict(X_test) + +for label, y_pred in [("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total)]: + ordered_samples, cum_claims = lorenz_curve( + df_test["PurePremium"], y_pred, df_test["Exposure"]) + gini = 1 - 2 * auc(ordered_samples, cum_claims) + label += " (Gini index: {:.3f})".format(gini) + ax.plot(ordered_samples, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +ordered_samples, cum_claims = lorenz_curve( + df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]) +gini = 1 - 2 * auc(ordered_samples, cum_claims) +label = "Oracle (Gini index: {:.3f})".format(gini) +ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", + label=label) + +# Random baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Lorenz Curves", + xlabel=('Fraction of policyholders\n' + '(ordered by model from safest to riskiest)'), + ylabel='Fraction of total claim amount' +) +ax.legend(loc="upper left") +plt.plot() diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py new file mode 100644 index 0000000000000..cb20fda1c022d --- /dev/null +++ b/sklearn/_loss/glm_distribution.py @@ -0,0 +1,355 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from collections import namedtuple +import numbers + +import numpy as np +from scipy.special import xlogy + + +DistributionBoundary = namedtuple("DistributionBoundary", + ("value", "inclusive")) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. + + Methods + ------- + deviance + deviance_derivative + in_y_range + unit_deviance + unit_deviance_derivative + unit_variance + + References + ---------- + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. 
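+
+        Returns
+        -------
+        in_range : array of shape (n_samples,), dtype=bool
+            Element-wise ``True`` if the corresponding value of ``y`` lies
+            within the valid range of the distribution.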
+ """ + # Note that currently supported distributions have +inf upper bound + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError('_lower_bound attribute must be of type ' + 'DistributionBoundary') + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) + else: + return np.greater(y, self._lower_bound.value) + + @abstractmethod + def unit_variance(self, y_pred): + r"""Compute the unit variance function. + + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as + + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} + + See also :func:`variance`. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + + @abstractmethod + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return -2 * (y - y_pred) / self.unit_variance(y_pred) + + def deviance(self, y, y_pred, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, y_pred)) + + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. + + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + y_pred : array, shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. 
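+
+        Returns
+        -------
+        derivative : array of shape (n_samples,)
+            The derivative of the deviance with respect to ``y_pred``, i.e.
+            ``weights * unit_deviance_derivative(y, y_pred)``.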
+ """ + return weights * self.unit_deviance_derivative(y, y_pred) + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (1,2) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float, default=0 + The variance power of the `unit_variance` + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. + For ``0=1.') + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) + else: # pragma: no cover + # this branch should be unreachable. + raise ValueError + + self._power = power + + def unit_variance(self, y_pred): + """Compute the unit variance of a Tweedie distribution + v(y_\textrm{pred})=y_\textrm{pred}**power. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return np.power(y_pred, self.power) + + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. 
+ Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + p = self.power + + if check_input: + message = ("Mean Tweedie deviance error with power={} can only be " + "used on ".format(p)) + if p < 0: + # 'Extreme stable', y any realy number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") + elif p == 0: + # Normal, y and y_pred can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for " + "power<=0 and power>=1.") + elif 1 <= p < 2: + # Poisson and Compount poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): + raise ValueError(message + "non-negative y and strictly " + "positive y_pred.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + if p < 0: + # 'Extreme stable', y any realy number, y_pred > 0 + dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) + + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y - y_pred)**2 + elif p < 1: + raise ValueError("Tweedie deviance is only defined for power<=0 " + "and power>=1.") + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y/y_pred) - y + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (np.log(y_pred/y) + y/y_pred - 1) + else: + dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) + return dev + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super().__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super().__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super().__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super().__init__(power=3) + + +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse-gaussian': InverseGaussianDistribution, +} diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py new file mode 100644 index 0000000000000..cb4c5ae07e4d1 --- /dev/null +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -0,0 +1,112 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad +import pytest + +from sklearn._loss.glm_distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + DistributionBoundary +) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, 
True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with pytest.raises(TypeError, + match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + +def test_tweedie_distribution_power(): + msg = "distribution is only defined for power<=0 and power>=1" + with pytest.raises(ValueError, match=msg): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False + dist.power = 1 + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4)], + ids=lambda x: x.__class__.__name__ +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + np.fmax(rng.rand(10), 0.) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda y_pred: family.deviance(y_true, y_pred), + lambda y_pred: family.deviance_derivative(y_true, y_pred), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 59d0600d508d0..110e0008bccc9 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -7,7 +7,6 @@ # complete documentation. 
from ._base import LinearRegression - from ._bayes import BayesianRidge, ARDRegression from ._least_angle import (Lars, LassoLars, lars_path, lars_path_gram, LarsCV, LassoLarsCV, LassoLarsIC) @@ -15,6 +14,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from ._glm import (PoissonRegressor, + GammaRegressor, TweedieRegressor) from ._huber import HuberRegressor from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from ._stochastic_gradient import SGDClassifier, SGDRegressor @@ -73,4 +74,7 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'PoissonRegressor', + 'GammaRegressor', + 'TweedieRegressor'] diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000..3b5c0d95d6124 --- /dev/null +++ b/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,15 @@ +# License: BSD 3 clause + +from .glm import ( + GeneralizedLinearRegressor, + PoissonRegressor, + GammaRegressor, + TweedieRegressor +) + +__all__ = [ + "GeneralizedLinearRegressor", + "PoissonRegressor", + "GammaRegressor", + "TweedieRegressor" +] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py new file mode 100644 index 0000000000000..8607d6a1828ab --- /dev/null +++ b/sklearn/linear_model/_glm/glm.py @@ -0,0 +1,615 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +import numbers + +import numpy as np +import scipy.optimize + +from ...base import BaseEstimator, RegressorMixin +from ...utils import check_array, check_X_y +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted, _check_sample_weight +from ..._loss.glm_distribution import ( + ExponentialDispersionModel, + TweedieDistribution, + EDM_DISTRIBUTIONS +) +from .link import ( + BaseLink, + IdentityLink, + LogLink, +) + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _y_pred_deviance_derivative(coef, X, y, weights, family, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + y_pred = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * family.deviance_derivative(y, y_pred, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return y_pred, devp + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a penalized Generalized Linear Model (GLM). + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as y_pred=h(X*w). + Therefore, the fit minimizes the following objective function with L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + 1/2 * alpha * |w|_2 + + with inverse link function h and s=sample_weight. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. 
In this case, the design matrix `X` must have full column rank + (no collinearities). + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ + or an ExponentialDispersionModel instance, default='normal' + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ + default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: + + - 'identity' for Normal distribution + - 'log' for Poisson, Gamma and Inverse Gaussian distributions + + solver : 'lbfgs', default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_``. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, + fit_intercept=True, family='normal', link='auto', + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + verbose=0): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : returns an instance of self. 
+ """ + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + self._family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" + "; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class BaseLink + if isinstance(self.link, BaseLink): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={})" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log']; " + "got (link={0})".format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['lbfgs']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + "'lbfgs'; got {0}".format(self.solver)) + solver = self.solver + if (not isinstance(self.max_iter, numbers.Integral) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + + family = self._family_instance + link = self._link_instance + + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=[np.float64, np.float32], + y_numeric=True, multi_output=False) + + weights = _check_sample_weight(sample_weight, X) + + _, n_features = X.shape + + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that X is not rank deficient + + # rescaling of sample_weight + # + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) + weights = weights / weights.sum() + + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + else: + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + + # algorithms for optimization + + 
if solver == 'lbfgs': + def func(coef, X, y, weights, alpha, family, link): + y_pred, devp = _y_pred_deviance_derivative( + coef, X, y, weights, family, link + ) + dev = family.deviance(y, y_pred, weights) + # offset if coef[0] is intercept + offset = 1 if self.fit_intercept else 0 + coef_scaled = alpha * coef[offset:] + obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled) + objp = 0.5 * devp + objp[offset:] += coef_scaled + return obj, objp + + args = (X, y, weights, self.alpha, family, link) + + opt_res = scipy.optimize.minimize( + func, coef, method="L-BFGS-B", jac=True, + options={ + "maxiter": self.max_iter, + "iprint": (self.verbose > 0) - 1, + "gtol": self.tol, + "ftol": 1e3*np.finfo(float).eps, + }, + args=args) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + coef = opt_res.x + + if self.fit_intercept: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0. + self.coef_ = coef + + return self + + def _linear_predictor(self, X): + """Compute the linear_predictor = `X @ coef_ + intercept_`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64, np.float32], ensure_2d=True, + allow_nd=False) + return X @ self.coef_ + self.intercept_ + + def predict(self, X): + """Predict using GLM with feature matrix X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values. + """ + # check_array is done in _linear_predictor + eta = self._linear_predictor(X) + y_pred = self._link_instance.inverse(eta) + return y_pred + + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for ``family='normal'``. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True values of target. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_sample_weight(sample_weight, X) + y_pred = self.predict(X) + dev = self._family_instance.deviance(y, y_pred, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1 - dev / dev_null + + def _more_tags(self): + # create the _family_instance if fit wasn't called yet. 
+ if hasattr(self, '_family_instance'): + _family_instance = self._family_instance + elif isinstance(self.family, ExponentialDispersionModel): + _family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + _family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError + return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Poisson distribution. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. + return "poisson" + + @family.setter + def family(self, value): + if value != "poisson": + raise ValueError("PoissonRegressor.family must be 'poisson'!") + + +class GammaRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Gamma distribution. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . 
+ + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X * coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="gamma", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. + return "gamma" + + @family.setter + def family(self, value): + if value != "gamma": + raise ValueError("GammaRegressor.family must be 'gamma'!") + + +class TweedieRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Tweedie distribution. + + This estimator can be used to model different GLMs depending on the + ``power`` parameter, which determines the underlying distribution. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + power : float, default=0 + The power determines the underlying target distribution according + to the following table: + + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ + + For ``0 < power < 1``, no distribution exists. + + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: + + - 'identity' for Normal distribution + - 'log' for Poisson, Gamma and Inverse Gaussian distributions + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. 
+ """ + def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, + link='auto', max_iter=100, tol=1e-4, + warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family=TweedieDistribution(power=power), link=link, + max_iter=max_iter, tol=tol, + warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # We use a property with a setter to make sure that the family is + # always a Tweedie distribution, and that self.power and + # self.family.power are identical by construction. + dist = TweedieDistribution(power=self.power) + # TODO: make the returned object immutable + return dist + + @family.setter + def family(self, value): + if isinstance(value, TweedieDistribution): + self.power = value.power + else: + raise TypeError("TweedieRegressor.family must be of type " + "TweedieDistribution!") diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py new file mode 100644 index 0000000000000..878d8e835bc42 --- /dev/null +++ b/sklearn/linear_model/_glm/link.py @@ -0,0 +1,110 @@ +""" +Link functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy.special import expit, logit + + +class BaseLink(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def __call__(self, y_pred): + """Compute the link function g(y_pred). + + The link function links the mean y_pred=E[Y] to the so called linear + predictor (X*w), i.e. g(y_pred) = linear predictor. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Usually the (predicted) mean. + """ + + @abstractmethod + def derivative(self, y_pred): + """Compute the derivative of the link g'(y_pred). + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Usually the (predicted) mean. + """ + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + y_pred=E[Y], i.e. h(linear predictor) = y_pred. + + Parameters + ---------- + lin_pred : array of shape (n_samples,) + Usually the (fitted) linear predictor. + """ + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array of shape (n_samples,) + Usually the (fitted) linear predictor. 
+ """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def __call__(self, y_pred): + return y_pred + + def derivative(self, y_pred): + return np.ones_like(y_pred) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + def __call__(self, y_pred): + return np.log(y_pred) + + def derivative(self, y_pred): + return 1 / y_pred + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + def __call__(self, y_pred): + return logit(y_pred) + + def derivative(self, y_pred): + return 1 / (y_pred * (1 - y_pred)) + + def inverse(self, lin_pred): + return expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = expit(lin_pred) + return ep * (1 - ep) diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py new file mode 100644 index 0000000000000..588cf7e93eef0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/__init__.py @@ -0,0 +1 @@ +# License: BSD 3 clause diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py new file mode 100644 index 0000000000000..ece8f09c76acd --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -0,0 +1,431 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +import numpy as np +from numpy.testing import assert_allclose +import pytest +import warnings + +from sklearn.datasets import make_regression +from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model import ( + TweedieRegressor, + PoissonRegressor, + GammaRegressor +) +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, +) +from sklearn._loss.glm_distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, +) +from sklearn.linear_model import Ridge +from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split + + +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor() + + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array + weights = [[0]] + with pytest.raises(ValueError, match="must be 1D array or scalar"): + glm.fit(X, y, weights) + + # 1d but wrong length + weights = [1, 0] + msg = r"sample_weight.shape == \(2,\), expected \(1,\)!" 
+ with pytest.raises(ValueError, match=msg): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('name, instance', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse-gaussian', InverseGaussianDistribution())]) +def test_glm_family_argument(name, instance): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, instance.__class__) + + glm = GeneralizedLinearRegressor(family='not a family') + with pytest.raises(ValueError, match="family must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('name, instance', + [('identity', IdentityLink()), + ('log', LogLink())]) +def test_glm_link_argument(name, instance): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y) + assert isinstance(glm._link_instance, instance.__class__) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError, match="link must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('family, expected_link_class', [ + ('normal', IdentityLink), + ('poisson', LogLink), + ('gamma', LogLink), + ('inverse-gaussian', LogLink), +]) +def test_glm_link_auto(family, expected_link_class): + # Make sure link='auto' delivers the expected link function + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y) + assert isinstance(glm._link_instance, expected_link_class) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError, match="fit_intercept must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', + ['not a solver', 1, [1]]) +def test_glm_solver_argument(solver): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError, match="must be a positive integer"): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError, match="stopping criteria must be positive"): + glm.fit(X, y) + + 
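+# A small additional consistency check, sketched here as an illustration and
+# not part of the original test suite: with family='normal', link='identity'
+# and alpha=0, the GLM reduces to ordinary least squares and should closely
+# agree with LinearRegression on a well-conditioned problem.
+def test_normal_identity_matches_linear_regression():
+    from sklearn.linear_model import LinearRegression
+
+    X, y = make_regression(n_samples=50, n_features=5, noise=0.5,
+                           random_state=0)
+    glm = GeneralizedLinearRegressor(alpha=0, family='normal',
+                                     link='identity', tol=1e-10)
+    glm.fit(X, y)
+    lr = LinearRegression().fit(X, y)
+    assert_allclose(glm.coef_, lr.coef_, rtol=1e-4, atol=1e-6)
+    assert_allclose(glm.intercept_, lr.intercept_, atol=1e-4)
+
+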
+@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError, match="warm_start must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', [False, True]) +def test_glm_identity_regression(fit_intercept): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1., 2.] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=fit_intercept, tol=1e-12) + if fit_intercept: + glm.fit(X[:, 1:], y) + assert_allclose(glm.coef_, coef[1:], rtol=1e-10) + assert_allclose(glm.intercept_, coef[0], rtol=1e-10) + else: + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-12) + + +@pytest.mark.parametrize('fit_intercept', [False, True]) +@pytest.mark.parametrize('alpha', [0.0, 1.0]) +@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma']) +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): + """Test that the impact of sample_weight is consistent""" + rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + glm_params = dict(alpha=alpha, family=family, link='auto', + fit_intercept=fit_intercept) + + glm = GeneralizedLinearRegressor(**glm_params).fit(X, y) + coef = glm.coef_.copy() + + # sample_weight=np.ones(..) should be equivalent to sample_weight=None + sample_weight = np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-12) + + # sample_weight are normalized to 1 so, scaling them has no effect + sample_weight = 2*np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-12) + + # setting one element of sample_weight to 0 is equivalent to removing + # the correspoding sample + sample_weight = np.ones(y.shape) + sample_weight[-1] = 0 + glm.fit(X, y, sample_weight=sample_weight) + coef1 = glm.coef_.copy() + glm.fit(X[:-1], y[:-1]) + assert_allclose(glm.coef_, coef1, rtol=1e-12) + + # check that multiplying sample_weight by 2 is equivalent + # to repeating correspoding samples twice + X2 = np.concatenate([X, X[:n_samples//2]], axis=0) + y2 = np.concatenate([y, y[:n_samples//2]]) + sample_weight_1 = np.ones(len(y)) + sample_weight_1[:n_samples//2] = 2 + + glm1 = GeneralizedLinearRegressor(**glm_params).fit( + X, y, sample_weight=sample_weight_1 + ) + + glm2 = GeneralizedLinearRegressor(**glm_params).fit( + X2, y2, sample_weight=None + ) + assert_allclose(glm1.coef_, glm2.coef_) + + +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) +def test_glm_log_regression(fit_intercept, family): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', + fit_intercept=fit_intercept, tol=1e-7) + if fit_intercept: + res = glm.fit(X[:, 1:], y) + assert_allclose(res.coef_, coef[1:], rtol=1e-6) + assert_allclose(res.intercept_, coef[0], 
rtol=1e-6) + else: + res = glm.fit(X, y) + assert_allclose(res.coef_, coef, rtol=2e-6) + + +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_warm_start(fit_intercept): + n_samples, n_features = 110, 10 + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + + glm1 = GeneralizedLinearRegressor( + warm_start=False, + fit_intercept=fit_intercept, + max_iter=1000 + ) + glm1.fit(X, y) + + glm2 = GeneralizedLinearRegressor( + warm_start=True, + fit_intercept=fit_intercept, + max_iter=1 + ) + # As we intentionally set max_iter=1, L-BFGS-B will issue a + # ConvergenceWarning which we here simply ignore. + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=ConvergenceWarning) + glm2.fit(X, y) + assert glm1.score(X, y) > glm2.score(X, y) + glm2.set_params(max_iter=1000) + glm2.fit(X, y) + # The two model are not exactly identical since the lbfgs solver + # computes the approximate hessian from previous iterations, which + # will not be strictly identical in the case of a warm start. + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + + +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('sample_weight', [None, True]) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, + sample_weight, request): + """Compare with Ridge regression for Normal distributions.""" + test_size = 10 + X, y = make_regression(n_samples=n_samples + test_size, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7} + + X_train, X_test, y_train, y_test, = train_test_split( + X, y, test_size=test_size, random_state=0 + ) + + alpha = 1.0 + if sample_weight is None: + sw_train = None + alpha_ridge = alpha * n_samples + else: + sw_train = np.random.RandomState(0).rand(len(y_train)) + alpha_ridge = alpha * sw_train.sum() + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha_ridge, normalize=False, + random_state=42, fit_intercept=fit_intercept, + **ridge_params) + ridge.fit(X_train, y_train, sample_weight=sw_train) + + glm = GeneralizedLinearRegressor(alpha=alpha, family='normal', + link='identity', + fit_intercept=fit_intercept, + max_iter=300, + tol=1e-5) + glm.fit(X_train, y_train, sample_weight=sw_train) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) + + +def test_poisson_glmnet(): + """Compare Poisson regression with L2 regularization and LogLink to glmnet + """ + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, + fit_intercept=True, 
family='poisson', + link='log', tol=1e-7, + max_iter=300) + glm.fit(X, y) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) + + +def test_convergence_warning(regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) + + +def test_poisson_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. in a grid search + est = PoissonRegressor() + est.family == "poisson" + + msg = "PoissonRegressor.family must be 'poisson'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_gamma_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. in a grid search + est = GammaRegressor() + est.family == "gamma" + + msg = "GammaRegressor.family must be 'gamma'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_tweedie_regression_family(regression_data): + # Make sure the family attribute is always a TweedieDistribution and that + # the power attribute is properly updated + power = 2.0 + est = TweedieRegressor(power=power) + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == power + assert est.power == power + + new_power = 0 + new_family = TweedieDistribution(power=new_power) + est.family = new_family + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == new_power + assert est.power == new_power + + msg = "TweedieRegressor.family must be of type TweedieDistribution!" + with pytest.raises(TypeError, match=msg): + est.family = None + + +@pytest.mark.parametrize( + 'estimator, value', + [ + (PoissonRegressor(), True), + (GammaRegressor(), True), + (TweedieRegressor(power=1.5), True), + (TweedieRegressor(power=0), False) + ], +) +def test_tags(estimator, value): + assert estimator._get_tags()['requires_positive_y'] is value diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py new file mode 100644 index 0000000000000..27ec4ed19bdc2 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -0,0 +1,45 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import assert_allclose +import pytest +from scipy.optimize import check_grad + +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, + LogitLink, +) + + +LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_properties(Link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100) * 100 + link = Link() + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + assert_allclose(link(link.inverse(x)), x) + # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) + # g = link, h = link.inverse + assert_allclose(link.derivative(link.inverse(x)), + 1 / link.inverse_derivative(x)) + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_derivative(Link): + link = Link() + x = np.random.RandomState(0).rand(1) + err = check_grad(link, link.derivative, x) / link.derivative(x) + assert abs(err) < 1e-6 + + err = (check_grad(link.inverse, link.inverse_derivative, x) + / link.derivative(x)) + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/setup.py 
b/sklearn/linear_model/setup.py index 121b449d673d0..d0c9e8c04c16d 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') + config.add_subpackage('_glm') + config.add_subpackage('_glm/tests') return config diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 61120c578094e..6026a5293806a 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -22,11 +22,10 @@ # Christian Lorentzen # License: BSD 3 clause - import numpy as np -from scipy.special import xlogy import warnings +from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d @@ -669,7 +668,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): y_pred : array-like of shape (n_samples,) Estimated target values. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. power : float, default=0 @@ -714,47 +713,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - message = ("Mean Tweedie deviance error with power={} can only be used on " - .format(power)) - if power < 0: - # 'Extreme stable', y_true any real number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - dev = 2 * (np.power(np.maximum(y_true, 0), 2 - power) - / ((1 - power) * (2 - power)) - - y_true * np.power(y_pred, 1 - power)/(1 - power) - + np.power(y_pred, 2 - power)/(2 - power)) - elif power == 0: - # Normal distribution, y_true and y_pred any real number - dev = (y_true - y_pred)**2 - elif power < 1: - raise ValueError("Tweedie deviance is only defined for power<=0 and " - "power>=1.") - elif power == 1: - # Poisson distribution, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) - elif power == 2: - # Gamma distribution, y_true and y_pred > 0 - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and y_pred.") - dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1) - else: - if power < 2: - # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - else: - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and " - "y_pred.") - - dev = 2 * (np.power(y_true, 2 - power)/((1 - power) * (2 - power)) - - y_true * np.power(y_pred, 1 - power)/(1 - power) - + np.power(y_pred, 2 - power)/(2 - power)) + dist = TweedieDistribution(power=power) + dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) @@ -763,7 +723,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with - the power parameter `p=1`. + the power parameter `power=1`. Read more in the :ref:`User Guide `. 
@@ -775,7 +735,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -800,7 +760,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with - the power parameter `p=2`. It is invariant to scaling of + the power parameter `power=2`. It is invariant to scaling of the target variable, and measures relative errors. Read more in the :ref:`User Guide `. @@ -813,7 +773,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index b1bf018c7eba7..06c44b2b6f59e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -126,27 +126,27 @@ def test_regression_metrics_at_limits(): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) diff --git a/sklearn/setup.py b/sklearn/setup.py index cc257c30e6f43..e759cdabc88ee 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -53,6 +53,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('experimental/tests') config.add_subpackage('ensemble/_hist_gradient_boosting') config.add_subpackage('ensemble/_hist_gradient_boosting/tests') + config.add_subpackage('_loss/') + config.add_subpackage('_loss/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From ea3181879c43f7423c32007111285d1f725a578b Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Wed, 4 Mar 2020 16:26:50 +0100 Subject: [PATCH 436/448] FIX Adress decomposition.PCA mle option problem (#16224) --- doc/whats_new/v0.23.rst | 4 ++ sklearn/decomposition/_pca.py | 21 ++++++-- sklearn/decomposition/tests/test_pca.py | 65 +++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 04bf06c3cc6ee..806e0b8cb354c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -133,6 +133,10 @@ Changelog - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` +- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small + eigenvalues. :pr: `4441` by :user:`Lisa Schwetlick `, and + :user:`Gelavizh Ahmadi ` and + :user:`Marija Vlajic Wheeler `. - |Enhancement| :class:`decomposition.NMF` and :func:`decomposition.non_negative_factorization` now preserves float32 dtype. diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index f64a9752896b3..7a0140b01fc9b 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -27,7 +27,7 @@ from ..utils.validation import check_is_fitted -def _assess_dimension_(spectrum, rank, n_samples, n_features): +def _assess_dimension(spectrum, rank, n_samples, n_features): """Compute the likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, @@ -58,6 +58,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): raise ValueError("The tested rank cannot exceed the rank of the" " dataset") + spectrum_threshold = np.finfo(type(spectrum[0])).eps + pu = -rank * log(2.) for i in range(rank): pu += (gammaln((n_features - i) / 2.) - @@ -67,10 +69,14 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): pl = -pl * n_samples / 2. if rank == n_features: + # TODO: this line is never executed because _infer_dimension's + # for loop is off by one pv = 0 v = 1 else: v = np.sum(spectrum[rank:]) / (n_features - rank) + if spectrum_threshold > v: + return -np.inf pv = -np.log(v) * n_samples * (n_features - rank) / 2. m = n_features * rank - rank * (rank + 1.) / 2. @@ -80,6 +86,13 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): + if spectrum_[i] < spectrum_threshold: + # TODO: this line is never executed + # (off by one in _infer_dimension) + # this break only happens when rank == n_features and + # spectrum_[i] < spectrum_threshold, otherwise the early return + # above catches this case. + break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * (1. / spectrum_[j] - 1. 
/ spectrum_[i])) + log(n_samples) @@ -89,7 +102,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): return ll -def _infer_dimension_(spectrum, n_samples, n_features): +def _infer_dimension(spectrum, n_samples, n_features): """Infers the dimension of a dataset of shape (n_samples, n_features) The dataset is described by its spectrum `spectrum`. @@ -97,7 +110,7 @@ def _infer_dimension_(spectrum, n_samples, n_features): n_spectrum = len(spectrum) ll = np.empty(n_spectrum) for rank in range(n_spectrum): - ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features) + ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features) return ll.argmax() @@ -458,7 +471,7 @@ def _fit_full(self, X, n_components): # Postprocess the number of components required if n_components == 'mle': n_components = \ - _infer_dimension_(explained_variance_, n_samples, n_features) + _infer_dimension(explained_variance_, n_samples, n_features) elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index b94d2d5be7e0f..438478a55f6fa 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -8,8 +8,8 @@ from sklearn import datasets from sklearn.decomposition import PCA from sklearn.datasets import load_iris -from sklearn.decomposition._pca import _assess_dimension_ -from sklearn.decomposition._pca import _infer_dimension_ +from sklearn.decomposition._pca import _assess_dimension +from sklearn.decomposition._pca import _infer_dimension iris = datasets.load_iris() PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto'] @@ -333,7 +333,7 @@ def test_infer_dim_1(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)]) + ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)]) assert ll[1] > ll.max() - .01 * n @@ -348,7 +348,7 @@ def test_infer_dim_2(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 1 + assert _infer_dimension(spect, n, p) > 1 def test_infer_dim_3(): @@ -361,7 +361,7 @@ def test_infer_dim_3(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 2 + assert _infer_dimension(spect, n, p) > 2 @pytest.mark.parametrize( @@ -568,3 +568,58 @@ def test_pca_n_components_mostly_explained_variance_ratio(): n_components = pca1.explained_variance_ratio_.cumsum()[-2] pca2 = PCA(n_components=n_components).fit(X, y) assert pca2.n_components_ == X.shape[1] + + +def test_infer_dim_bad_spec(): + # Test a spectrum that drops to near zero for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + ret = _infer_dimension(spectrum, n_samples, n_features) + assert ret == 0 + + +def test_assess_dimension_error_rank_greater_than_features(): + # Test error when tested rank is greater than the number of features + # for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 4 + rank = 5 + with pytest.raises(ValueError, match="The tested rank cannot exceed " + "the rank of the dataset"): + _assess_dimension(spectrum, rank, n_samples, n_features) + + +def test_assess_dimension_small_eigenvalues(): + # Test tiny eigenvalues appropriately when 
using 'mle' + # for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + rank = 3 + ret = _assess_dimension(spectrum, rank, n_samples, n_features) + assert ret == -np.inf + + +def test_infer_dim_mle(): + # Test small eigenvalues when 'mle' with pathological 'X' dataset + # for PR #16224 + X, _ = datasets.make_classification(n_informative=1, n_repeated=18, + n_redundant=1, n_clusters_per_class=1, + random_state=42) + pca = PCA(n_components='mle').fit(X) + assert pca.n_components_ == 0 + + +def test_fit_mle_too_few_samples(): + # Tests that an error is raised when the number of samples is smaller + # than the number of features during an mle fit for PR #16224 + X, _ = datasets.make_classification(n_samples=20, n_features=21, + random_state=42) + + pca = PCA(n_components='mle', svd_solver='full') + with pytest.raises(ValueError, match="n_components='mle' is only " + "supported if " + "n_samples >= n_features"): + pca.fit(X) From 1cdf6089c91663fa4bc5a9440f8d46b7882d5744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 4 Mar 2020 17:29:39 +0100 Subject: [PATCH 437/448] DOC add 0.22.2 in website news (#16631) --- doc/templates/index.html | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/templates/index.html b/doc/templates/index.html index 0f43677e668f5..e17111fb48eef 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -158,6 +158,7 @@

      News

     Scikit-learn from 0.23 requires Python 3.6 or greater.
+    March 2020. scikit-learn 0.22.2 is available for download (Changelog).
     January 2020. scikit-learn 0.22.1 is available for download (Changelog).
     December 2019. scikit-learn 0.22 is available for download (Changelog).
    5. From c3969f43d9e4ba66d7f211d3835bb4cbe616c12b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 4 Mar 2020 22:33:59 +0100 Subject: [PATCH 438/448] TST Enable california_housing pandas test in cron job (#16547) --- sklearn/datasets/tests/conftest.py | 14 ++++++++++++++ .../datasets/tests/test_california_housing.py | 19 ++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index 85242d7335685..fdb9516e62a27 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -1,5 +1,6 @@ """ Network tests are only run, if data is already locally available, or if download is specifically requested by environment variable.""" +import builtins from os import environ import pytest from sklearn.datasets import fetch_20newsgroups @@ -59,3 +60,16 @@ def fetch_olivetti_faces_fxt(): @pytest.fixture def fetch_rcv1_fxt(): return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') + + +@pytest.fixture +def hide_available_pandas(monkeypatch): + """ Pretend pandas was not installed. """ + import_orig = builtins.__import__ + + def mocked_import(name, *args, **kwargs): + if name == 'pandas': + raise ImportError() + return import_orig(name, *args, **kwargs) + + monkeypatch.setattr(builtins, '__import__', mocked_import) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index af1e1ff1370e1..a8c5514e2ec73 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -27,14 +27,11 @@ def test_fetch_asframe(fetch_california_housing_fxt): assert isinstance(bunch.target, pd.Series) -def test_pandas_dependency_message(fetch_california_housing_fxt): - try: - import pandas # noqa - pytest.skip("This test requires pandas to be not installed") - except ImportError: - # Check that pandas is imported lazily and that an informative error - # message is raised when pandas is missing: - expected_msg = ('fetch_california_housing with as_frame=True' - ' requires pandas') - with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing_fxt(as_frame=True) +def test_pandas_dependency_message(fetch_california_housing_fxt, + hide_available_pandas): + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing_fxt(as_frame=True) From df338cddc8094bdb226c7ec4cd4233ac5cffa806 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 11:55:25 +0100 Subject: [PATCH 439/448] EXA align lorenz curves between the two examples with GLMs (#16640) --- .../plot_poisson_regression_non_normal_loss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index ee863dd4198ba..4b0386edfcdf6 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -393,11 +393,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # # To compare the 3 models within this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by -# the model 
predictions, from riskiest to safest according to each model: +# the model predictions, from safest to riskiest according to each model: def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + idx_sort = np.argsort(y_pred) # from safest to riskiest sorted_exposure = exposure[idx_sort] sorted_frequencies = y_true[idx_sort] cumulated_exposure = np.cumsum(sorted_exposure) @@ -434,10 +434,10 @@ def _cumulated_claims(y_true, y_pred, exposure): label="Random baseline") ax.set( title="Cumulated number of claims by model", - xlabel='Fraction of exposure (from riskiest to safest)', + xlabel='Fraction of exposure (from safest to riskiest)', ylabel='Fraction of number of claims' ) -ax.legend(loc="lower right") +ax.legend(loc="upper left") ############################################################################## # This plot reveals that the random forest model is slightly better at ranking From 83ec08bfbf27b9bc4340d7a159e5507e6abc2282 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Thu, 5 Mar 2020 21:53:00 +0100 Subject: [PATCH 440/448] DOC update n_jobs description in DBSCAN (#16615) Co-authored-by: JohanWork --- sklearn/cluster/_dbscan.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 6a33f411886b0..52c962052f9bc 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -83,10 +83,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Note that weights are absolute, and default to 1. n_jobs : int or None, optional (default=None) - The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. + The number of parallel jobs to run for neighbors search. ``None`` means + 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means + using all processors. See :term:`Glossary ` for more details. + If precomputed distance are used, parallel execution is not available + and thus n_jobs will have no effect. Returns ------- From f1acf834685f8bcd1bcdd903e9c40b7515fe0a67 Mon Sep 17 00:00:00 2001 From: wderose Date: Fri, 6 Mar 2020 13:51:32 -0800 Subject: [PATCH 441/448] FIX Pass sample_weight when predicting on stacked folds (#16539) --- doc/whats_new/v0.23.rst | 7 +++++++ sklearn/ensemble/_stacking.py | 26 ++++++++++--------------- sklearn/ensemble/tests/test_stacking.py | 14 +++++++++++++ sklearn/utils/_mocking.py | 1 + 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 806e0b8cb354c..90c99e6c04c19 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -182,6 +182,13 @@ Changelog used during `fit`. :pr:`16437` by :user:`Jin-Hwan CHO `. +- |Fix| Fixed a bug in :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` where the `sample_weight` + argument was not being passed to `cross_val_predict` when + evaluating the base estimators on cross-validation folds + to obtain the input to the meta estimator. + :pr:`16539` by :user:`Bill DeRose `. + :mod:`sklearn.feature_extraction` ................................. 
diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index cd18a684a4518..ba817613523f6 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -122,6 +122,10 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. versionchanged:: 0.23 + when not None, `sample_weight` is passed to all underlying + estimators + Returns ------- self : object @@ -166,10 +170,13 @@ def fit(self, X, y, sample_weight=None): self._method_name(name, est, meth) for name, est, meth in zip(names, all_estimators, stack_method) ] - + fit_params = ({"sample_weight": sample_weight} + if sample_weight is not None + else None) predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, + fit_params=fit_params, verbose=self.verbose) for est, meth in zip(all_estimators, self.stack_method_) if est != 'drop' @@ -183,21 +190,8 @@ def fit(self, X, y, sample_weight=None): ] X_meta = self._concatenate_predictions(X, predictions) - if sample_weight is not None: - try: - self.final_estimator_.fit( - X_meta, y, sample_weight=sample_weight - ) - except TypeError as exc: - if "unexpected keyword argument 'sample_weight'" in str(exc): - raise TypeError( - "Underlying estimator {} does not support sample " - "weights." - .format(self.final_estimator_.__class__.__name__) - ) from exc - raise - else: - self.final_estimator_.fit(X_meta, y) + _fit_single_estimator(self.final_estimator_, X_meta, y, + sample_weight=sample_weight) return self diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1eff7ba5f7de7..f8a3f290e96b5 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -38,6 +38,7 @@ from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold +from sklearn.utils._mocking import CheckingClassifier from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import ignore_warnings @@ -439,6 +440,19 @@ def test_stacking_with_sample_weight(stacker, X, y): assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 +def test_stacking_classifier_sample_weight_fit_param(): + # check sample_weight is passed to all invocations of fit + stacker = StackingClassifier( + estimators=[ + ('lr', CheckingClassifier(expected_fit_params=['sample_weight'])) + ], + final_estimator=CheckingClassifier( + expected_fit_params=['sample_weight'] + ) + ) + stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0])) + + @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize( "stacker, X, y", diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 25b60f7955b99..cff4183ea9bc4 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -95,6 +95,7 @@ def fit(self, X, y, **fit_params): assert self.check_X(X) if self.check_y is not None: assert self.check_y(y) + self.n_features_in_ = len(X) self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: From eb3f5dfe562b15e507c1f8e3ab16848ec2ab6f84 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Mar 2020 06:37:53 -0400 Subject: [PATCH 442/448] BLD Turns off memory_profiler in examples to fix CircleCI (#16629) --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/doc/conf.py b/doc/conf.py index b7eb635b15f40..d8350a9713ebd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -297,7 +297,7 @@ def __call__(self, directory): sphinx_gallery_conf = { 'doc_module': 'sklearn', 'backreferences_dir': os.path.join('modules', 'generated'), - 'show_memory': True, + 'show_memory': False, 'reference_url': { 'sklearn': None}, 'examples_dirs': ['../examples'], From 018c6dc57d21c89c7d1278c686c7d5d62f32ee48 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Mar 2020 06:38:26 -0400 Subject: [PATCH 443/448] BLD Updates osx vm image in azure pipelines (#16647) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index df504a4ab3bf7..1aad015849b2e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -133,7 +133,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: macOS - vmImage: xcode9-macos10.13 + vmImage: macOS-10.14 dependsOn: [linting] matrix: pylatest_conda_mkl: From b189bf60708af22dde82a00aca7b5a54290b666d Mon Sep 17 00:00:00 2001 From: Maura Pintor Date: Tue, 10 Mar 2020 05:24:26 +0100 Subject: [PATCH 444/448] FIX: normalizer l_inf should take maximum of absolute values (#16633) --- doc/whats_new/v0.23.rst | 5 +++++ sklearn/preprocessing/_data.py | 11 +++++++---- sklearn/preprocessing/tests/test_data.py | 23 ++++++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 90c99e6c04c19..e465b94f104ba 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -357,6 +357,11 @@ Changelog computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. +- |Fix| Fix a bug in :class:`preprocessing.Normalizer` with norm='max', + which was not taking the absolute value of the maximum values before + normalizing the vectors. :pr:`16632` by + :user:`Maura Pintor ` and :user:`Battista Biggio `. + :mod:`sklearn.svm` .................. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 72ad6bacd43b4..33e2bac562489 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1718,7 +1718,8 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': - _, norms = min_max_axis(X, 1) + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) norms_elementwise = norms.repeat(np.diff(X.indptr)) mask = norms_elementwise != 0 X.data[mask] /= norms_elementwise[mask] @@ -1728,7 +1729,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': norms = row_norms(X) elif norm == 'max': - norms = np.max(X, axis=1) + norms = np.max(abs(X), axis=1) norms = _handle_zeros_in_scale(norms, copy=False) X /= norms[:, np.newaxis] @@ -1746,7 +1747,7 @@ class Normalizer(TransformerMixin, BaseEstimator): Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so - that its norm (l1 or l2) equals one. + that its norm (l1, l2 or inf) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of @@ -1763,7 +1764,9 @@ class Normalizer(TransformerMixin, BaseEstimator): Parameters ---------- norm : 'l1', 'l2', or 'max', optional ('l2' by default) - The norm to use to normalize each non zero sample. 
+ The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 95721a0508091..7999df083631c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1947,7 +1947,7 @@ def test_normalizer_max(): X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): - row_maxs = X_norm.max(axis=1) + row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) @@ -1966,6 +1966,27 @@ def test_normalizer_max(): assert_almost_equal(la.norm(X_norm[3]), 0.0) +def test_normalizer_max_sign(): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = sparse.csr_matrix(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm='max') + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal( + np.sign(X_norm), np.sign(toarray(X))) + + def test_normalize(): # Test normalize function # Only tests functionality not used by the tests for Normalizer. From f763c614f44193d149f3d64a1e8a1eac3fa9f898 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 09:58:55 +0100 Subject: [PATCH 445/448] ENH Add check for non binary variables in OneHotEncoder. (#16585) Co-authored-by: Thomas J Fan Co-authored-by: Guillaume Lemaitre --- sklearn/preprocessing/_encoders.py | 47 ++++++++++---------- sklearn/preprocessing/tests/test_encoders.py | 27 ++++++++--- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3be4540498591..86be9d335bd9e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder): (if any). drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to - be dropped for each feature. - ``drop_idx_[i] = -1`` if no category is to be dropped from the feature - with index ``i``, e.g. when `drop='if_binary'` and the feature isn't - binary - - ``drop_idx_ = None`` if all the transformed features will be retained. + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. 
See Also -------- @@ -316,10 +316,10 @@ def _compute_drop_idx(self): return None elif isinstance(self.drop, str): if self.drop == 'first': - return np.zeros(len(self.categories_), dtype=np.int_) + return np.zeros(len(self.categories_), dtype=np.object) elif self.drop == 'if_binary': - return np.array([0 if len(cats) == 2 else -1 - for cats in self.categories_], dtype=np.int_) + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=np.object) else: msg = ( "Wrong input for parameter `drop`. Expected " @@ -354,7 +354,8 @@ def _compute_drop_idx(self): raise ValueError(msg) return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in - zip(self.drop, self.categories_)], dtype=np.int_) + zip(self.drop, self.categories_)], + dtype=np.object) def fit(self, X, y=None): """ @@ -421,7 +422,7 @@ def transform(self, X): n_samples, n_features = X_int.shape - if self.drop is not None: + if self.drop_idx_ is not None: to_drop = self.drop_idx_.copy() # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. @@ -431,7 +432,7 @@ def transform(self, X): n_cats = len(cats) # drop='if_binary' but feature isn't binary - if to_drop[i] == -1: + if to_drop[i] is None: # set to cardinality to not drop from X_int to_drop[i] = n_cats n_values.append(n_cats) @@ -484,16 +485,14 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - if self.drop is None: + if self.drop_idx_ is None: n_transformed_features = sum(len(cats) for cats in self.categories_) - elif isinstance(self.drop, str) and self.drop == 'if_binary': - n_transformed_features = sum(1 if len(cats) == 2 - else len(cats) - for cats in self.categories_) else: - n_transformed_features = sum(len(cats) - 1 - for cats in self.categories_) + n_transformed_features = sum( + len(cats) - 1 if to_drop is not None else len(cats) + for cats, to_drop in zip(self.categories_, self.drop_idx_) + ) # validate shape of passed X msg = ("Shape of the passed X data is not correct. Expected {0} " @@ -509,7 +508,7 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - if self.drop is None: + if self.drop_idx_ is None or self.drop_idx_[i] is None: cats = self.categories_[i] else: cats = np.delete(self.categories_[i], self.drop_idx_[i]) @@ -532,9 +531,9 @@ def inverse_transform(self, X): if unknown.any(): found_unknown[i] = unknown # drop will either be None or handle_unknown will be error. 
If - # self.drop is not None, then we can safely assume that all of + # self.drop_idx_ is not None, then we can safely assume that all of # the nulls in each column are the dropped value - elif self.drop is not None: + elif self.drop_idx_ is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] @@ -581,7 +580,7 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if self.drop is not None: + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 2a872c2e06c49..7e23aa2d485c2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -268,6 +268,22 @@ def test_one_hot_encoder_inverse_if_binary(): assert_array_equal(ohe.inverse_transform(X_tr), X) +# check that resetting drop option without refitting does not throw an error +@pytest.mark.parametrize('drop', ['if_binary', 'first', None]) +@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + X = np.array([['Male', 1], + ['Female', 3], + ['Female', 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names(), feature_names) + @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @pytest.mark.parametrize("X", [ [1, 2], @@ -388,8 +404,9 @@ def test_one_hot_encoder_pandas(): @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), + ('if_binary', ['x0_c', 'x1_2', 'x2_b']), (['c', 2, 'b'], ['x0_b', 'x2_a'])], - ids=['first', 'manual']) + ids=['first', 'binary', 'manual']) def test_one_hot_encoder_feature_names_drop(drop, expected_names): X = [['c', 2, 'a'], ['b', 2, 'b']] @@ -409,7 +426,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 0., 0., 1.], [0., 1., 0., 0.], [0., 0., 1., 1.]]) - expected_drop_idx = np.array([-1, 0]) + expected_drop_idx = np.array([None, 0]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -423,7 +440,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 1.], [0., 1.], [0., 1.]]) - expected_drop_idx = np.array([0, -1]) + expected_drop_idx = np.array([0, None]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -662,9 +679,9 @@ def test_categories(density, drop): for drop_cat, drop_idx, cat_list in zip(drop, ohe_test.drop_idx_, ohe_test.categories_): - assert cat_list[drop_idx] == drop_cat + assert cat_list[int(drop_idx)] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) - assert ohe_test.drop_idx_.dtype == np.int_ + assert ohe_test.drop_idx_.dtype == np.object @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) From ae159ecee2850c690960e4d64f9e03c72994e2f2 Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 14:45:58 +0530 Subject: [PATCH 446/448] DOC Update LICENSE Year (#16660) --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/COPYING b/COPYING index 0f665f8400d08..b98af18710185 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ New BSD License -Copyright (c) 2007–2019 The scikit-learn developers. +Copyright (c) 2007–2020 The scikit-learn developers. All rights reserved. From 535ef5516bce75c6a51127da95dcb577af1fe35e Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Tue, 10 Mar 2020 09:52:47 +0000 Subject: [PATCH 447/448] BUG Fix issue with KernelPCA.inverse_transform (#16655) --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/decomposition/_kernel_pca.py | 3 ++- sklearn/decomposition/tests/test_kernel_pca.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e465b94f104ba..d0c568956a353 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -142,6 +142,10 @@ Changelog :func:`decomposition.non_negative_factorization` now preserves float32 dtype. :pr:`16280` by :user:`Jeremie du Boisberranger `. +- |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now + applies the correct inverse transform to the transformed data. :pr:`16655` + by :user:`Lewis Ball `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index b1f83c8e0ff81..6f15ebc29f761 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -358,5 +358,6 @@ def inverse_transform(self, X): "the inverse transform is not available.") K = self._get_kernel(X, self.X_transformed_fit_) - + n_samples = self.X_transformed_fit_.shape[0] + K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index a08ae0cb7a43a..a7a9547bfa33a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -7,6 +7,7 @@ from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles +from sklearn.datasets import make_blobs from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV @@ -282,3 +283,15 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_)) + + +@pytest.mark.parametrize("kernel", + ["linear", "poly", "rbf", "sigmoid", "cosine"]) +def test_kernel_pca_inverse_transform(kernel): + X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]], + random_state=0) + + kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True) + X_trans = kp.fit_transform(X) + X_inv = kp.inverse_transform(X_trans) + assert_allclose(X, X_inv) From 9135e0b8c1ed9244c0fe4f0aff65740f9a083a20 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 10 Mar 2020 10:53:44 -0400 Subject: [PATCH 448/448] BUG Fixes histgradientboosting bug --- .../_hist_gradient_boosting/gradient_boosting.py | 2 ++ .../tests/test_gradient_boosting.py | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e18d3ac4b1f9b..5db39f07c7ce1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py 
@@ -396,6 +396,8 @@ def fit(self, X, y, sample_weight=None): self.bin_mapper_.missing_values_bin_idx_ ) ) + else: + raw_predictions_val = None should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, sample_weight_train, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index c5b4a143591d6..88ac63f7d05c9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -645,3 +645,16 @@ def test_max_depth_max_leaf_nodes(): tree = est._predictors[0][0] assert tree.get_max_depth() == 2 assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix + + +def test_early_stopping_on_test_set_with_warm_start(): + # Non regression test for #16661 where second fit fails with + # warm_start=True, early_stopping is on, and no validation set + X, y = make_classification(random_state=0) + gb = HistGradientBoostingClassifier( + max_iter=1, scoring='loss', warm_start=True, early_stopping=True, + n_iter_no_change=1, validation_fraction=None) + + gb.fit(X, y) + # does not raise on second call + gb.fit(X, y)
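
As an aside from the patches themselves, the following is a minimal, illustrative sketch of the user-facing scenario addressed by this last fix (issue #16661). It is not part of the patch series: it assumes a scikit-learn build that already contains the change above, and it uses the experimental opt-in import that these estimators still require in this release series; the data and variable names are illustrative only.

# Hedged reproduction sketch for the warm-start + early-stopping fix above.
# The estimator and parameter values mirror the regression test added in the
# patch; everything else is illustrative.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=0)
clf = HistGradientBoostingClassifier(
    max_iter=5, warm_start=True, early_stopping=True,
    scoring='loss', n_iter_no_change=2, validation_fraction=None)
clf.fit(X, y)
clf.set_params(max_iter=10)
clf.fit(X, y)  # with the fix, this second warm-started fit no longer raises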