diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst
index 65a18bca9f11e..a09cc95c5efe7 100644
--- a/doc/modules/kernel_approximation.rst
+++ b/doc/modules/kernel_approximation.rst
@@ -64,8 +64,8 @@ a linear algorithm, for example a linear SVM::
     SGDClassifier(alpha=0.0001, average=False, class_weight=None,
            early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
            l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
-           n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
-           power_t=0.5, random_state=None, shuffle=True, tol=None,
+           n_iter_no_change=5, n_jobs=None, penalty='l2',
+           power_t=0.5, random_state=None, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False)
     >>> clf.score(X_features, y)
     1.0
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index cfbfda371cd12..8cd0ba7560ecd 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -1305,7 +1305,7 @@ This way, we can solve the XOR problem with a linear classifier::
            [1, 0, 1, 0],
            [1, 1, 0, 0],
            [1, 1, 1, 1]])
-    >>> clf = Perceptron(fit_intercept=False, max_iter=10, tol=None,
+    >>> clf = Perceptron(fit_intercept=False, max_iter=10, tol=0.001,
     ...                  shuffle=False).fit(X, y)

 And the classifier "predictions" are perfect::
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index 79d915fa1e2df..7cfad5baca7f9 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -58,8 +58,8 @@ class RBFSampler(BaseEstimator, TransformerMixin):
     SGDClassifier(alpha=0.0001, average=False, class_weight=None,
            early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
            l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
-           n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
-           power_t=0.5, random_state=None, shuffle=True, tol=None,
+           n_iter_no_change=5, n_jobs=None, penalty='l2',
+           power_t=0.5, random_state=None, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False)
     >>> clf.score(X_features, y)
     1.0
@@ -167,8 +167,8 @@ class SkewedChi2Sampler(BaseEstimator, TransformerMixin):
     SGDClassifier(alpha=0.0001, average=False, class_weight=None,
            early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
            l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,
-           n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
-           power_t=0.5, random_state=None, shuffle=True, tol=None,
+           n_iter_no_change=5, n_jobs=None, penalty='l2',
+           power_t=0.5, random_state=None, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False)
     >>> clf.score(X_features, y)
     1.0
@@ -287,8 +287,8 @@ class AdditiveChi2Sampler(BaseEstimator, TransformerMixin):
     SGDClassifier(alpha=0.0001, average=False, class_weight=None,
            early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
            l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
-           n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
-           power_t=0.5, random_state=0, shuffle=True, tol=None,
+           n_iter_no_change=5, n_jobs=None, penalty='l2',
+           power_t=0.5, random_state=0, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False)
     >>> clf.score(X_transformed, y) # doctest: +ELLIPSIS
     0.9543...
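Note: the doctest updates above only reflect the new constructor defaults, not a behavioural change in the examples themselves. A minimal sketch of what the new defaults look like from user code (assuming a scikit-learn build with this patch applied)::

    from sklearn.linear_model import SGDClassifier

    # No explicit max_iter/tol: the new defaults apply directly, and no
    # FutureWarning about unset parameters is emitted any more.
    clf = SGDClassifier()
    params = clf.get_params()
    print(params['max_iter'])  # 1000
    print(params['tol'])       # 0.001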
diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py
index 22f1c0fbba121..8efd34859289c 100644
--- a/sklearn/linear_model/passive_aggressive.py
+++ b/sklearn/linear_model/passive_aggressive.py
@@ -21,18 +21,16 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered.

-    max_iter : int, optional
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         `partial_fit`.
-        Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None.

         .. versionadded:: 0.19

-    tol : float or None, optional
+    tol : float or None, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
-        when (loss > previous_loss - tol). Defaults to None.
-        Defaults to 1e-3 from 0.21.
+        when (loss > previous_loss - tol).

         .. versionadded:: 0.19

@@ -113,13 +111,6 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
         .. versionadded:: 0.19
            parameter *average* to use weights averaging in SGD

-    n_iter : int, optional
-        The number of passes over the training data (aka epochs).
-        Defaults to None. Deprecated, will be removed in 0.21.
-
-        .. versionchanged:: 0.19
-            Deprecated
-
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
@@ -143,13 +134,13 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
     >>> clf.fit(X, y)
     PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                   early_stopping=False, fit_intercept=True, loss='hinge',
-                  max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=None,
-                  random_state=0, shuffle=True, tol=None,
+                  max_iter=1000, n_iter_no_change=5, n_jobs=None,
+                  random_state=0, shuffle=True, tol=0.001,
                   validation_fraction=0.1, verbose=0, warm_start=False)
     >>> print(clf.coef_)
-    [[0.29509834 0.33711843 0.56127352 0.60105546]]
+    [[-0.6543424 1.54603022 1.35361642 0.22199435]]
     >>> print(clf.intercept_)
-    [2.54153383]
+    [0.63310933]
     >>> print(clf.predict([[0, 0, 0, 0]]))
     [1]

@@ -166,11 +157,11 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
     K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)

     """
-    def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
+    def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3,
                  early_stopping=False, validation_fraction=0.1,
                  n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge",
                  n_jobs=None, random_state=None, warm_start=False,
-                 class_weight=None, average=False, n_iter=None):
+                 class_weight=None, average=False):
         super(PassiveAggressiveClassifier, self).__init__(
             penalty=None,
             fit_intercept=fit_intercept,
@@ -186,8 +177,7 @@ def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
             warm_start=warm_start,
             class_weight=class_weight,
             average=average,
-            n_jobs=n_jobs,
-            n_iter=n_iter)
+            n_jobs=n_jobs)
         self.C = C
         self.loss = loss

@@ -275,18 +265,16 @@ class PassiveAggressiveRegressor(BaseSGDRegressor):
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.

-    max_iter : int, optional
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         `partial_fit`.
-        Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None.

         .. versionadded:: 0.19

-    tol : float or None, optional
+    tol : float or None, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
-        when (loss > previous_loss - tol). Defaults to None.
-        Defaults to 1e-3 from 0.21.
+        when (loss > previous_loss - tol).

         .. versionadded:: 0.19

@@ -352,13 +340,6 @@ class PassiveAggressiveRegressor(BaseSGDRegressor):
         .. versionadded:: 0.19
            parameter *average* to use weights averaging in SGD

-    n_iter : int, optional
-        The number of passes over the training data (aka epochs).
-        Defaults to None. Deprecated, will be removed in 0.21.
-
-        .. versionchanged:: 0.19
-            Deprecated
-
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
@@ -378,11 +359,11 @@ class PassiveAggressiveRegressor(BaseSGDRegressor):
     >>>
     >>> X, y = make_regression(n_features=4, random_state=0)
     >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0)
-    >>> regr.fit(X, y)
+    >>> regr.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
     PassiveAggressiveRegressor(C=1.0, average=False, early_stopping=False,
                   epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive',
-                  max_iter=100, n_iter=None, n_iter_no_change=5,
-                  random_state=0, shuffle=True, tol=None,
+                  max_iter=100, n_iter_no_change=5,
+                  random_state=0, shuffle=True, tol=0.001,
                   validation_fraction=0.1, verbose=0, warm_start=False)
     >>> print(regr.coef_)
     [20.48736655 34.18818427 67.59122734 87.94731329]
@@ -403,12 +384,12 @@ class PassiveAggressiveRegressor(BaseSGDRegressor):
     K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)

     """
-    def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
+    def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3,
                  early_stopping=False, validation_fraction=0.1,
                  n_iter_no_change=5, shuffle=True, verbose=0,
                  loss="epsilon_insensitive", epsilon=DEFAULT_EPSILON,
                  random_state=None, warm_start=False,
-                 average=False, n_iter=None):
+                 average=False):
         super(PassiveAggressiveRegressor, self).__init__(
             penalty=None,
             l1_ratio=0,
@@ -424,8 +405,7 @@ def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
             verbose=verbose,
             random_state=random_state,
             warm_start=warm_start,
-            average=average,
-            n_iter=n_iter)
+            average=average)
         self.C = C
         self.loss = loss

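Note: with ``n_iter`` removed from the signatures above, passing it now fails with a ``TypeError`` instead of raising a ``DeprecationWarning``. The removed deprecation path treated ``n_iter=N`` as "run exactly N epochs with the tolerance check disabled", so a rough migration sketch (mirroring the ``tol=-np.inf`` idiom used in the updated tests later in this patch) is::

    import numpy as np
    from sklearn.linear_model import PassiveAggressiveClassifier

    # Before: PassiveAggressiveClassifier(n_iter=10)
    # Closest equivalent now: exactly 10 epochs, no tolerance-based stopping.
    clf = PassiveAggressiveClassifier(max_iter=10, tol=-np.inf)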
diff --git a/sklearn/linear_model/perceptron.py b/sklearn/linear_model/perceptron.py
index 1bc06f4f17276..c46ae3f32be89 100644
--- a/sklearn/linear_model/perceptron.py
+++ b/sklearn/linear_model/perceptron.py
@@ -23,18 +23,16 @@ class Perceptron(BaseSGDClassifier):
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.

-    max_iter : int, optional
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         `partial_fit`.
-        Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None.

         .. versionadded:: 0.19

-    tol : float or None, optional
+    tol : float or None, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
-        when (loss > previous_loss - tol). Defaults to None.
-        Defaults to 1e-3 from 0.21.
+        when (loss > previous_loss - tol).

         .. versionadded:: 0.19

@@ -97,13 +95,6 @@ class Perceptron(BaseSGDClassifier):
         initialization, otherwise, just erase the previous solution. See
         :term:`the Glossary <warm_start>`.

-    n_iter : int, optional
-        The number of passes over the training data (aka epochs).
-        Defaults to None. Deprecated, will be removed in 0.21.
-
-        .. versionchanged:: 0.19
-            Deprecated
-
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
@@ -131,9 +122,9 @@ class Perceptron(BaseSGDClassifier):
     >>> from sklearn.linear_model import Perceptron
     >>> X, y = load_digits(return_X_y=True)
     >>> clf = Perceptron(tol=1e-3, random_state=0)
-    >>> clf.fit(X, y)
+    >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
     Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
-          fit_intercept=True, max_iter=None, n_iter=None, n_iter_no_change=5,
+          fit_intercept=True, max_iter=1000, n_iter_no_change=5,
           n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
     >>> clf.score(X, y) # doctest: +ELLIPSIS
@@ -150,10 +141,10 @@ class Perceptron(BaseSGDClassifier):
     https://en.wikipedia.org/wiki/Perceptron and references therein.
     """
     def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True,
-                 max_iter=None, tol=None, shuffle=True, verbose=0, eta0=1.0,
+                 max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0,
                  n_jobs=None, random_state=0, early_stopping=False,
                  validation_fraction=0.1, n_iter_no_change=5,
-                 class_weight=None, warm_start=False, n_iter=None):
+                 class_weight=None, warm_start=False):
         super(Perceptron, self).__init__(
             loss="perceptron", penalty=penalty, alpha=alpha, l1_ratio=0,
             fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
@@ -161,5 +152,4 @@ def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True,
             learning_rate="constant", eta0=eta0, early_stopping=early_stopping,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, power_t=0.5,
-            warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs,
-            n_iter=n_iter)
+            warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs)
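Note: the docstrings above summarise the stopping rule as "stop when (loss > previous_loss - tol)"; combined with the ``n_iter_no_change`` parameter in the same signatures, the effective behaviour is roughly "stop after several consecutive epochs without sufficient improvement". A plain-Python illustration of that rule (an approximation for exposition, not the Cython implementation)::

    def should_stop(epoch_losses, tol=1e-3, n_iter_no_change=5):
        """Return True once `n_iter_no_change` consecutive epochs fail to
        improve the best loss seen so far by more than `tol`."""
        best_loss = float('inf')
        no_improvement = 0
        for loss in epoch_losses:
            if tol is not None and loss > best_loss - tol:
                no_improvement += 1
            else:
                no_improvement = 0
            best_loss = min(best_loss, loss)
            if no_improvement >= n_iter_no_change:
                return True
        return False

    print(should_stop([1.0, 0.5, 0.4999, 0.4998], tol=1e-3, n_iter_no_change=2))  # True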
" % self.loss) - if not set_max_iter: - return - # n_iter deprecation, set self._max_iter, self._tol - self._tol = self.tol - if self.n_iter is not None: - warnings.warn("n_iter parameter is deprecated in 0.19 and will be" - " removed in 0.21. Use max_iter and tol instead.", - DeprecationWarning) - # Same behavior as before 0.19 - max_iter = self.n_iter - self._tol = None - - elif self.tol is None and self.max_iter is None: - if not for_partial_fit: - warnings.warn( - "max_iter and tol parameters have been " - "added in %s in 0.19. If both are left unset, " - "they default to max_iter=5 and tol=None. " - "If tol is not None, max_iter defaults to max_iter=1000. " - "From 0.21, default max_iter will be 1000, and" - " default tol will be 1e-3." % type(self).__name__, - FutureWarning) - # Before 0.19, default was n_iter=5 - max_iter = 5 - else: - max_iter = self.max_iter if self.max_iter is not None else 1000 - self._max_iter = max_iter - def _get_loss_function(self, loss): """Get concrete ``LossFunction`` object for str ``loss``. """ try: @@ -472,13 +441,12 @@ class BaseSGDClassifier(six.with_metaclass(ABCMeta, BaseSGD, @abstractmethod def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, - l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, - class_weight=None, warm_start=False, average=False, - n_iter=None): + class_weight=None, warm_start=False, average=False): super(BaseSGDClassifier, self).__init__( loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, @@ -488,16 +456,10 @@ def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, - average=average, n_iter=n_iter) + average=average) self.class_weight = class_weight self.n_jobs = n_jobs - @property - @deprecated("Attribute loss_function was deprecated in version 0.19 and " - "will be removed in 0.21. Use ``loss_function_`` instead") - def loss_function(self): - return self.loss_function_ - def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, @@ -577,11 +539,11 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, # Clear iteration count for multiple call to fit. self.t_ = 1.0 - self._partial_fit(X, y, alpha, C, loss, learning_rate, self._max_iter, + self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter, classes, sample_weight, coef_init, intercept_init) - if (self._tol is not None and self._tol > -np.inf - and self.n_iter_ == self._max_iter): + if (self.tol is not None and self.tol > -np.inf + and self.n_iter_ == self.max_iter): warnings.warn("Maximum number of iteration reached before " "convergence. Consider increasing max_iter to " "improve the fit.", @@ -790,18 +752,16 @@ class SGDClassifier(BaseSGDClassifier): Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - max_iter : int, optional + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the `partial_fit`. - Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None. .. 
@@ -790,18 +752,16 @@ class SGDClassifier(BaseSGDClassifier):
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.

-    max_iter : int, optional
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         `partial_fit`.
-        Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None.

         .. versionadded:: 0.19

-    tol : float or None, optional
+    tol : float or None, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
-        when (loss > previous_loss - tol). Defaults to None.
-        Defaults to 1e-3 from 0.21.
+        when (loss > previous_loss - tol).

         .. versionadded:: 0.19

@@ -909,13 +869,6 @@ class SGDClassifier(BaseSGDClassifier):
         average. So ``average=10`` will begin averaging after seeing 10
         samples.

-    n_iter : int, optional
-        The number of passes over the training data (aka epochs).
-        Defaults to None. Deprecated, will be removed in 0.21.
-
-        .. versionchanged:: 0.19
-            Deprecated
-
     Attributes
     ----------
     coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\
@@ -943,8 +896,8 @@ class SGDClassifier(BaseSGDClassifier):
     SGDClassifier(alpha=0.0001, average=False, class_weight=None,
            early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
            l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000,
-           n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
-           power_t=0.5, random_state=None, shuffle=True, tol=None,
+           n_iter_no_change=5, n_jobs=None, penalty='l2',
+           power_t=0.5, random_state=None, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False)

     >>> print(clf.predict([[-0.8, -1]]))
@@ -957,12 +910,12 @@ class SGDClassifier(BaseSGDClassifier):
     """

     def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, l1_ratio=0.15,
-                 fit_intercept=True, max_iter=None, tol=None, shuffle=True,
+                 fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True,
                  verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None,
                  random_state=None, learning_rate="optimal", eta0=0.0,
                  power_t=0.5, early_stopping=False, validation_fraction=0.1,
                  n_iter_no_change=5, class_weight=None, warm_start=False,
-                 average=False, n_iter=None):
+                 average=False):
         super(SGDClassifier, self).__init__(
             loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
             fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
@@ -971,7 +924,7 @@ def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, l1_ratio=0.15,
             power_t=power_t, early_stopping=early_stopping,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, class_weight=class_weight,
-            warm_start=warm_start, average=average, n_iter=n_iter)
+            warm_start=warm_start, average=average)

     def _check_proba(self):
         if self.loss not in ("log", "modified_huber"):
@@ -1101,12 +1054,11 @@ class BaseSGDRegressor(BaseSGD, RegressorMixin):

     @abstractmethod
     def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
-                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
+                 l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3,
                  shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON,
                  random_state=None, learning_rate="invscaling", eta0=0.01,
                  power_t=0.25, early_stopping=False, validation_fraction=0.1,
-                 n_iter_no_change=5, warm_start=False, average=False,
-                 n_iter=None):
+                 n_iter_no_change=5, warm_start=False, average=False):
         super(BaseSGDRegressor, self).__init__(
             loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
             fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
@@ -1115,7 +1067,7 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
             power_t=power_t, early_stopping=early_stopping,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, warm_start=warm_start,
-            average=average, n_iter=n_iter)
+            average=average)

     def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
                      max_iter, sample_weight, coef_init, intercept_init):
@@ -1193,11 +1145,11 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None,
         self.t_ = 1.0

         self._partial_fit(X, y, alpha, C, loss, learning_rate,
-                          self._max_iter, sample_weight, coef_init,
+                          self.max_iter, sample_weight, coef_init,
                           intercept_init)

-        if (self._tol is not None and self._tol > -np.inf
-                and self.n_iter_ == self._max_iter):
+        if (self.tol is not None and self.tol > -np.inf
+                and self.n_iter_ == self.max_iter):
             warnings.warn("Maximum number of iteration reached before "
                           "convergence. Consider increasing max_iter to "
                           "improve the fit.",
@@ -1290,8 +1242,6 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
         # Windows
         seed = random_state.randint(0, np.iinfo(np.int32).max)

-        tol = self._tol if self._tol is not None else -np.inf
-
         if self.average > 0:
             self.standard_coef_, self.standard_intercept_, \
                 self.average_coef_, self.average_intercept_, self.n_iter_ =\
@@ -1307,7 +1257,7 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
                             validation_mask, self.early_stopping,
                             validation_score_cb, int(self.n_iter_no_change),
-                            max_iter, tol,
+                            max_iter, self.tol,
                             int(self.fit_intercept),
                             int(self.verbose), int(self.shuffle),
@@ -1340,7 +1290,7 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate,
                             validation_mask, self.early_stopping,
                             validation_score_cb, int(self.n_iter_no_change),
-                            max_iter, tol,
+                            max_iter, self.tol,
                             int(self.fit_intercept),
                             int(self.verbose), int(self.shuffle),
@@ -1406,18 +1356,16 @@ class SGDRegressor(BaseSGDRegressor):
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.

-    max_iter : int, optional
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         `partial_fit`.
-        Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None.

         .. versionadded:: 0.19

-    tol : float or None, optional
+    tol : float, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
-        when (loss > previous_loss - tol). Defaults to None.
-        Defaults to 1e-3 from 0.21.
+        when (loss > previous_loss - tol).

         .. versionadded:: 0.19

@@ -1508,13 +1456,6 @@ class SGDRegressor(BaseSGDRegressor):
         average. So ``average=10`` will begin averaging after seeing 10
         samples.

-    n_iter : int, optional
-        The number of passes over the training data (aka epochs).
-        Defaults to None. Deprecated, will be removed in 0.21.
-
-        .. versionchanged:: 0.19
-            Deprecated
-
     Attributes
     ----------
     coef_ : array, shape (n_features,)
@@ -1546,8 +1487,8 @@ class SGDRegressor(BaseSGDRegressor):
     SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
            epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
            learning_rate='invscaling', loss='squared_loss', max_iter=1000,
-           n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
-           random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
+           n_iter_no_change=5, penalty='l2', power_t=0.25,
+           random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
            verbose=0, warm_start=False)

     See also
@@ -1556,12 +1497,11 @@ class SGDRegressor(BaseSGDRegressor):
     """

     def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
-                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
+                 l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3,
                  shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON,
                  random_state=None, learning_rate="invscaling", eta0=0.01,
                  power_t=0.25, early_stopping=False, validation_fraction=0.1,
-                 n_iter_no_change=5, warm_start=False, average=False,
-                 n_iter=None):
+                 n_iter_no_change=5, warm_start=False, average=False):
         super(SGDRegressor, self).__init__(
             loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
             fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
@@ -1570,4 +1510,4 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
             power_t=power_t, early_stopping=early_stopping,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, warm_start=warm_start,
-            average=average, n_iter=n_iter)
+            average=average)
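Note: because ``_fit`` now reads ``self.tol``/``self.max_iter`` directly and ``tol`` defaults to 1e-3, the "maximum number of iteration reached" warning shown above becomes easy to hit with small ``max_iter`` values. A small sketch of how that surfaces to users (assuming this patch is applied; scikit-learn emits it as a ``ConvergenceWarning``)::

    import warnings
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        SGDClassifier(max_iter=2, tol=1e-3, random_state=0).fit(X, y)
    # If the loss was still improving after 2 epochs, the maximum-iteration
    # warning is recorded here.
    print([w.category.__name__ for w in caught])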
diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py
index d7658396b3f22..5aabca6f12927 100644
--- a/sklearn/linear_model/tests/test_huber.py
+++ b/sklearn/linear_model/tests/test_huber.py
@@ -158,7 +158,7 @@ def test_huber_and_sgd_same_results():
     sgdreg = SGDRegressor(
         alpha=0.0, loss="huber", shuffle=True, random_state=0, max_iter=10000,
-        fit_intercept=False, epsilon=1.35, tol=None)
+        fit_intercept=False, epsilon=1.35, tol=-np.inf)
     sgdreg.fit(X_scale, y_scale)
     assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py
index ee519b7390c5b..11288287a5d90 100644
--- a/sklearn/linear_model/tests/test_passive_aggressive.py
+++ b/sklearn/linear_model/tests/test_passive_aggressive.py
@@ -262,7 +262,7 @@ def test_regressor_correctness(loss):
     for data in (X, X_csr):
         reg2 = PassiveAggressiveRegressor(
-            C=1.0, tol=None, loss=loss, fit_intercept=True, max_iter=2,
+            C=1.0, tol=-np.inf, loss=loss, fit_intercept=True, max_iter=2,
             shuffle=False)
         reg2.fit(data, y_bin)
diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index ceab6d3a744c1..e4ee7adda7be6 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -16,8 +16,6 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_raises_regexp
 from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import assert_warns_message
-from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import ignore_warnings

 from sklearn import linear_model, datasets, metrics
@@ -112,8 +110,6 @@ def factory(self, **kwargs):
         if "random_state" not in kwargs:
             kwargs["random_state"] = 42

-        if "tol" not in kwargs:
-            kwargs["tol"] = None
         if "max_iter" not in kwargs:
             kwargs["max_iter"] = 5

@@ -306,14 +302,14 @@ def test_validation_set_not_used_for_training(self):
                                 random_state=np.random.RandomState(seed),
                                 validation_fraction=validation_fraction,
                                 learning_rate='constant', eta0=0.01,
-                                tol=None, max_iter=max_iter, shuffle=shuffle)
+                                max_iter=max_iter, shuffle=shuffle, tol=0)
         clf1.fit(X, Y)
         assert clf1.n_iter_ == max_iter

         clf2 = self.factory(early_stopping=False,
                             random_state=np.random.RandomState(seed),
                             learning_rate='constant', eta0=0.01,
-                            tol=None, max_iter=max_iter, shuffle=shuffle)
+                            max_iter=max_iter, shuffle=shuffle, tol=0)

         if is_classifier(clf2):
             cv = StratifiedShuffleSplit(test_size=validation_fraction,
@@ -705,13 +701,14 @@ def test_equal_class_weight(self):
         # Test if equal class weights approx. equals no class weights.
         X = [[1, 0], [1, 0], [0, 1], [0, 1]]
         y = [0, 0, 1, 1]
-        clf = self.factory(alpha=0.1, max_iter=1000, class_weight=None)
+        clf = self.factory(alpha=0.1, max_iter=1000, class_weight=None,
+                           tol=-np.inf)
         clf.fit(X, y)

         X = [[1, 0], [0, 1]]
         y = [0, 1]
         clf_weighted = self.factory(alpha=0.1, max_iter=1000,
-                                    class_weight={0: 0.5, 1: 0.5})
+                                    class_weight={0: .5, 1: .5}, tol=-np.inf)
         clf_weighted.fit(X, y)

         # should be similar up to some epsilon due to learning rate schedule
@@ -755,13 +752,13 @@ def test_balanced_weight(self):
         rng.shuffle(idx)
         X = X[idx]
         y = y[idx]
-        clf = self.factory(alpha=0.0001, max_iter=1000,
+        clf = self.factory(alpha=0.0001, max_iter=1000, tol=-np.inf,
                            class_weight=None, shuffle=False).fit(X, y)
         f1 = metrics.f1_score(y, clf.predict(X), average='weighted')
         assert_almost_equal(f1, 0.96, decimal=1)

         # make the same prediction using balanced class_weight
-        clf_balanced = self.factory(alpha=0.0001, max_iter=1000,
+        clf_balanced = self.factory(alpha=0.0001, max_iter=1000, tol=-np.inf,
                                     class_weight="balanced", shuffle=False).fit(X, y)
         f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted')

@@ -779,14 +776,15 @@ def test_balanced_weight(self):
         y_imbalanced = np.concatenate([y] + [y_0] * 10)

         # fit a model on the imbalanced data without class weight info
-        clf = self.factory(max_iter=1000, class_weight=None, shuffle=False)
+        clf = self.factory(max_iter=1000, class_weight=None, shuffle=False,
+                           tol=-np.inf)
         clf.fit(X_imbalanced, y_imbalanced)
         y_pred = clf.predict(X)
         assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96)

         # fit a model with balanced class_weight enabled
         clf = self.factory(max_iter=1000, class_weight="balanced",
-                           shuffle=False)
+                           shuffle=False, tol=-np.inf)
         clf.fit(X_imbalanced, y_imbalanced)
         y_pred = clf.predict(X)
         assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96)
@@ -1315,59 +1313,6 @@ def test_tol_parameter():
     assert_equal(model_3.n_iter_, 3)


-def test_future_and_deprecation_warnings():
-    # Test that warnings are raised. Will be removed in 0.21
-
-    def init(max_iter=None, tol=None, n_iter=None, for_partial_fit=False):
-        sgd = SGDClassifier(max_iter=max_iter, tol=tol, n_iter=n_iter)
-        sgd._validate_params(for_partial_fit=for_partial_fit)
-
-    # When all default values are used
-    msg_future = "max_iter and tol parameters have been added in "
-    assert_warns_message(FutureWarning, msg_future, init)
-
-    # When n_iter is specified
-    msg_deprecation = "n_iter parameter is deprecated"
-    assert_warns_message(DeprecationWarning, msg_deprecation, init, 6, 0, 5)
-
-    # When n_iter=None, and at least one of tol and max_iter is specified
-    assert_no_warnings(init, 100, None, None)
-    assert_no_warnings(init, None, 1e-3, None)
-    assert_no_warnings(init, 100, 1e-3, None)
-
-    # Test that for_partial_fit will not throw warnings for max_iter or tol
-    assert_no_warnings(init, None, None, None, True)
-
-
-@ignore_warnings(category=(DeprecationWarning, FutureWarning))
-def test_tol_and_max_iter_default_values():
-    # Test that the default values are correctly changed
-    est = SGDClassifier()
-    est._validate_params()
-    assert_equal(est._tol, None)
-    assert_equal(est._max_iter, 5)
-
-    est = SGDClassifier(n_iter=42)
-    est._validate_params()
-    assert_equal(est._tol, None)
-    assert_equal(est._max_iter, 42)
-
-    est = SGDClassifier(tol=1e-2)
-    est._validate_params()
-    assert_equal(est._tol, 1e-2)
-    assert_equal(est._max_iter, 1000)
-
-    est = SGDClassifier(max_iter=42)
-    est._validate_params()
-    assert_equal(est._tol, None)
-    assert_equal(est._max_iter, 42)
-
-    est = SGDClassifier(max_iter=42, tol=1e-2)
-    est._validate_params()
-    assert_equal(est._tol, 1e-2)
-    assert_equal(est._max_iter, 42)
-
-
 def _test_gradient_common(loss_function, cases):
     # Test gradient of different loss functions
     # cases is a list of (p, y, expected)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 54369033a75d3..4de169df44099 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2116,11 +2116,6 @@ def param_filter(p):
             assert_true(init_param.default is None)
             continue

-        if (issubclass(Estimator, BaseSGD) and
-                init_param.name in ['tol', 'max_iter']):
-            # To remove in 0.21, when they get their future default values
-            continue
-
         param_value = params[init_param.name]
         if isinstance(param_value, np.ndarray):
             assert_array_equal(param_value, init_param.default)
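Note: the ``estimator_checks`` special case above can be dropped because the documented defaults and the actual signature defaults now agree, so the generic parameter check applies to ``tol`` and ``max_iter`` like any other parameter. A quick way to confirm the final signatures (assuming this patch is applied)::

    import inspect
    from sklearn.linear_model import SGDClassifier

    sig = inspect.signature(SGDClassifier.__init__)
    print(sig.parameters['max_iter'].default)   # 1000
    print(sig.parameters['tol'].default)        # 0.001
    print('n_iter' in sig.parameters)           # False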