14 changes: 7 additions & 7 deletions doc/modules/sgd.rst
@@ -26,7 +26,7 @@ correspond to a specific family of machine learning models. It is only a
*way* to train a model. Often, an instance of :class:`SGDClassifier` or
:class:`SGDRegressor` will have an equivalent estimator in
the scikit-learn API, potentially using a different optimization technique.
For example, using `SGDClassifier(loss='log')` results in logistic regression,
For example, using `SGDClassifier(loss='log_loss')` results in logistic regression,
i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression`
which is fitted via SGD instead of being fitted by one of the other solvers
in :class:`~sklearn.linear_model.LogisticRegression`. Similarly,
@@ -113,7 +113,7 @@ parameter. :class:`SGDClassifier` supports the following loss functions:

* ``loss="hinge"``: (soft-margin) linear Support Vector Machine,
* ``loss="modified_huber"``: smoothed hinge loss,
* ``loss="log"``: logistic regression,
* ``loss="log_loss"``: logistic regression,
* and all regression losses below. In this case the target is encoded as -1
or 1, and the problem is treated as a regression problem. The predicted
class then corresponds to the sign of the predicted target.
@@ -125,11 +125,11 @@ parameters if an example violates the margin constraint, which makes
training very efficient and may result in sparser models (i.e. with more zero
coefficients), even when L2 penalty is used.

Using ``loss="log"`` or ``loss="modified_huber"`` enables the
Using ``loss="log_loss"`` or ``loss="modified_huber"`` enables the
``predict_proba`` method, which gives a vector of probability estimates
:math:`P(y|x)` per sample :math:`x`::

>>> clf = SGDClassifier(loss="log", max_iter=5).fit(X, y)
>>> clf = SGDClassifier(loss="log_loss", max_iter=5).fit(X, y)
>>> clf.predict_proba([[1., 1.]]) # doctest: +SKIP
array([[0.00..., 0.99...]])

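As an aside, the availability rule described above can be checked directly; the following is a minimal sketch (the loop and the unfitted-estimator check mirror the updated test suite further down in this diff, everything else is an illustrative assumption):

    from sklearn.linear_model import SGDClassifier

    # predict_proba is only exposed for the probabilistic losses; for the
    # other losses the attribute itself is absent (hasattr returns False).
    for loss in ("log_loss", "modified_huber", "hinge"):
        clf = SGDClassifier(loss=loss)
        print(loss, hasattr(clf, "predict_proba"))   # True, True, False
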
@@ -168,7 +168,7 @@ one-dimensional array of shape (n_classes,). The i-th row of ``coef_`` holds
the weight vector of the OVA classifier for the i-th class; classes are
indexed in ascending order (see attribute ``classes_``).
Note that, in principle, since they allow the creation of a probability model,
``loss="log"`` and ``loss="modified_huber"`` are more suitable for
``loss="log_loss"`` and ``loss="modified_huber"`` are more suitable for
one-vs-all classification.

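For illustration, a hedged sketch of the attribute shapes described above (the three-class toy dataset is an assumption made for this example, not part of the diff):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    # Assumed toy data: 3 classes, 5 features.
    X, y = make_classification(n_samples=300, n_features=5, n_informative=3,
                               n_classes=3, random_state=0)
    clf = SGDClassifier(loss="log_loss", max_iter=1000, random_state=0).fit(X, y)

    print(clf.classes_)          # classes in ascending order, e.g. [0 1 2]
    print(clf.coef_.shape)       # (3, 5): one OVA weight vector per class
    print(clf.intercept_.shape)  # (3,)
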
:class:`SGDClassifier` supports both weighted classes and weighted
@@ -419,9 +419,9 @@ Different choices for :math:`L` entail different classifiers or regressors:
- Modified Huber:
  :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) \geq
  -1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise.
- Log: equivalent to Logistic Regression.
- Log Loss: equivalent to Logistic Regression.
:math:`L(y_i, f(x_i)) = \log(1 + \exp (-y_i f(x_i)))`.
- Least-Squares: Linear regression (Ridge or Lasso depending on
- Squared Error: Linear regression (Ridge or Lasso depending on
:math:`R`).
:math:`L(y_i, f(x_i)) = \frac{1}{2}(y_i - f(x_i))^2`.
- Huber: less sensitive to outliers than least-squares. It is equivalent to
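For readers who want to sanity-check the formulas listed in the hunk above, here is a small NumPy transcription (an illustrative sketch added for this review, not part of the diff), written in terms of the margin z = y_i * f(x_i) for the classification losses:

    import numpy as np

    def modified_huber(z):
        # max(0, 1 - z)^2 when z >= -1, and -4 z otherwise (smoothed hinge)
        return np.where(z >= -1.0, np.maximum(0.0, 1.0 - z) ** 2, -4.0 * z)

    def log_loss(z):
        # log(1 + exp(-z)): the 'log_loss' option, i.e. logistic regression
        return np.log1p(np.exp(-z))

    def squared_error(y, pred):
        # 0.5 * (y - pred)^2: plain linear regression
        return 0.5 * (y - pred) ** 2

    z = np.linspace(-3.0, 3.0, 7)
    print(modified_huber(z))
    print(log_loss(z))
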
6 changes: 5 additions & 1 deletion doc/whats_new/v1.1.rst
@@ -93,11 +93,15 @@ Changelog
produce the same models, but are deprecated and will be removed in version
1.3.

- For :class:`ensemble.GradientBoostingClassifier`, The `loss` parameter name
- For :class:`ensemble.GradientBoostingClassifier`, the `loss` parameter name
"deviance" is deprecated in favor of the new name "log_loss", which is now the
default.
:pr:`23036` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`linear_model.SGDClassifier`, the `loss` parameter name
"log" is deprecated in favor of the new name "log_loss".
:pr:`23046` by :user:`Christian Lorentzen <lorentzenchr>`.
Comment on lines +101 to +103

Member
Suggested change (add backticks around the loss names):

    - For :class:`linear_model.SGDClassifier`, the `loss` parameter name
      `"log"` is deprecated in favor of the new name `"log_loss"`.
      :pr:`23046` by :user:`Christian Lorentzen <lorentzenchr>`.

Member
We merged #23036 without the backticks, let's keep local consistency.


- |Efficiency| Low-level routines for reductions on pairwise distances
for dense float64 datasets have been refactored. The following functions
and estimators now benefit from improved performances in terms of hardware
2 changes: 1 addition & 1 deletion sklearn/linear_model/_coordinate_descent.py
@@ -809,7 +809,7 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel):
cross-validation.
SGDRegressor : Implements elastic net regression with incremental training.
SGDClassifier : Implements logistic regression with elastic net penalty
(``SGDClassifier(loss="log", penalty="elasticnet")``).
(``SGDClassifier(loss="log_loss", penalty="elasticnet")``).

Notes
-----
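The cross-reference in the docstring above can be exercised directly; a hedged sketch (the dataset and hyper-parameters are assumptions, following the alpha = 1 / (C * n_samples) correspondence used in the tests further down in this diff):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression, SGDClassifier

    X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
    C, l1_ratio = 1.0, 0.5

    # Elastic-net logistic regression fitted incrementally with SGD ...
    sgd = SGDClassifier(
        loss="log_loss", penalty="elasticnet", l1_ratio=l1_ratio,
        alpha=1.0 / (C * len(X)), max_iter=2000, random_state=0,
    ).fit(X, y)
    # ... and with the saga solver of LogisticRegression.
    log = LogisticRegression(
        penalty="elasticnet", solver="saga", C=C, l1_ratio=l1_ratio, max_iter=2000,
    ).fit(X, y)

    # Coefficients should be close but not identical, since the solvers differ.
    print(abs(sgd.coef_ - log.coef_).max())
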
55 changes: 35 additions & 20 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -160,13 +160,21 @@ def _validate_params(self, for_partial_fit=False):
if self.loss not in self.loss_functions:
raise ValueError("The loss %s is not supported. " % self.loss)

# TODO(1.2): remove "squared_loss"
if self.loss == "squared_loss":
warnings.warn(
"The loss 'squared_loss' was deprecated in v1.0 and will be "
"removed in version 1.2. Use `loss='squared_error'` which is "
"equivalent.",
FutureWarning,
)
# TODO(1.3): remove "log"
if self.loss == "log":
warnings.warn(
"The loss 'log' was deprecated in v1.1 and will be removed in version "
"1.3. Use `loss='log_loss'` which is equivalent.",
FutureWarning,
)

def _get_loss_function(self, loss):
"""Get concrete ``LossFunction`` object for str ``loss``."""
@@ -488,11 +496,13 @@ def fit_binary(

class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta):

# TODO: Remove squared_loss in v1.2
# TODO(1.2): Remove "squared_loss"
# TODO(1.3): Remove "log"
loss_functions = {
"hinge": (Hinge, 1.0),
"squared_hinge": (SquaredHinge, 1.0),
"perceptron": (Hinge, 0.0),
"log_loss": (Log,),
"log": (Log,),
"modified_huber": (ModifiedHuber,),
"squared_error": (SquaredLoss,),
@@ -917,22 +927,21 @@ class SGDClassifier(BaseSGDClassifier):

Parameters
----------
loss : str, default='hinge'
The loss function to be used. Defaults to 'hinge', which gives a
linear SVM.

The possible options are 'hinge', 'log', 'modified_huber',
'squared_hinge', 'perceptron', or a regression loss: 'squared_error',
'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.

The 'log' loss gives logistic regression, a probabilistic classifier.
'modified_huber' is another smooth loss that brings tolerance to
outliers as well as probability estimates.
'squared_hinge' is like hinge but is quadratically penalized.
'perceptron' is the linear loss used by the perceptron algorithm.
The other losses are designed for regression but can be useful in
classification as well; see
:class:`~sklearn.linear_model.SGDRegressor` for a description.
loss : {'hinge', 'log_loss', 'log', 'modified_huber', 'squared_hinge',\
'perceptron', 'squared_error', 'huber', 'epsilon_insensitive',\
'squared_epsilon_insensitive'}, default='hinge'
The loss function to be used.

- 'hinge' gives a linear SVM.
- 'log_loss' gives logistic regression, a probabilistic classifier.
- 'modified_huber' is another smooth loss that brings tolerance to
outliers as well as probability estimates.
- 'squared_hinge' is like hinge but is quadratically penalized.
- 'perceptron' is the linear loss used by the perceptron algorithm.
- The other losses, 'squared_error', 'huber', 'epsilon_insensitive' and
'squared_epsilon_insensitive' are designed for regression but can be useful
in classification as well; see
:class:`~sklearn.linear_model.SGDRegressor` for a description.

More details about the losses formulas can be found in the
:ref:`User Guide <sgd_mathematical_formulation>`.
@@ -941,6 +950,10 @@ class SGDClassifier(BaseSGDClassifier):
The loss 'squared_loss' was deprecated in v1.0 and will be removed
in version 1.2. Use `loss='squared_error'` which is equivalent.

.. deprecated:: 1.1
The loss 'log' was deprecated in v1.1 and will be removed
in version 1.3. Use `loss='log_loss'` which is equivalent.

penalty : {'l2', 'l1', 'elasticnet'}, default='l2'
The penalty (aka regularization term) to be used. Defaults to 'l2'
which is the standard regularizer for linear SVM models. 'l1' and
@@ -1204,7 +1217,8 @@ def __init__(
)

def _check_proba(self):
if self.loss not in ("log", "modified_huber"):
# TODO(1.3): Remove "log"
if self.loss not in ("log_loss", "log", "modified_huber"):
raise AttributeError(
"probability estimates are not available for loss=%r" % self.loss
)
@@ -1249,7 +1263,8 @@ def predict_proba(self, X):
"""
check_is_fitted(self)

if self.loss == "log":
# TODO(1.3): Remove "log"
if self.loss in ("log_loss", "log"):
return self._predict_proba_lr(X)

elif self.loss == "modified_huber":
@@ -1287,7 +1302,7 @@ def predict_proba(self, X):
else:
raise NotImplementedError(
"predict_(log_)proba only supported when"
" loss='log' or loss='modified_huber' "
" loss='log_loss' or loss='modified_huber' "
"(%r given)"
% self.loss
)
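To see the new deprecation path from the file above in action, a minimal sketch (assuming a scikit-learn build that includes this change; the toy data is an assumption):

    import warnings
    from sklearn.linear_model import SGDClassifier

    X, y = [[0.0, 0.0], [1.0, 1.0]], [0, 1]

    # loss="log" still works but emits a FutureWarning pointing to "log_loss";
    # per the test comment below, BaseSGD validates parameters in __init__,
    # so the warning can already fire at construction time.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        SGDClassifier(loss="log", random_state=0).fit(X, y)
    print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])

    # The forward-compatible spelling emits no such warning.
    SGDClassifier(loss="log_loss", random_state=0).fit(X, y)
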
2 changes: 1 addition & 1 deletion sklearn/linear_model/tests/test_logistic.py
@@ -1775,7 +1775,7 @@ def test_elastic_net_versus_sgd(C, l1_ratio):
max_iter=2000,
l1_ratio=l1_ratio,
alpha=1.0 / C / n_samples,
loss="log",
loss="log_loss",
)
log = LogisticRegression(
penalty="elasticnet",
29 changes: 18 additions & 11 deletions sklearn/linear_model/tests/test_sgd.py
@@ -543,7 +543,7 @@ def test_not_enough_sample_for_early_stopping(klass):
def test_sgd_clf(klass):
# Check that SGD gives any results :-)

for loss in ("hinge", "squared_hinge", "log", "modified_huber"):
for loss in ("hinge", "squared_hinge", "log_loss", "modified_huber"):
clf = klass(
penalty="l2",
alpha=0.01,
@@ -771,7 +771,8 @@ def test_sgd_predict_proba_method_access(klass):
# details.
for loss in linear_model.SGDClassifier.loss_functions:
clf = SGDClassifier(loss=loss)
if loss in ("log", "modified_huber"):
# TODO(1.3): Remove "log"
if loss in ("log_loss", "log", "modified_huber"):
assert hasattr(clf, "predict_proba")
assert hasattr(clf, "predict_log_proba")
else:
@@ -799,7 +800,7 @@ def test_sgd_proba(klass):

# log and modified_huber losses can output probability estimates
# binary case
for loss in ["log", "modified_huber"]:
for loss in ["log_loss", "modified_huber"]:
clf = klass(loss=loss, alpha=0.01, max_iter=10)
clf.fit(X, Y)
p = clf.predict_proba([[3, 2]])
@@ -813,7 +814,7 @@
assert p[0, 1] < p[0, 0]

# log loss multiclass probability estimates
clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2)
clf = klass(loss="log_loss", alpha=0.01, max_iter=10).fit(X2, Y2)

d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]])
p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]])
@@ -2122,19 +2123,25 @@ def test_SGDClassifier_fit_for_all_backends(backend):
assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_)


# TODO: Remove in v1.2
@pytest.mark.parametrize(
"Estimator", [linear_model.SGDClassifier, linear_model.SGDRegressor]
"old_loss, new_loss, Estimator",
[
# TODO(1.2): Remove "squared_loss"
("squared_loss", "squared_error", linear_model.SGDClassifier),
("squared_loss", "squared_error", linear_model.SGDRegressor),
# TODO(1.3): Remove "log"
("log", "log_loss", linear_model.SGDClassifier),
],
)
def test_loss_squared_loss_deprecated(Estimator):
def test_loss_deprecated(old_loss, new_loss, Estimator):

# Note: class BaseSGD calls self._validate_params() in __init__, therefore
# even instatiation of class raises FutureWarning for squared_loss.
with pytest.warns(FutureWarning, match="The loss 'squared_loss' was deprecated"):
est1 = Estimator(loss="squared_loss", random_state=0)
# even instantiation of class raises FutureWarning for deprecated losses.
with pytest.warns(FutureWarning, match=f"The loss '{old_loss}' was deprecated"):
est1 = Estimator(loss=old_loss, random_state=0)
est1.fit(X, Y)

est2 = Estimator(loss="squared_error", random_state=0)
est2 = Estimator(loss=new_loss, random_state=0)
est2.fit(X, Y)
if hasattr(est1, "predict_proba"):
assert_allclose(est1.predict_proba(X), est2.predict_proba(X))
2 changes: 1 addition & 1 deletion sklearn/model_selection/tests/test_search.py
@@ -1717,7 +1717,7 @@ def test_stochastic_gradient_loss_param():
# Make sure the predict_proba works when loss is specified
# as one of the parameters in the param_grid.
param_grid = {
"loss": ["log"],
"loss": ["log_loss"],
}
X = np.arange(24).reshape(6, -1)
y = [0, 0, 0, 1, 1, 1]
2 changes: 1 addition & 1 deletion sklearn/model_selection/tests/test_validation.py
@@ -1876,7 +1876,7 @@ def test_cross_val_predict_method_checking():
X, y = iris.data, iris.target
X, y = shuffle(X, y, random_state=0)
for method in ["decision_function", "predict_proba", "predict_log_proba"]:
est = SGDClassifier(loss="log", random_state=2)
est = SGDClassifier(loss="log_loss", random_state=2)
check_cross_val_predict_multiclass(est, X, y, method)


10 changes: 5 additions & 5 deletions sklearn/tests/test_multioutput.py
@@ -168,7 +168,7 @@ def test_multi_target_sample_weights():


def test_multi_output_classification_partial_fit_parallelism():
sgd_linear_clf = SGDClassifier(loss="log", random_state=1, max_iter=5)
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
mor.partial_fit(X, y, classes)
est1 = mor.estimators_[0]
@@ -189,15 +189,15 @@ def test_hasattr_multi_output_predict_proba():
assert not hasattr(multi_target_linear, "predict_proba")

# case where predict_proba attribute exists
sgd_linear_clf = SGDClassifier(loss="log", random_state=1, max_iter=5)
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
multi_target_linear.fit(X, y)
assert hasattr(multi_target_linear, "predict_proba")


# check predict_proba passes
def test_multi_output_predict_proba():
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, loss="log")
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, loss="log_loss")
param = {"loss": ("hinge", "log", "modified_huber")}

# inner function for custom scoring
@@ -229,7 +229,7 @@ def test_multi_output_classification_partial_fit():
# test if multi_target initializes correctly with base estimator and fit
# assert predictions work as expected for predict

sgd_linear_clf = SGDClassifier(loss="log", random_state=1, max_iter=5)
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

# train the multi_target_linear and also get the predictions.
@@ -257,7 +257,7 @@ def test_multi_output_classification_partial_fit():


def test_multi_output_classification_partial_fit_no_first_classes_exception():
sgd_linear_clf = SGDClassifier(loss="log", random_state=1, max_iter=5)
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
msg = "classes must be passed on the first call to partial_fit."
with pytest.raises(ValueError, match=msg):
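As a closing illustration of what the multioutput tests above exercise, a hedged sketch (the toy multi-output targets are an assumption made for this example):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.multioutput import MultiOutputClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(50, 4))
    # Two binary outputs derived from the first two features (assumed toy targets).
    y = np.column_stack([(X[:, 0] > 0).astype(int), (X[:, 1] > 0).astype(int)])

    sgd = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd).fit(X, y)
    print(hasattr(mor, "predict_proba"))   # True: the base loss is probabilistic
    print(mor.predict(X[:3]))

    # Incremental training also works, but classes must be passed on the first call.
    mor2 = MultiOutputClassifier(SGDClassifier(loss="log_loss", random_state=1, max_iter=5))
    mor2.partial_fit(X, y, classes=[np.array([0, 1]), np.array([0, 1])])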