From cd1eb68c9a218c08a27749c9d062122085b22998 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 26 Nov 2020 17:25:30 +0100 Subject: [PATCH 1/8] add py_loss to _sgd_fast.pyx --- sklearn/linear_model/_sgd_fast.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx index ab1a274d37c8f..3940e5d873669 100644 --- a/sklearn/linear_model/_sgd_fast.pyx +++ b/sklearn/linear_model/_sgd_fast.pyx @@ -73,6 +73,13 @@ cdef class LossFunction: """ return self.dloss(p, y) + def py_loss(self, double p, double y): + """Python version of `loss` for testing. + + Pytest needs a python function and can't use cdef functions. + """ + return self.loss(p, y) + cdef double dloss(self, double p, double y) nogil: """Evaluate the derivative of the loss function with respect to the prediction `p`. From a40359f7dc4d18ca930c886245d3adebe36ec91c Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 26 Nov 2020 19:07:22 +0100 Subject: [PATCH 2/8] modify test for loss functions --- sklearn/linear_model/tests/test_sgd.py | 1064 ++++++++++++++---------- 1 file changed, 618 insertions(+), 446 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 6649dbe13622d..67f43cffcecd1 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -64,8 +64,7 @@ def partial_fit(self, X, y, *args, **kw): def decision_function(self, X, *args, **kw): # XXX untested as of v0.22 X = sp.csr_matrix(X) - return linear_model.SGDRegressor.decision_function(self, X, *args, - **kw) + return linear_model.SGDRegressor.decision_function(self, X, *args, **kw) def SGDClassifier(**kwargs): @@ -97,25 +96,51 @@ def SparseSGDRegressor(**kwargs): true_result = [1, 2, 2] # test sample 2; string class labels -X2 = np.array([[-1, 1], [-0.75, 0.5], [-1.5, 1.5], - [1, 1], [0.75, 0.5], [1.5, 1.5], - [-1, -1], [0, -0.5], [1, -1]]) +X2 = np.array( + [ + [-1, 1], + [-0.75, 0.5], + [-1.5, 1.5], + [1, 1], + [0.75, 0.5], + [1.5, 1.5], + [-1, -1], + [0, -0.5], + [1, -1], + ] +) Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3 T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]]) true_result2 = ["one", "two", "three"] # test sample 3 -X3 = np.array([[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], - [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], - [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1], - [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0]]) +X3 = np.array( + [ + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 0], + ] +) Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) # test sample 4 - two more or less redundant feature groups -X4 = np.array([[1, 0.9, 0.8, 0, 0, 0], [1, .84, .98, 0, 0, 0], - [1, .96, .88, 0, 0, 0], [1, .91, .99, 0, 0, 0], - [0, 0, 0, .89, .91, 1], [0, 0, 0, .79, .84, 1], - [0, 0, 0, .91, .95, 1], [0, 0, 0, .93, 1, 1]]) +X4 = np.array( + [ + [1, 0.9, 0.8, 0, 0, 0], + [1, 0.84, 0.98, 0, 0, 0], + [1, 0.96, 0.88, 0, 0, 0], + [1, 0.91, 0.99, 0, 0, 0], + [0, 0, 0, 0.89, 0.91, 1], + [0, 0, 0, 0.79, 0.84, 1], + [0, 0, 0, 0.91, 0.95, 1], + [0, 0, 0, 0.93, 1, 1], + ] +) Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) iris = datasets.load_iris() @@ -144,7 +169,7 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): # sparse data has a fixed decay of .01 if klass in (SparseSGDClassifier, SparseSGDRegressor): - decay = .01 + decay = 0.01 for i, entry in enumerate(X): p = np.dot(entry, weights) @@ -165,23 
+190,25 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): return average_weights, average_intercept -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_sgd_bad_alpha(klass): # Check whether expected ValueError on bad alpha - assert_raises(ValueError, klass, alpha=-.1) + assert_raises(ValueError, klass, alpha=-0.1) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_sgd_bad_penalty(klass): # Check whether expected ValueError on bad penalty - assert_raises(ValueError, klass, penalty='foobar', - l1_ratio=0.85) + assert_raises(ValueError, klass, penalty="foobar", l1_ratio=0.85) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_sgd_bad_loss(klass): # Check whether expected ValueError on bad loss assert_raises(ValueError, klass, loss="foobar") @@ -189,19 +216,16 @@ def test_sgd_bad_loss(klass): def _test_warm_start(klass, X, Y, lr): # Test that explicit warm restart... - clf = klass(alpha=0.01, eta0=0.01, shuffle=False, - learning_rate=lr) + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X, Y) - clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, - learning_rate=lr) - clf2.fit(X, Y, - coef_init=clf.coef_.copy(), - intercept_init=clf.intercept_.copy()) + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) + clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) # ... and implicit warm restart are equivalent. - clf3 = klass(alpha=0.01, eta0=0.01, shuffle=False, - warm_start=True, learning_rate=lr) + clf3 = klass( + alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr + ) clf3.fit(X, Y) assert clf3.t_ == clf.t_ @@ -214,16 +238,17 @@ def _test_warm_start(klass, X, Y, lr): assert_array_almost_equal(clf3.coef_, clf2.coef_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -@pytest.mark.parametrize('lr', - ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) def test_warm_start(klass, lr): _test_warm_start(klass, X, Y, lr) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_input_format(klass): # Input format tests. clf = klass(alpha=0.01, shuffle=False) @@ -234,56 +259,63 @@ def test_input_format(klass): assert_raises(ValueError, clf.fit, X, Y_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_clone(klass): # Test whether clone works ok. 
- clf = klass(alpha=0.01, penalty='l1') + clf = klass(alpha=0.01, penalty="l1") clf = clone(clf) - clf.set_params(penalty='l2') + clf.set_params(penalty="l2") clf.fit(X, Y) - clf2 = klass(alpha=0.01, penalty='l2') + clf2 = klass(alpha=0.01, penalty="l2") clf2.fit(X, Y) assert_array_equal(clf.coef_, clf2.coef_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_plain_has_no_average_attr(klass): - clf = klass(average=True, eta0=.01) + clf = klass(average=True, eta0=0.01) clf.fit(X, Y) - assert hasattr(clf, '_average_coef') - assert hasattr(clf, '_average_intercept') - assert hasattr(clf, '_standard_intercept') - assert hasattr(clf, '_standard_coef') + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") clf = klass() clf.fit(X, Y) - assert not hasattr(clf, '_average_coef') - assert not hasattr(clf, '_average_intercept') - assert not hasattr(clf, '_standard_intercept') - assert not hasattr(clf, '_standard_coef') + assert not hasattr(clf, "_average_coef") + assert not hasattr(clf, "_average_intercept") + assert not hasattr(clf, "_standard_intercept") + assert not hasattr(clf, "_standard_coef") # TODO: remove in 0.25 -@pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) +@pytest.mark.parametrize("klass", [SGDClassifier, SGDRegressor]) def test_sgd_deprecated_attr(klass): - est = klass(average=True, eta0=.01) + est = klass(average=True, eta0=0.01) est.fit(X, Y) msg = "Attribute {} was deprecated" - for att in ['average_coef_', 'average_intercept_', - 'standard_coef_', 'standard_intercept_']: + for att in [ + "average_coef_", + "average_intercept_", + "standard_coef_", + "standard_intercept_", + ]: with pytest.warns(FutureWarning, match=msg.format(att)): getattr(est, att) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_late_onset_averaging_not_reached(klass): clf1 = klass(average=600) clf2 = klass() @@ -299,96 +331,122 @@ def test_late_onset_averaging_not_reached(klass): assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_late_onset_averaging_reached(klass): - eta0 = .001 - alpha = .0001 + eta0 = 0.001 + alpha = 0.0001 Y_encode = np.array(Y) Y_encode[Y_encode == 1] = -1.0 Y_encode[Y_encode == 2] = 1.0 - clf1 = klass(average=7, learning_rate="constant", - loss='squared_loss', eta0=eta0, - alpha=alpha, max_iter=2, shuffle=False) - clf2 = klass(average=0, learning_rate="constant", - loss='squared_loss', eta0=eta0, - alpha=alpha, max_iter=1, shuffle=False) + clf1 = klass( + average=7, + learning_rate="constant", + loss="squared_loss", + eta0=eta0, + alpha=alpha, + max_iter=2, + shuffle=False, + ) + clf2 = klass( + average=0, + learning_rate="constant", + loss="squared_loss", + eta0=eta0, + alpha=alpha, + max_iter=1, + shuffle=False, + ) clf1.fit(X, Y_encode) clf2.fit(X, Y_encode) - average_weights, average_intercept = \ - asgd(klass, X, Y_encode, eta0, alpha, - 
weight_init=clf2.coef_.ravel(), - intercept_init=clf2.intercept_) - - assert_array_almost_equal(clf1.coef_.ravel(), - average_weights.ravel(), - decimal=16) + average_weights, average_intercept = asgd( + klass, + X, + Y_encode, + eta0, + alpha, + weight_init=clf2.coef_.ravel(), + intercept_init=clf2.intercept_, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16) assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_sgd_bad_alpha_for_optimal_learning_rate(klass): # Check whether expected ValueError on bad alpha, i.e. 0 # since alpha is used to compute the optimal learning rate - assert_raises(ValueError, klass, - alpha=0, learning_rate="optimal") + assert_raises(ValueError, klass, alpha=0, learning_rate="optimal") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_early_stopping(klass): X = iris.data[iris.target > 0] Y = iris.target[iris.target > 0] for early_stopping in [True, False]: max_iter = 1000 - clf = klass(early_stopping=early_stopping, tol=1e-3, - max_iter=max_iter).fit(X, Y) + clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( + X, Y + ) assert clf.n_iter_ < max_iter -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_adaptive_longer_than_constant(klass): - clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, - max_iter=100) + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) clf1.fit(iris.data, iris.target) - clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, - max_iter=100) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) clf2.fit(iris.data, iris.target) assert clf1.n_iter_ > clf2.n_iter_ -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_validation_set_not_used_for_training(klass): X, Y = iris.data, iris.target validation_fraction = 0.4 seed = 42 shuffle = False max_iter = 10 - clf1 = klass(early_stopping=True, - random_state=np.random.RandomState(seed), - validation_fraction=validation_fraction, - learning_rate='constant', eta0=0.01, - tol=None, max_iter=max_iter, shuffle=shuffle) + clf1 = klass( + early_stopping=True, + random_state=np.random.RandomState(seed), + validation_fraction=validation_fraction, + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) clf1.fit(X, Y) assert clf1.n_iter_ == max_iter - clf2 = klass(early_stopping=False, - random_state=np.random.RandomState(seed), - learning_rate='constant', eta0=0.01, - tol=None, max_iter=max_iter, shuffle=shuffle) + clf2 = klass( + early_stopping=False, + random_state=np.random.RandomState(seed), + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) if is_classifier(clf2): - cv = StratifiedShuffleSplit(test_size=validation_fraction, - 
random_state=seed) + cv = StratifiedShuffleSplit(test_size=validation_fraction, random_state=seed) else: - cv = ShuffleSplit(test_size=validation_fraction, - random_state=seed) + cv = ShuffleSplit(test_size=validation_fraction, random_state=seed) idx_train, idx_val = next(cv.split(X, Y)) idx_train = np.sort(idx_train) # remove shuffling clf2.fit(X[idx_train], Y[idx_train]) @@ -397,22 +455,30 @@ def test_validation_set_not_used_for_training(klass): assert_array_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_n_iter_no_change(klass): X, Y = iris.data, iris.target # test that n_iter_ increases monotonically with n_iter_no_change for early_stopping in [True, False]: - n_iter_list = [klass(early_stopping=early_stopping, - n_iter_no_change=n_iter_no_change, - tol=1e-4, max_iter=1000 - ).fit(X, Y).n_iter_ - for n_iter_no_change in [2, 3, 10]] + n_iter_list = [ + klass( + early_stopping=early_stopping, + n_iter_no_change=n_iter_no_change, + tol=1e-4, + max_iter=1000, + ) + .fit(X, Y) + .n_iter_ + for n_iter_no_change in [2, 3, 10] + ] assert_array_equal(n_iter_list, sorted(n_iter_list)) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) def test_not_enough_sample_for_early_stopping(klass): # test an error is raised if the training or validation set is empty clf = klass(early_stopping=True, validation_fraction=0.99) @@ -423,119 +489,127 @@ def test_not_enough_sample_for_early_stopping(klass): ############################################################################### # Classification Test Case -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_clf(klass): # Check that SGD gives any results :-) for loss in ("hinge", "squared_hinge", "log", "modified_huber"): - clf = klass(penalty='l2', alpha=0.01, fit_intercept=True, - loss=loss, max_iter=10, shuffle=True) + clf = klass( + penalty="l2", + alpha=0.01, + fit_intercept=True, + loss=loss, + max_iter=10, + shuffle=True, + ) clf.fit(X, Y) # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) assert_array_equal(clf.predict(T), true_result) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_l1_ratio(klass): # Check whether expected ValueError on bad l1_ratio assert_raises(ValueError, klass, l1_ratio=1.1) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_learning_rate_schedule(klass): # Check whether expected ValueError on bad learning_rate assert_raises(ValueError, klass, learning_rate="") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_eta0(klass): # Check whether expected ValueError on bad eta0 - assert_raises(ValueError, klass, eta0=0, - learning_rate="constant") + assert_raises(ValueError, klass, eta0=0, learning_rate="constant") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) 
+@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_max_iter_param(klass): # Test parameter validity check assert_raises(ValueError, klass, max_iter=-10000) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_shuffle_param(klass): # Test parameter validity check assert_raises(ValueError, klass, shuffle="false") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_early_stopping_param(klass): # Test parameter validity check assert_raises(ValueError, klass, early_stopping="false") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_validation_fraction(klass): # Test parameter validity check - assert_raises(ValueError, klass, validation_fraction=-.1) + assert_raises(ValueError, klass, validation_fraction=-0.1) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_n_iter_no_change(klass): # Test parameter validity check assert_raises(ValueError, klass, n_iter_no_change=0) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_argument_coef(klass): # Checks coef_init not allowed as model argument (only fit) # Provided coef_ does not match dataset assert_raises(TypeError, klass, coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_provide_coef(klass): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. - assert_raises(ValueError, klass().fit, - X, Y, coef_init=np.zeros((3,))) + assert_raises(ValueError, klass().fit, X, Y, coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_set_intercept(klass): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. - assert_raises(ValueError, klass().fit, - X, Y, intercept_init=np.zeros((3,))) + assert_raises(ValueError, klass().fit, X, Y, intercept_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_early_stopping_with_partial_fit(klass): # Test parameter validity check - assert_raises(ValueError, - klass(early_stopping=True).partial_fit, X, Y) + assert_raises(ValueError, klass(early_stopping=True).partial_fit, X, Y) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_set_intercept_binary(klass): # Checks intercept_ shape for the warm starts in binary case klass().fit(X5, Y5, intercept_init=0) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_average_binary_computed_correctly(klass): # Checks the SGDClassifier correctly computes the average weights - eta = .1 - alpha = 2. 
+ eta = 0.1 + alpha = 2.0 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = klass(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) + clf = klass( + loss="squared_loss", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) # simple linear function without noise y = np.dot(X, w) @@ -545,13 +619,11 @@ def test_average_binary_computed_correctly(klass): average_weights, average_intercept = asgd(klass, X, y, eta, alpha) average_weights = average_weights.reshape(1, -1) - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=14) + assert_array_almost_equal(clf.coef_, average_weights, decimal=14) assert_almost_equal(clf.intercept_, average_intercept, decimal=14) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_set_intercept_to_intercept(klass): # Checks intercept_ shape consistency for the warm starts # Inconsistent intercept_ shape. @@ -561,31 +633,37 @@ def test_set_intercept_to_intercept(klass): klass().fit(X, Y, intercept_init=clf.intercept_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_at_least_two_labels(klass): # Target must have at least two labels clf = klass(alpha=0.01, max_iter=20) assert_raises(ValueError, clf.fit, X2, np.ones(9)) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_weight_class_balanced(klass): # partial_fit with class_weight='balanced' not supported""" - regex = (r"class_weight 'balanced' is not supported for " - r"partial_fit\. In order to use 'balanced' weights, " - r"use compute_class_weight\('balanced', classes=classes, y=y\). " - r"In place of y you can us a large enough sample " - r"of the full training set target to properly " - r"estimate the class frequency distributions\. " - r"Pass the resulting weights as the class_weight " - r"parameter\.") - assert_raises_regexp(ValueError, - regex, - klass(class_weight='balanced').partial_fit, - X, Y, classes=np.unique(Y)) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) + regex = ( + r"class_weight 'balanced' is not supported for " + r"partial_fit\. In order to use 'balanced' weights, " + r"use compute_class_weight\('balanced', classes=classes, y=y\). " + r"In place of y you can us a large enough sample " + r"of the full training set target to properly " + r"estimate the class frequency distributions\. " + r"Pass the resulting weights as the class_weight " + r"parameter\." 
+ ) + assert_raises_regexp( + ValueError, + regex, + klass(class_weight="balanced").partial_fit, + X, + Y, + classes=np.unique(Y), + ) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass(klass): # Multi-class test case clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) @@ -596,16 +674,21 @@ def test_sgd_multiclass(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_average(klass): - eta = .001 - alpha = .01 + eta = 0.001 + alpha = 0.01 # Multi-class average test case - clf = klass(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) + clf = klass( + loss="squared_loss", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) np_Y2 = np.array(Y2) clf.fit(X2, np_Y2) @@ -616,24 +699,21 @@ def test_sgd_multiclass_average(klass): y_i[np_Y2 != cl] = -1 average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) - assert_almost_equal(average_intercept, - clf.intercept_[i], - decimal=16) + assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_with_init_coef(klass): # Multi-class test case clf = klass(alpha=0.01, max_iter=20) - clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), - intercept_init=np.zeros(3)) + clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape, (3,) pred = clf.predict(T2) assert_array_equal(pred, true_result2) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_njobs(klass): # Multi-class test case with multi-core support clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) @@ -644,7 +724,7 @@ def test_sgd_multiclass_njobs(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_set_coef_multiclass(klass): # Checks coef_init and intercept_init shape for multi-class # problems @@ -657,14 +737,13 @@ def test_set_coef_multiclass(klass): # Provided intercept_ does not match dataset clf = klass() - assert_raises(ValueError, clf.fit, X2, Y2, - intercept_init=np.zeros((1,))) + assert_raises(ValueError, clf.fit, X2, Y2, intercept_init=np.zeros((1,))) # Provided intercept_ does match dataset. clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_predict_proba_method_access(klass): # Checks that SGDClassifier predict_proba and predict_log_proba methods # can either be accessed or raise an appropriate error message @@ -673,31 +752,29 @@ def test_sgd_predict_proba_method_access(klass): # details. 
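+    # Only the losses that model class probabilities ('log' and
+    # 'modified_huber') expose predict_proba/predict_log_proba; for any other
+    # loss the attribute lookup itself raises AttributeError, which is what
+    # the loop below asserts.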
for loss in linear_model.SGDClassifier.loss_functions: clf = SGDClassifier(loss=loss) - if loss in ('log', 'modified_huber'): - assert hasattr(clf, 'predict_proba') - assert hasattr(clf, 'predict_log_proba') + if loss in ("log", "modified_huber"): + assert hasattr(clf, "predict_proba") + assert hasattr(clf, "predict_log_proba") else: - message = ("probability estimates are not " - "available for loss={!r}".format(loss)) - assert not hasattr(clf, 'predict_proba') - assert not hasattr(clf, 'predict_log_proba') - with pytest.raises(AttributeError, - match=message): + message = "probability estimates are not " "available for loss={!r}".format( + loss + ) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + with pytest.raises(AttributeError, match=message): clf.predict_proba - with pytest.raises(AttributeError, - match=message): + with pytest.raises(AttributeError, match=message): clf.predict_log_proba -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_proba(klass): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. - clf = SGDClassifier(loss="hinge", alpha=0.01, - max_iter=10, tol=None).fit(X, Y) + clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y) assert not hasattr(clf, "predict_proba") assert not hasattr(clf, "predict_log_proba") @@ -719,8 +796,8 @@ def test_sgd_proba(klass): # log loss multiclass probability estimates clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) - d = clf.decision_function([[.1, -.1], [.3, .2]]) - p = clf.predict_proba([[.1, -.1], [.3, .2]]) + d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) + p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert np.all(p[0] >= 0) @@ -746,7 +823,7 @@ def test_sgd_proba(klass): p = clf.predict_proba([[3, 2]]) if klass != SparseSGDClassifier: assert np.argmax(d, axis=1) == np.argmax(p, axis=1) - else: # XXX the sparse test gets a different X2 (?) + else: # XXX the sparse test gets a different X2 (?) assert np.argmin(d, axis=1) == np.argmin(p, axis=1) # the following sample produces decision_function values < -1, @@ -756,10 +833,10 @@ def test_sgd_proba(klass): d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) - assert_array_almost_equal(p[0], [1 / 3.] * 3) + assert_array_almost_equal(p[0], [1 / 3.0] * 3) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sgd_l1(klass): # Test L1 regularization n = len(X4) @@ -770,8 +847,14 @@ def test_sgd_l1(klass): X = X4[idx, :] Y = Y4[idx] - clf = klass(penalty='l1', alpha=.2, fit_intercept=False, - max_iter=2000, tol=None, shuffle=False) + clf = klass( + penalty="l1", + alpha=0.2, + fit_intercept=False, + max_iter=2000, + tol=None, + shuffle=False, + ) clf.fit(X, Y) assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) pred = clf.predict(X) @@ -790,21 +873,18 @@ def test_sgd_l1(klass): assert_array_equal(pred, Y) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_class_weights(klass): # Test class weights. 
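+    # class_weight rescales each sample's contribution to the loss, so giving
+    # class 1 a very small weight below should flip the prediction for the
+    # borderline point [0.2, -1.0] from +1 to -1.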
- X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], - [1.0, 1.0], [1.0, 0.0]]) + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, - class_weight=None) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, - class_weight={1: 0.001}) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and @@ -812,7 +892,7 @@ def test_class_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_equal_class_weight(klass): # Test if equal class weights approx. equals no class weights. X = [[1, 0], [1, 0], [0, 1], [0, 1]] @@ -822,32 +902,31 @@ def test_equal_class_weight(klass): X = [[1, 0], [0, 1]] y = [0, 1] - clf_weighted = klass(alpha=0.1, max_iter=1000, - class_weight={0: 0.5, 1: 0.5}) + clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X, y) # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_wrong_class_weight_label(klass): # ValueError due to not existing class label. clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) assert_raises(ValueError, clf.fit, X, Y) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_wrong_class_weight_format(klass): # ValueError due to wrong class_weight argument type. 
clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5]) assert_raises(ValueError, clf.fit, X, Y) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_weights_multiplied(klass): # Tests that class_weight and sample_weight are multiplicative - class_weights = {1: .6, 2: .3} + class_weights = {1: 0.6, 2: 0.3} rng = np.random.RandomState(0) sample_weights = rng.random_sample(Y4.shape[0]) multiplied_together = np.copy(sample_weights) @@ -863,7 +942,7 @@ def test_weights_multiplied(klass): assert_almost_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_balanced_weight(klass): # Test class weights for imbalanced data""" # compute reference metrics on iris dataset that is quite balanced by @@ -875,16 +954,15 @@ def test_balanced_weight(klass): rng.shuffle(idx) X = X[idx] y = y[idx] - clf = klass(alpha=0.0001, max_iter=1000, - class_weight=None, shuffle=False).fit(X, y) - f1 = metrics.f1_score(y, clf.predict(X), average='weighted') + clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf.predict(X), average="weighted") assert_almost_equal(f1, 0.96, decimal=1) # make the same prediction using balanced class_weight - clf_balanced = klass(alpha=0.0001, max_iter=1000, - class_weight="balanced", - shuffle=False).fit(X, y) - f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted') + clf_balanced = klass( + alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False + ).fit(X, y) + f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") assert_almost_equal(f1, 0.96, decimal=1) # Make sure that in the balanced case it does not change anything @@ -902,21 +980,19 @@ def test_balanced_weight(klass): clf = klass(max_iter=1000, class_weight=None, shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) - assert metrics.f1_score(y, y_pred, average='weighted') < 0.96 + assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 # fit a model with balanced class_weight enabled - clf = klass(max_iter=1000, class_weight="balanced", - shuffle=False) + clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) - assert metrics.f1_score(y, y_pred, average='weighted') > 0.96 + assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_sample_weights(klass): # Test weights on individual samples - X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], - [1.0, 1.0], [1.0, 0.0]]) + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) @@ -931,7 +1007,7 @@ def test_sample_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) @@ -939,14 +1015,14 @@ def test_wrong_sample_weights(klass): assert_raises(ValueError, clf.fit, X, Y, 
sample_weight=np.arange(7)) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_exception(klass): clf = klass(alpha=0.01) # classes was not specified assert_raises(ValueError, clf.partial_fit, X3, Y3) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_binary(klass): third = X.shape[0] // 3 clf = klass(alpha=0.01) @@ -955,7 +1031,7 @@ def test_partial_fit_binary(klass): clf.partial_fit(X[:third], Y[:third], classes=classes) assert clf.coef_.shape == (1, X.shape[1]) assert clf.intercept_.shape == (1,) - assert clf.decision_function([[0, 0]]).shape == (1, ) + assert clf.decision_function([[0, 0]]).shape == (1,) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) @@ -967,7 +1043,7 @@ def test_partial_fit_binary(klass): assert_array_equal(y_pred, true_result) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_multiclass(klass): third = X2.shape[0] // 3 clf = klass(alpha=0.01) @@ -985,7 +1061,7 @@ def test_partial_fit_multiclass(klass): assert id1, id2 -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_multiclass_average(klass): third = X2.shape[0] // 3 clf = klass(alpha=0.01, average=X2.shape[0]) @@ -1000,30 +1076,27 @@ def test_partial_fit_multiclass_average(klass): assert clf.intercept_.shape == (3,) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_fit_then_partial_fit(klass): # Partial_fit should work after initial fit in the multiclass case. # Non-regression test for #2496; fit would previously produce a # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
clf = klass() clf.fit(X2, Y2) - clf.partial_fit(X2, Y2) # no exception here + clf.partial_fit(X2, Y2) # no exception here -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -@pytest.mark.parametrize('lr', - ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) def test_partial_fit_equal_fit_classif(klass, lr): for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): - clf = klass(alpha=0.01, eta0=0.01, max_iter=2, - learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) clf.fit(X_, Y_) y_pred = clf.decision_function(T_) t = clf.t_ classes = np.unique(Y_) - clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, - shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X_, Y_, classes=classes) y_pred2 = clf.decision_function(T_) @@ -1032,18 +1105,26 @@ def test_partial_fit_equal_fit_classif(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_regression_losses(klass): random_state = np.random.RandomState(1) - clf = klass(alpha=0.01, learning_rate="constant", - eta0=0.1, loss="epsilon_insensitive", - random_state=random_state) + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="epsilon_insensitive", + random_state=random_state, + ) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass(alpha=0.01, learning_rate="constant", - eta0=0.1, loss="squared_epsilon_insensitive", - random_state=random_state) + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="squared_epsilon_insensitive", + random_state=random_state, + ) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) @@ -1051,18 +1132,23 @@ def test_regression_losses(klass): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01, - loss="squared_loss", random_state=random_state) + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.01, + loss="squared_loss", + random_state=random_state, + ) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_warm_start_multiclass(klass): _test_warm_start(klass, X2, Y2, "optimal") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) def test_multiple_fit(klass): # Test multiple calls of fit w/ different shaped inputs. clf = klass(alpha=0.01, shuffle=False) @@ -1077,7 +1163,8 @@ def test_multiple_fit(klass): ############################################################################### # Regression Test Case -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_reg(klass): # Check that SGD gives any results. 
clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) @@ -1085,12 +1172,12 @@ def test_sgd_reg(klass): assert clf.coef_[0] == clf.coef_[1] -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_averaged_computed_correctly(klass): # Tests the average regressor matches the naive implementation - eta = .001 - alpha = .01 + eta = 0.001 + alpha = 0.01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) @@ -1100,26 +1187,29 @@ def test_sgd_averaged_computed_correctly(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) + clf = klass( + loss="squared_loss", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) clf.fit(X, y) average_weights, average_intercept = asgd(klass, X, y, eta, alpha) - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_averaged_partial_fit(klass): # Tests whether the partial fit yields the same average as the fit - eta = .001 - alpha = .01 + eta = 0.001 + alpha = 0.01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) @@ -1129,47 +1219,53 @@ def test_sgd_averaged_partial_fit(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)]) - clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):]) + clf = klass( + loss="squared_loss", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) + clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) average_weights, average_intercept = asgd(klass, X, y, eta, alpha) - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_average_sparse(klass): # Checks the average weights on data with 0s - eta = .001 - alpha = .01 - clf = klass(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) + eta = 0.001 + alpha = 0.01 + clf = klass( + loss="squared_loss", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) n_samples = Y3.shape[0] - clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)]) - clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):]) + clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) average_weights, 
average_intercept = asgd(klass, X3, Y3, eta, alpha) - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_least_squares_fit(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1179,8 +1275,7 @@ def test_sgd_least_squares_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass(loss="squared_loss", alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1188,14 +1283,13 @@ def test_sgd_least_squares_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass(loss="squared_loss", alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_epsilon_insensitive(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1205,9 +1299,13 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss='epsilon_insensitive', epsilon=0.01, - alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1215,15 +1313,19 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss='epsilon_insensitive', epsilon=0.01, - alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_sgd_huber_fit(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1233,8 +1335,7 @@ def test_sgd_huber_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1242,14 +1343,13 @@ def test_sgd_huber_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, - fit_intercept=False) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_elasticnet_convergence(klass): # Check that the SGD output is consistent with coordinate descent @@ -1264,30 +1364,35 @@ def test_elasticnet_convergence(klass): # XXX: alpha = 0.1 seems to cause convergence problems for alpha in [0.01, 
0.001]: for l1_ratio in [0.5, 0.8, 1.0]: - cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, - fit_intercept=False) + cd = linear_model.ElasticNet( + alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False + ) cd.fit(X, y) - sgd = klass(penalty='elasticnet', max_iter=50, - alpha=alpha, l1_ratio=l1_ratio, - fit_intercept=False) + sgd = klass( + penalty="elasticnet", + max_iter=50, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=False, + ) sgd.fit(X, y) - err_msg = ("cd and sgd did not converge to comparable " - "results for alpha=%f and l1_ratio=%f" - % (alpha, l1_ratio)) - assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, - err_msg=err_msg) + err_msg = ( + "cd and sgd did not converge to comparable " + "results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio) + ) + assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) @ignore_warnings -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_partial_fit(klass): third = X.shape[0] // 3 clf = klass(alpha=0.01) clf.partial_fit(X[:third], Y[:third]) - assert clf.coef_.shape == (X.shape[1], ) + assert clf.coef_.shape == (X.shape[1],) assert clf.intercept_.shape == (1,) - assert clf.predict([[0, 0]]).shape == (1, ) + assert clf.predict([[0, 0]]).shape == (1,) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) @@ -1296,18 +1401,15 @@ def test_partial_fit(klass): assert id1, id2 -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) -@pytest.mark.parametrize('lr', - ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) def test_partial_fit_equal_fit(klass, lr): - clf = klass(alpha=0.01, max_iter=2, eta0=0.01, - learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X, Y) y_pred = clf.predict(T) t = clf.t_ - clf = klass(alpha=0.01, eta0=0.01, - learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X, Y) y_pred2 = clf.predict(T) @@ -1316,38 +1418,50 @@ def test_partial_fit_equal_fit(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) def test_loss_function_epsilon(klass): clf = klass(epsilon=0.9) clf.set_params(epsilon=0.1) - assert clf.loss_functions['huber'][1] == 0.1 + assert clf.loss_functions["huber"][1] == 0.1 def test_l1_ratio(): # Test if l1 ratio extremes match L1 and L2 penalty settings. 
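+    # l1_ratio is the elastic net mixing parameter: l1_ratio=1 corresponds to
+    # a pure L1 penalty and l1_ratio=0 to a pure L2 penalty, so values very
+    # close to either extreme should reproduce penalty='l1' and penalty='l2'.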
- X, y = datasets.make_classification(n_samples=1000, - n_features=100, n_informative=20, - random_state=1234) + X, y = datasets.make_classification( + n_samples=1000, n_features=100, n_informative=20, random_state=1234 + ) # test if elasticnet with l1_ratio near 1 gives same result as pure l1 - est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, - max_iter=6, l1_ratio=0.9999999999, - random_state=42).fit(X, y) - est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6, - random_state=42, tol=None).fit(X, y) + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.9999999999, + random_state=42, + ).fit(X, y) + est_l1 = SGDClassifier( + alpha=0.001, penalty="l1", max_iter=6, random_state=42, tol=None + ).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l1.coef_) # test if elasticnet with l1_ratio near 0 gives same result as pure l2 - est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, - max_iter=6, l1_ratio=0.0000000001, - random_state=42).fit(X, y) - est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6, - random_state=42, tol=None).fit(X, y) + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.0000000001, + random_state=42, + ).fit(X, y) + est_l2 = SGDClassifier( + alpha=0.001, penalty="l2", max_iter=6, random_state=42, tol=None + ).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l2.coef_) def test_underflow_or_overlow(): - with np.errstate(all='raise'): + with np.errstate(all="raise"): # Generate some weird data with hugely unscaled features rng = np.random.RandomState(0) n_samples = 100 @@ -1365,41 +1479,57 @@ def test_underflow_or_overlow(): # Define a ground truth on the scaled data ground_truth = rng.normal(size=n_features) - y = (np.dot(X_scaled, ground_truth) > 0.).astype(np.int32) + y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32) assert_array_equal(np.unique(y), [0, 1]) - model = SGDClassifier(alpha=0.1, loss='squared_hinge', max_iter=500) + model = SGDClassifier(alpha=0.1, loss="squared_hinge", max_iter=500) # smoke test: model is stable on scaled data model.fit(X_scaled, y) assert np.isfinite(model.coef_).all() # model is numerically unstable on unscaled data - msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*" - " Scaling input data with StandardScaler or MinMaxScaler" - " might help.") + msg_regxp = ( + r"Floating-point under-/overflow occurred at epoch #.*" + " Scaling input data with StandardScaler or MinMaxScaler" + " might help." 
+    )
     assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)


 def test_numerical_stability_large_gradient():
     # Non regression test case for numerical stability on scaled problems
     # where the gradient can still explode with some losses
-    model = SGDClassifier(loss='squared_hinge', max_iter=10, shuffle=True,
-                          penalty='elasticnet', l1_ratio=0.3, alpha=0.01,
-                          eta0=0.001, random_state=0, tol=None)
-    with np.errstate(all='raise'):
+    model = SGDClassifier(
+        loss="squared_hinge",
+        max_iter=10,
+        shuffle=True,
+        penalty="elasticnet",
+        l1_ratio=0.3,
+        alpha=0.01,
+        eta0=0.001,
+        random_state=0,
+        tol=None,
+    )
+    with np.errstate(all="raise"):
         model.fit(iris.data, iris.target)
     assert np.isfinite(model.coef_).all()


-@pytest.mark.parametrize('penalty', ['l2', 'l1', 'elasticnet'])
+@pytest.mark.parametrize("penalty", ["l2", "l1", "elasticnet"])
 def test_large_regularization(penalty):
     # Non regression tests for numerical stability issues caused by large
     # regularization parameters
-    model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1,
-                          penalty=penalty, shuffle=False,
-                          tol=None, max_iter=6)
-    with np.errstate(all='raise'):
+    model = SGDClassifier(
+        alpha=1e5,
+        learning_rate="constant",
+        eta0=0.1,
+        penalty=penalty,
+        shuffle=False,
+        tol=None,
+        max_iter=6,
+    )
+    with np.errstate(all="raise"):
         model.fit(iris.data, iris.target)
     assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_))
@@ -1434,126 +1564,175 @@ def test_tol_parameter():
     assert model_3.n_iter_ == 3


-def _test_gradient_common(loss_function, cases):
-    # Test gradient of different loss functions
-    # cases is a list of (p, y, expected)
-    for p, y, expected in cases:
-        assert_almost_equal(loss_function.py_dloss(p, y), expected)
+def _test_loss_common(loss_function, cases):
+    # Test the loss and the gradient of the different loss functions
+    # cases is a list of (p, y, expected_loss, expected_dloss)
+    for p, y, expected_loss, expected_dloss in cases:
+        assert_almost_equal(loss_function.py_loss(p, y), expected_loss)
+        assert_almost_equal(loss_function.py_dloss(p, y), expected_dloss)


-def test_gradient_hinge():
+def test_loss_hinge():
     # Test Hinge (hinge / perceptron)

     # hinge
     loss = sgd_fast.Hinge(1.0)
     cases = [
-        # (p, y, expected)
-        (1.1, 1.0, 0.0), (-2.0, -1.0, 0.0),
-        (1.0, 1.0, -1.0), (-1.0, -1.0, 1.0), (0.5, 1.0, -1.0),
-        (2.0, -1.0, 1.0), (-0.5, -1.0, 1.0), (0.0, 1.0, -1.0)
+        # (p, y, expected_loss, expected_dloss)
+        (1.1, 1.0, 0.0, 0.0),
+        (-2.0, -1.0, 0.0, 0.0),
+        (1.0, 1.0, 0.0, -1.0),
+        (-1.0, -1.0, 0.0, 1.0),
+        (0.5, 1.0, 0.5, -1.0),
+        (2.0, -1.0, 3.0, 1.0),
+        (-0.5, -1.0, 0.5, 1.0),
+        (0.0, 1.0, 1.0, -1.0),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)

     # perceptron
     loss = sgd_fast.Hinge(0.0)
     cases = [
-        # (p, y, expected)
-        (1.0, 1.0, 0.0), (-0.1, -1.0, 0.0),
-        (0.0, 1.0, -1.0), (0.0, -1.0, 1.0), (0.5, -1.0, 1.0),
-        (2.0, -1.0, 1.0), (-0.5, 1.0, -1.0), (-1.0, 1.0, -1.0),
+        # (p, y, expected_loss, expected_dloss)
+        (1.0, 1.0, 0.0, 0.0),
+        (-0.1, -1.0, 0.0, 0.0),
+        (0.0, 1.0, 0.0, -1.0),
+        (0.0, -1.0, 0.0, 1.0),
+        (0.5, -1.0, 0.5, 1.0),
+        (2.0, -1.0, 2.0, 1.0),
+        (-0.5, 1.0, 0.5, -1.0),
+        (-1.0, 1.0, 1.0, -1.0),
    ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)


-def test_gradient_squared_hinge():
+def test_loss_squared_hinge():
     # Test SquaredHinge
     loss = sgd_fast.SquaredHinge(1.0)
     cases = [
-        # (p, y, expected)
-        (1.0, 1.0, 0.0), (-2.0, -1.0, 0.0), (1.0, -1.0, 4.0),
-        (-1.0, 1.0, -4.0), (0.5, 1.0, -1.0), (0.5, -1.0, 3.0)
+        # (p, y, expected_loss, expected_dloss)
+        (1.0, 1.0, 0.0, 0.0),
+        (-2.0, -1.0, 0.0, 0.0),
+        (1.0, -1.0, 4.0, 4.0),
+        (-1.0, 1.0, 4.0, -4.0),
+        (0.5, 1.0, 0.25, -1.0),
+        (0.5, -1.0, 2.25, 3.0),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
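+# Note: the py_loss/py_dloss calls in _test_loss_common go through the Python
+# wrappers added to _sgd_fast.pyx above, since pytest cannot call the cdef
+# loss/dloss methods directly. The expected_loss values can be checked by hand
+# from the closed forms, e.g. for the logistic loss L(p, y) = log(1 + exp(-y * p))
+# one gets L(0, 1) = log(2) and dL/dp(0, 1) = -1 / (1 + exp(0)) = -0.5.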
_test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
 
 
-def test_gradient_log():
+def test_loss_log():
     # Test Log (logistic loss)
     loss = sgd_fast.Log()
     cases = [
-        # (p, y, expected)
-        (1.0, 1.0, -1.0 / (np.exp(1.0) + 1.0)),
-        (1.0, -1.0, 1.0 / (np.exp(-1.0) + 1.0)),
-        (-1.0, -1.0, 1.0 / (np.exp(1.0) + 1.0)),
-        (-1.0, 1.0, -1.0 / (np.exp(-1.0) + 1.0)),
-        (0.0, 1.0, -0.5), (0.0, -1.0, 0.5),
-        (17.9, -1.0, 1.0), (-17.9, 1.0, -1.0),
+        # (p, y, expected_loss, expected_dloss)
+        (1.0, 1.0, np.log(1.0 + np.exp(-1.0)), -1.0 / (np.exp(1.0) + 1.0)),
+        (1.0, -1.0, np.log(1.0 + np.exp(1.0)), 1.0 / (np.exp(-1.0) + 1.0)),
+        (-1.0, -1.0, np.log(1.0 + np.exp(-1.0)), 1.0 / (np.exp(1.0) + 1.0)),
+        (-1.0, 1.0, np.log(1.0 + np.exp(1.0)), -1.0 / (np.exp(-1.0) + 1.0)),
+        (0.0, 1.0, 0, -0.5),
+        (0.0, -1.0, 0, 0.5),
+        (17.9, -1.0, 17.9, 1.0),
+        (-17.9, 1.0, 17.9, -1.0),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
     assert_almost_equal(loss.py_dloss(18.1, 1.0), np.exp(-18.1) * -1.0, 16)
+    assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1), 16)
     assert_almost_equal(loss.py_dloss(-18.1, -1.0), np.exp(-18.1) * 1.0, 16)
+    assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1, 16)
 
 
-def test_gradient_squared_loss():
+def test_loss_squared_loss():
     # Test SquaredLoss
     loss = sgd_fast.SquaredLoss()
     cases = [
-        # (p, y, expected)
-        (0.0, 0.0, 0.0), (1.0, 1.0, 0.0), (1.0, 0.0, 1.0),
-        (0.5, -1.0, 1.5), (-2.5, 2.0, -4.5)
+        # (p, y, expected_loss, expected_dloss)
+        (0.0, 0.0, 0.0, 0.0),
+        (1.0, 1.0, 0.0, 0.0),
+        (1.0, 0.0, 0.5, 1.0),
+        (0.5, -1.0, 1.125, 1.5),
+        (-2.5, 2.0, 10.125, -4.5),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
 
 
-def test_gradient_huber():
+def test_loss_huber():
     # Test Huber
     loss = sgd_fast.Huber(0.1)
     cases = [
-        # (p, y, expected)
-        (0.0, 0.0, 0.0), (0.1, 0.0, 0.1), (0.0, 0.1, -0.1),
-        (3.95, 4.0, -0.05), (5.0, 2.0, 0.1), (-1.0, 5.0, -0.1)
+        # (p, y, expected_loss, expected_dloss)
+        (0.0, 0.0, 0.0, 0.0),
+        (0.1, 0.0, 0.005, 0.1),
+        (0.0, 0.1, 0.005, -0.1),
+        (3.95, 4.0, 0.00125, -0.05),
+        (5.0, 2.0, 0.295, 0.1),
+        (-1.0, 5.0, 0.595, -0.1),
    ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
 
 
-def test_gradient_modified_huber():
-    # Test ModifiedHuber
+def test_loss_modified_huber():
+    # Test ModifiedHuber
     loss = sgd_fast.ModifiedHuber()
     cases = [
-        # (p, y, expected)
-        (1.0, 1.0, 0.0), (-1.0, -1.0, 0.0), (2.0, 1.0, 0.0),
-        (0.0, 1.0, -2.0), (-1.0, 1.0, -4.0), (0.5, -1.0, 3.0),
-        (0.5, -1.0, 3.0), (-2.0, 1.0, -4.0), (-3.0, 1.0, -4.0)
+        # (p, y, expected_loss, expected_dloss)
+        (1.0, 1.0, 0.0, 0.0),
+        (-1.0, -1.0, 0.0, 0.0),
+        (2.0, 1.0, 0.0, 0.0),
+        (0.0, 1.0, 1.0, -2.0),
+        (-1.0, 1.0, 4.0, -4.0),
+        (0.5, -1.0, 2.25, 3.0),
+        (-2.0, 1.0, 8, -4.0),
+        (-3.0, 1.0, 12, -4.0),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
 
 
-def test_gradient_epsilon_insensitive():
+def test_loss_epsilon_insensitive():
     # Test EpsilonInsensitive
     loss = sgd_fast.EpsilonInsensitive(0.1)
     cases = [
-        (0.0, 0.0, 0.0), (0.1, 0.0, 0.0), (-2.05, -2.0, 0.0),
-        (3.05, 3.0, 0.0), (2.2, 2.0, 1.0), (2.0, -1.0, 1.0),
-        (2.0, 2.2, -1.0), (-2.0, 1.0, -1.0)
+        # (p, y, expected_loss, expected_dloss)
+        (0.0, 0.0, 0.0, 0.0),
+        (0.1, 0.0, 0.0, 0.0),
+        (-2.05, -2.0, 0.0, 0.0),
+        (3.05, 3.0, 0.0, 0.0),
+        (2.2, 2.0, 0.1, 1.0),
+        (2.0, -1.0, 2.9, 1.0),
+        (2.0, 2.2, 0.1, -1.0),
+        (-2.0, 1.0, 2.9, -1.0),
     ]
-    _test_gradient_common(loss, cases)
+    _test_loss_common(loss, cases)
 
 
-def test_gradient_squared_epsilon_insensitive():
+def 
test_loss_squared_epsilon_insensitive(): # Test SquaredEpsilonInsensitive loss = sgd_fast.SquaredEpsilonInsensitive(0.1) cases = [ - (0.0, 0.0, 0.0), (0.1, 0.0, 0.0), (-2.05, -2.0, 0.0), - (3.05, 3.0, 0.0), (2.2, 2.0, 0.2), (2.0, -1.0, 5.8), - (2.0, 2.2, -0.2), (-2.0, 1.0, -5.8) + # (p, y, expected_loss, expected_dloss) + (0.0, 0.0, 0.0, 0.0), + (0.1, 0.0, 0.0, 0.0), + (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), + (2.2, 2.0, 0.01, 0.2), + (2.0, -1.0, 8.41, 5.8), + (2.0, 2.2, 0.01, -0.2), + (-2.0, 1.0, 8.41, -5.8), ] - _test_gradient_common(loss, cases) + _test_loss_common(loss, cases) def test_multi_thread_multi_class_and_early_stopping(): # This is a non-regression test for a bad interaction between # early stopping internal attribute and thread-based parallelism. - clf = SGDClassifier(alpha=1e-3, tol=1e-3, max_iter=1000, - early_stopping=True, n_iter_no_change=100, - random_state=0, n_jobs=2) + clf = SGDClassifier( + alpha=1e-3, + tol=1e-3, + max_iter=1000, + early_stopping=True, + n_iter_no_change=100, + random_state=0, + n_jobs=2, + ) clf.fit(iris.data, iris.target) assert clf.n_iter_ > clf.n_iter_no_change assert clf.n_iter_ < clf.n_iter_no_change + 20 @@ -1565,20 +1744,17 @@ def test_multi_core_gridsearch_and_early_stopping(): # early stopping internal attribute and process-based multi-core # parallelism. param_grid = { - 'alpha': np.logspace(-4, 4, 9), - 'n_iter_no_change': [5, 10, 50], + "alpha": np.logspace(-4, 4, 9), + "n_iter_no_change": [5, 10, 50], } - clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, - random_state=0) - search = RandomizedSearchCV(clf, param_grid, n_iter=3, n_jobs=2, - random_state=0) + clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, random_state=0) + search = RandomizedSearchCV(clf, param_grid, n_iter=3, n_jobs=2, random_state=0) search.fit(iris.data, iris.target) assert search.best_score_ > 0.8 -@pytest.mark.parametrize("backend", - ["loky", "multiprocessing", "threading"]) +@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"]) def test_SGDClassifier_fit_for_all_backends(backend): # This is a non-regression smoke test. In the multi-class case, # SGDClassifier.fit fits each class in a one-versus-all fashion using @@ -1594,28 +1770,24 @@ def test_SGDClassifier_fit_for_all_backends(backend): # a segmentation fault when trying to write in a readonly memory mapped # buffer. - if (parse_version(joblib.__version__) < parse_version('0.12') - and backend == 'loky'): - pytest.skip('loky backend does not exist in joblib <0.12') + if parse_version(joblib.__version__) < parse_version("0.12") and backend == "loky": + pytest.skip("loky backend does not exist in joblib <0.12") random_state = np.random.RandomState(42) # Create a classification problem with 50000 features and 20 classes. Using # loky or multiprocessing this make the clf.coef_ exceed the threshold # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). 
- X = sp.random(500, 2000, density=0.02, format='csr', - random_state=random_state) + X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) y = random_state.choice(20, 500) # Begin by fitting a SGD classifier sequentially - clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, - random_state=42) + clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, random_state=42) clf_sequential.fit(X, y) # Fit a SGDClassifier using the specified backend, and make sure the # coefficients are equal to those obtained using a sequential fit - clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, - random_state=42) + clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, random_state=42) with joblib.parallel_backend(backend=backend): clf_parallel.fit(X, y) assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) From c7f353fe0bafdee739eb9102c79c496481b1702b Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 26 Nov 2020 19:59:28 +0100 Subject: [PATCH 3/8] without black --- sklearn/linear_model/tests/test_sgd.py | 1002 ++++++++++-------------- 1 file changed, 418 insertions(+), 584 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 67f43cffcecd1..b8d8961e897f3 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -64,7 +64,8 @@ def partial_fit(self, X, y, *args, **kw): def decision_function(self, X, *args, **kw): # XXX untested as of v0.22 X = sp.csr_matrix(X) - return linear_model.SGDRegressor.decision_function(self, X, *args, **kw) + return linear_model.SGDRegressor.decision_function(self, X, *args, + **kw) def SGDClassifier(**kwargs): @@ -96,51 +97,25 @@ def SparseSGDRegressor(**kwargs): true_result = [1, 2, 2] # test sample 2; string class labels -X2 = np.array( - [ - [-1, 1], - [-0.75, 0.5], - [-1.5, 1.5], - [1, 1], - [0.75, 0.5], - [1.5, 1.5], - [-1, -1], - [0, -0.5], - [1, -1], - ] -) +X2 = np.array([[-1, 1], [-0.75, 0.5], [-1.5, 1.5], + [1, 1], [0.75, 0.5], [1.5, 1.5], + [-1, -1], [0, -0.5], [1, -1]]) Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3 T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]]) true_result2 = ["one", "two", "three"] # test sample 3 -X3 = np.array( - [ - [1, 1, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 1, 0, 0, 0], - [0, 0, 1, 0, 0, 0], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 1, 0, 0], - [0, 0, 0, 1, 0, 0], - ] -) +X3 = np.array([[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1], + [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0]]) Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) # test sample 4 - two more or less redundant feature groups -X4 = np.array( - [ - [1, 0.9, 0.8, 0, 0, 0], - [1, 0.84, 0.98, 0, 0, 0], - [1, 0.96, 0.88, 0, 0, 0], - [1, 0.91, 0.99, 0, 0, 0], - [0, 0, 0, 0.89, 0.91, 1], - [0, 0, 0, 0.79, 0.84, 1], - [0, 0, 0, 0.91, 0.95, 1], - [0, 0, 0, 0.93, 1, 1], - ] -) +X4 = np.array([[1, 0.9, 0.8, 0, 0, 0], [1, .84, .98, 0, 0, 0], + [1, .96, .88, 0, 0, 0], [1, .91, .99, 0, 0, 0], + [0, 0, 0, .89, .91, 1], [0, 0, 0, .79, .84, 1], + [0, 0, 0, .91, .95, 1], [0, 0, 0, .93, 1, 1]]) Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) iris = datasets.load_iris() @@ -169,7 +144,7 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): # sparse data has a fixed decay of .01 if klass in (SparseSGDClassifier, SparseSGDRegressor): - decay = 0.01 + decay = .01 for i, entry in enumerate(X): p = np.dot(entry, weights) @@ -190,25 +165,23 @@ def asgd(klass, X, y, 
eta, alpha, weight_init=None, intercept_init=0.0): return average_weights, average_intercept -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_sgd_bad_alpha(klass): # Check whether expected ValueError on bad alpha - assert_raises(ValueError, klass, alpha=-0.1) + assert_raises(ValueError, klass, alpha=-.1) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_sgd_bad_penalty(klass): # Check whether expected ValueError on bad penalty - assert_raises(ValueError, klass, penalty="foobar", l1_ratio=0.85) + assert_raises(ValueError, klass, penalty='foobar', + l1_ratio=0.85) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_sgd_bad_loss(klass): # Check whether expected ValueError on bad loss assert_raises(ValueError, klass, loss="foobar") @@ -216,16 +189,19 @@ def test_sgd_bad_loss(klass): def _test_warm_start(klass, X, Y, lr): # Test that explicit warm restart... - clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, + learning_rate=lr) clf.fit(X, Y) - clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) - clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, + learning_rate=lr) + clf2.fit(X, Y, + coef_init=clf.coef_.copy(), + intercept_init=clf.intercept_.copy()) # ... and implicit warm restart are equivalent. - clf3 = klass( - alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr - ) + clf3 = klass(alpha=0.01, eta0=0.01, shuffle=False, + warm_start=True, learning_rate=lr) clf3.fit(X, Y) assert clf3.t_ == clf.t_ @@ -238,17 +214,16 @@ def _test_warm_start(klass, X, Y, lr): assert_array_almost_equal(clf3.coef_, clf2.coef_) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) -@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) def test_warm_start(klass, lr): _test_warm_start(klass, X, Y, lr) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_input_format(klass): # Input format tests. clf = klass(alpha=0.01, shuffle=False) @@ -259,63 +234,56 @@ def test_input_format(klass): assert_raises(ValueError, clf.fit, X, Y_) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_clone(klass): # Test whether clone works ok. 
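+    # clone() copies the constructor parameters but not any fitted state,
+    # so reconfiguring the clone with set_params and refitting should be
+    # indistinguishable from fitting a fresh estimator built with the same
+    # parameters, which is what the coefficient equality below verifies.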
- clf = klass(alpha=0.01, penalty="l1") + clf = klass(alpha=0.01, penalty='l1') clf = clone(clf) - clf.set_params(penalty="l2") + clf.set_params(penalty='l2') clf.fit(X, Y) - clf2 = klass(alpha=0.01, penalty="l2") + clf2 = klass(alpha=0.01, penalty='l2') clf2.fit(X, Y) assert_array_equal(clf.coef_, clf2.coef_) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_plain_has_no_average_attr(klass): - clf = klass(average=True, eta0=0.01) + clf = klass(average=True, eta0=.01) clf.fit(X, Y) - assert hasattr(clf, "_average_coef") - assert hasattr(clf, "_average_intercept") - assert hasattr(clf, "_standard_intercept") - assert hasattr(clf, "_standard_coef") + assert hasattr(clf, '_average_coef') + assert hasattr(clf, '_average_intercept') + assert hasattr(clf, '_standard_intercept') + assert hasattr(clf, '_standard_coef') clf = klass() clf.fit(X, Y) - assert not hasattr(clf, "_average_coef") - assert not hasattr(clf, "_average_intercept") - assert not hasattr(clf, "_standard_intercept") - assert not hasattr(clf, "_standard_coef") + assert not hasattr(clf, '_average_coef') + assert not hasattr(clf, '_average_intercept') + assert not hasattr(clf, '_standard_intercept') + assert not hasattr(clf, '_standard_coef') # TODO: remove in 0.25 -@pytest.mark.parametrize("klass", [SGDClassifier, SGDRegressor]) +@pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) def test_sgd_deprecated_attr(klass): - est = klass(average=True, eta0=0.01) + est = klass(average=True, eta0=.01) est.fit(X, Y) msg = "Attribute {} was deprecated" - for att in [ - "average_coef_", - "average_intercept_", - "standard_coef_", - "standard_intercept_", - ]: + for att in ['average_coef_', 'average_intercept_', + 'standard_coef_', 'standard_intercept_']: with pytest.warns(FutureWarning, match=msg.format(att)): getattr(est, att) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_late_onset_averaging_not_reached(klass): clf1 = klass(average=600) clf2 = klass() @@ -331,122 +299,96 @@ def test_late_onset_averaging_not_reached(klass): assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_late_onset_averaging_reached(klass): - eta0 = 0.001 - alpha = 0.0001 + eta0 = .001 + alpha = .0001 Y_encode = np.array(Y) Y_encode[Y_encode == 1] = -1.0 Y_encode[Y_encode == 2] = 1.0 - clf1 = klass( - average=7, - learning_rate="constant", - loss="squared_loss", - eta0=eta0, - alpha=alpha, - max_iter=2, - shuffle=False, - ) - clf2 = klass( - average=0, - learning_rate="constant", - loss="squared_loss", - eta0=eta0, - alpha=alpha, - max_iter=1, - shuffle=False, - ) + clf1 = klass(average=7, learning_rate="constant", + loss='squared_loss', eta0=eta0, + alpha=alpha, max_iter=2, shuffle=False) + clf2 = klass(average=0, learning_rate="constant", + loss='squared_loss', eta0=eta0, + alpha=alpha, max_iter=1, shuffle=False) clf1.fit(X, Y_encode) clf2.fit(X, Y_encode) - average_weights, average_intercept = asgd( - klass, - X, - Y_encode, - eta0, - alpha, - 
weight_init=clf2.coef_.ravel(), - intercept_init=clf2.intercept_, - ) - - assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16) + average_weights, average_intercept = \ + asgd(klass, X, Y_encode, eta0, alpha, + weight_init=clf2.coef_.ravel(), + intercept_init=clf2.intercept_) + + assert_array_almost_equal(clf1.coef_.ravel(), + average_weights.ravel(), + decimal=16) assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_sgd_bad_alpha_for_optimal_learning_rate(klass): # Check whether expected ValueError on bad alpha, i.e. 0 # since alpha is used to compute the optimal learning rate - assert_raises(ValueError, klass, alpha=0, learning_rate="optimal") + assert_raises(ValueError, klass, + alpha=0, learning_rate="optimal") -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_early_stopping(klass): X = iris.data[iris.target > 0] Y = iris.target[iris.target > 0] for early_stopping in [True, False]: max_iter = 1000 - clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( - X, Y - ) + clf = klass(early_stopping=early_stopping, tol=1e-3, + max_iter=max_iter).fit(X, Y) assert clf.n_iter_ < max_iter -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_adaptive_longer_than_constant(klass): - clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, + max_iter=100) clf1.fit(iris.data, iris.target) - clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, + max_iter=100) clf2.fit(iris.data, iris.target) assert clf1.n_iter_ > clf2.n_iter_ -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_validation_set_not_used_for_training(klass): X, Y = iris.data, iris.target validation_fraction = 0.4 seed = 42 shuffle = False max_iter = 10 - clf1 = klass( - early_stopping=True, - random_state=np.random.RandomState(seed), - validation_fraction=validation_fraction, - learning_rate="constant", - eta0=0.01, - tol=None, - max_iter=max_iter, - shuffle=shuffle, - ) + clf1 = klass(early_stopping=True, + random_state=np.random.RandomState(seed), + validation_fraction=validation_fraction, + learning_rate='constant', eta0=0.01, + tol=None, max_iter=max_iter, shuffle=shuffle) clf1.fit(X, Y) assert clf1.n_iter_ == max_iter - clf2 = klass( - early_stopping=False, - random_state=np.random.RandomState(seed), - learning_rate="constant", - eta0=0.01, - tol=None, - max_iter=max_iter, - shuffle=shuffle, - ) + clf2 = klass(early_stopping=False, + random_state=np.random.RandomState(seed), + learning_rate='constant', eta0=0.01, + tol=None, max_iter=max_iter, shuffle=shuffle) if is_classifier(clf2): - cv = StratifiedShuffleSplit(test_size=validation_fraction, 
random_state=seed) + cv = StratifiedShuffleSplit(test_size=validation_fraction, + random_state=seed) else: - cv = ShuffleSplit(test_size=validation_fraction, random_state=seed) + cv = ShuffleSplit(test_size=validation_fraction, + random_state=seed) idx_train, idx_val = next(cv.split(X, Y)) idx_train = np.sort(idx_train) # remove shuffling clf2.fit(X[idx_train], Y[idx_train]) @@ -455,30 +397,22 @@ def test_validation_set_not_used_for_training(klass): assert_array_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_n_iter_no_change(klass): X, Y = iris.data, iris.target # test that n_iter_ increases monotonically with n_iter_no_change for early_stopping in [True, False]: - n_iter_list = [ - klass( - early_stopping=early_stopping, - n_iter_no_change=n_iter_no_change, - tol=1e-4, - max_iter=1000, - ) - .fit(X, Y) - .n_iter_ - for n_iter_no_change in [2, 3, 10] - ] + n_iter_list = [klass(early_stopping=early_stopping, + n_iter_no_change=n_iter_no_change, + tol=1e-4, max_iter=1000 + ).fit(X, Y).n_iter_ + for n_iter_no_change in [2, 3, 10]] assert_array_equal(n_iter_list, sorted(n_iter_list)) -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_not_enough_sample_for_early_stopping(klass): # test an error is raised if the training or validation set is empty clf = klass(early_stopping=True, validation_fraction=0.99) @@ -489,127 +423,119 @@ def test_not_enough_sample_for_early_stopping(klass): ############################################################################### # Classification Test Case - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_clf(klass): # Check that SGD gives any results :-) for loss in ("hinge", "squared_hinge", "log", "modified_huber"): - clf = klass( - penalty="l2", - alpha=0.01, - fit_intercept=True, - loss=loss, - max_iter=10, - shuffle=True, - ) + clf = klass(penalty='l2', alpha=0.01, fit_intercept=True, + loss=loss, max_iter=10, shuffle=True) clf.fit(X, Y) # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) assert_array_equal(clf.predict(T), true_result) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_l1_ratio(klass): # Check whether expected ValueError on bad l1_ratio assert_raises(ValueError, klass, l1_ratio=1.1) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_learning_rate_schedule(klass): # Check whether expected ValueError on bad learning_rate assert_raises(ValueError, klass, learning_rate="") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_bad_eta0(klass): # Check whether expected ValueError on bad eta0 - assert_raises(ValueError, klass, eta0=0, learning_rate="constant") + assert_raises(ValueError, klass, eta0=0, + learning_rate="constant") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_max_iter_param(klass): # Test parameter validity check assert_raises(ValueError, klass, max_iter=-10000) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_shuffle_param(klass): # Test parameter validity check assert_raises(ValueError, klass, shuffle="false") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_early_stopping_param(klass): # Test parameter validity check assert_raises(ValueError, klass, early_stopping="false") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_validation_fraction(klass): # Test parameter validity check - assert_raises(ValueError, klass, validation_fraction=-0.1) + assert_raises(ValueError, klass, validation_fraction=-.1) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_n_iter_no_change(klass): # Test parameter validity check assert_raises(ValueError, klass, n_iter_no_change=0) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_argument_coef(klass): # Checks coef_init not allowed as model argument (only fit) # Provided coef_ does not match dataset assert_raises(TypeError, klass, coef_init=np.zeros((3,))) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_provide_coef(klass): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. - assert_raises(ValueError, klass().fit, X, Y, coef_init=np.zeros((3,))) + assert_raises(ValueError, klass().fit, + X, Y, coef_init=np.zeros((3,))) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_set_intercept(klass): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. - assert_raises(ValueError, klass().fit, X, Y, intercept_init=np.zeros((3,))) + assert_raises(ValueError, klass().fit, + X, Y, intercept_init=np.zeros((3,))) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_early_stopping_with_partial_fit(klass): # Test parameter validity check - assert_raises(ValueError, klass(early_stopping=True).partial_fit, X, Y) + assert_raises(ValueError, + klass(early_stopping=True).partial_fit, X, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_set_intercept_binary(klass): # Checks intercept_ shape for the warm starts in binary case klass().fit(X5, Y5, intercept_init=0) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_average_binary_computed_correctly(klass): # Checks the SGDClassifier correctly computes the average weights - eta = 0.1 - alpha = 2.0 + eta = .1 + alpha = 2. 
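+    # With average=True the averaging starts from the first update, so
+    # coef_ and intercept_ should match the running mean of the plain SGD
+    # iterates, w_bar_T = (w_1 + ... + w_T) / T, which the asgd helper
+    # defined earlier in this file recomputes naively.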
n_samples = 20 n_features = 10 rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = klass( - loss="squared_loss", - learning_rate="constant", - eta0=eta, - alpha=alpha, - fit_intercept=True, - max_iter=1, - average=True, - shuffle=False, - ) + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) # simple linear function without noise y = np.dot(X, w) @@ -619,11 +545,13 @@ def test_average_binary_computed_correctly(klass): average_weights, average_intercept = asgd(klass, X, y, eta, alpha) average_weights = average_weights.reshape(1, -1) - assert_array_almost_equal(clf.coef_, average_weights, decimal=14) + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=14) assert_almost_equal(clf.intercept_, average_intercept, decimal=14) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_set_intercept_to_intercept(klass): # Checks intercept_ shape consistency for the warm starts # Inconsistent intercept_ shape. @@ -633,37 +561,31 @@ def test_set_intercept_to_intercept(klass): klass().fit(X, Y, intercept_init=clf.intercept_) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_at_least_two_labels(klass): # Target must have at least two labels clf = klass(alpha=0.01, max_iter=20) assert_raises(ValueError, clf.fit, X2, np.ones(9)) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_weight_class_balanced(klass): # partial_fit with class_weight='balanced' not supported""" - regex = ( - r"class_weight 'balanced' is not supported for " - r"partial_fit\. In order to use 'balanced' weights, " - r"use compute_class_weight\('balanced', classes=classes, y=y\). " - r"In place of y you can us a large enough sample " - r"of the full training set target to properly " - r"estimate the class frequency distributions\. " - r"Pass the resulting weights as the class_weight " - r"parameter\." - ) - assert_raises_regexp( - ValueError, - regex, - klass(class_weight="balanced").partial_fit, - X, - Y, - classes=np.unique(Y), - ) - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) + regex = (r"class_weight 'balanced' is not supported for " + r"partial_fit\. In order to use 'balanced' weights, " + r"use compute_class_weight\('balanced', classes=classes, y=y\). " + r"In place of y you can us a large enough sample " + r"of the full training set target to properly " + r"estimate the class frequency distributions\. 
" + r"Pass the resulting weights as the class_weight " + r"parameter\.") + assert_raises_regexp(ValueError, + regex, + klass(class_weight='balanced').partial_fit, + X, Y, classes=np.unique(Y)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass(klass): # Multi-class test case clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) @@ -674,21 +596,16 @@ def test_sgd_multiclass(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_average(klass): - eta = 0.001 - alpha = 0.01 + eta = .001 + alpha = .01 # Multi-class average test case - clf = klass( - loss="squared_loss", - learning_rate="constant", - eta0=eta, - alpha=alpha, - fit_intercept=True, - max_iter=1, - average=True, - shuffle=False, - ) + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) np_Y2 = np.array(Y2) clf.fit(X2, np_Y2) @@ -699,21 +616,24 @@ def test_sgd_multiclass_average(klass): y_i[np_Y2 != cl] = -1 average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) - assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) + assert_almost_equal(average_intercept, + clf.intercept_[i], + decimal=16) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_with_init_coef(klass): # Multi-class test case clf = klass(alpha=0.01, max_iter=20) - clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) + clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), + intercept_init=np.zeros(3)) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape, (3,) pred = clf.predict(T2) assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_multiclass_njobs(klass): # Multi-class test case with multi-core support clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) @@ -724,7 +644,7 @@ def test_sgd_multiclass_njobs(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_set_coef_multiclass(klass): # Checks coef_init and intercept_init shape for multi-class # problems @@ -737,13 +657,14 @@ def test_set_coef_multiclass(klass): # Provided intercept_ does not match dataset clf = klass() - assert_raises(ValueError, clf.fit, X2, Y2, intercept_init=np.zeros((1,))) + assert_raises(ValueError, clf.fit, X2, Y2, + intercept_init=np.zeros((1,))) # Provided intercept_ does match dataset. clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_predict_proba_method_access(klass): # Checks that SGDClassifier predict_proba and predict_log_proba methods # can either be accessed or raise an appropriate error message @@ -752,29 +673,31 @@ def test_sgd_predict_proba_method_access(klass): # details. 
for loss in linear_model.SGDClassifier.loss_functions: clf = SGDClassifier(loss=loss) - if loss in ("log", "modified_huber"): - assert hasattr(clf, "predict_proba") - assert hasattr(clf, "predict_log_proba") + if loss in ('log', 'modified_huber'): + assert hasattr(clf, 'predict_proba') + assert hasattr(clf, 'predict_log_proba') else: - message = "probability estimates are not " "available for loss={!r}".format( - loss - ) - assert not hasattr(clf, "predict_proba") - assert not hasattr(clf, "predict_log_proba") - with pytest.raises(AttributeError, match=message): + message = ("probability estimates are not " + "available for loss={!r}".format(loss)) + assert not hasattr(clf, 'predict_proba') + assert not hasattr(clf, 'predict_log_proba') + with pytest.raises(AttributeError, + match=message): clf.predict_proba - with pytest.raises(AttributeError, match=message): + with pytest.raises(AttributeError, + match=message): clf.predict_log_proba -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_proba(klass): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. - clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y) + clf = SGDClassifier(loss="hinge", alpha=0.01, + max_iter=10, tol=None).fit(X, Y) assert not hasattr(clf, "predict_proba") assert not hasattr(clf, "predict_log_proba") @@ -796,8 +719,8 @@ def test_sgd_proba(klass): # log loss multiclass probability estimates clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) - d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) - p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) + d = clf.decision_function([[.1, -.1], [.3, .2]]) + p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert np.all(p[0] >= 0) @@ -823,7 +746,7 @@ def test_sgd_proba(klass): p = clf.predict_proba([[3, 2]]) if klass != SparseSGDClassifier: assert np.argmax(d, axis=1) == np.argmax(p, axis=1) - else: # XXX the sparse test gets a different X2 (?) + else: # XXX the sparse test gets a different X2 (?) assert np.argmin(d, axis=1) == np.argmin(p, axis=1) # the following sample produces decision_function values < -1, @@ -833,10 +756,10 @@ def test_sgd_proba(klass): d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) - assert_array_almost_equal(p[0], [1 / 3.0] * 3) + assert_array_almost_equal(p[0], [1 / 3.] * 3) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_l1(klass): # Test L1 regularization n = len(X4) @@ -847,14 +770,8 @@ def test_sgd_l1(klass): X = X4[idx, :] Y = Y4[idx] - clf = klass( - penalty="l1", - alpha=0.2, - fit_intercept=False, - max_iter=2000, - tol=None, - shuffle=False, - ) + clf = klass(penalty='l1', alpha=.2, fit_intercept=False, + max_iter=2000, tol=None, shuffle=False) clf.fit(X, Y) assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) pred = clf.predict(X) @@ -873,18 +790,21 @@ def test_sgd_l1(klass): assert_array_equal(pred, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_class_weights(klass): # Test class weights. 
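+    # class_weight acts multiplicatively on the per-sample updates (see
+    # test_weights_multiplied below), so shrinking the weight of class 1
+    # lets the two negative samples dominate and should flip the
+    # prediction for [0.2, -1.0] checked at the end of this test.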
- X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, + class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, + class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and @@ -892,7 +812,7 @@ def test_class_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_equal_class_weight(klass): # Test if equal class weights approx. equals no class weights. X = [[1, 0], [1, 0], [0, 1], [0, 1]] @@ -902,31 +822,32 @@ def test_equal_class_weight(klass): X = [[1, 0], [0, 1]] y = [0, 1] - clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) + clf_weighted = klass(alpha=0.1, max_iter=1000, + class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X, y) # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_wrong_class_weight_label(klass): # ValueError due to not existing class label. clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) assert_raises(ValueError, clf.fit, X, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_wrong_class_weight_format(klass): # ValueError due to wrong class_weight argument type. 
clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5]) assert_raises(ValueError, clf.fit, X, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_weights_multiplied(klass): # Tests that class_weight and sample_weight are multiplicative - class_weights = {1: 0.6, 2: 0.3} + class_weights = {1: .6, 2: .3} rng = np.random.RandomState(0) sample_weights = rng.random_sample(Y4.shape[0]) multiplied_together = np.copy(sample_weights) @@ -942,7 +863,7 @@ def test_weights_multiplied(klass): assert_almost_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_balanced_weight(klass): # Test class weights for imbalanced data""" # compute reference metrics on iris dataset that is quite balanced by @@ -954,15 +875,16 @@ def test_balanced_weight(klass): rng.shuffle(idx) X = X[idx] y = y[idx] - clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) - f1 = metrics.f1_score(y, clf.predict(X), average="weighted") + clf = klass(alpha=0.0001, max_iter=1000, + class_weight=None, shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf.predict(X), average='weighted') assert_almost_equal(f1, 0.96, decimal=1) # make the same prediction using balanced class_weight - clf_balanced = klass( - alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False - ).fit(X, y) - f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") + clf_balanced = klass(alpha=0.0001, max_iter=1000, + class_weight="balanced", + shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted') assert_almost_equal(f1, 0.96, decimal=1) # Make sure that in the balanced case it does not change anything @@ -980,19 +902,21 @@ def test_balanced_weight(klass): clf = klass(max_iter=1000, class_weight=None, shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) - assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 + assert metrics.f1_score(y, y_pred, average='weighted') < 0.96 # fit a model with balanced class_weight enabled - clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) + clf = klass(max_iter=1000, class_weight="balanced", + shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) - assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 + assert metrics.f1_score(y, y_pred, average='weighted') > 0.96 -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sample_weights(klass): # Test weights on individual samples - X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) @@ -1007,7 +931,7 @@ def test_sample_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) @@ -1015,14 +939,14 @@ def test_wrong_sample_weights(klass): assert_raises(ValueError, clf.fit, X, Y, 
sample_weight=np.arange(7)) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_exception(klass): clf = klass(alpha=0.01) # classes was not specified assert_raises(ValueError, clf.partial_fit, X3, Y3) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_binary(klass): third = X.shape[0] // 3 clf = klass(alpha=0.01) @@ -1031,7 +955,7 @@ def test_partial_fit_binary(klass): clf.partial_fit(X[:third], Y[:third], classes=classes) assert clf.coef_.shape == (1, X.shape[1]) assert clf.intercept_.shape == (1,) - assert clf.decision_function([[0, 0]]).shape == (1,) + assert clf.decision_function([[0, 0]]).shape == (1, ) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) @@ -1043,7 +967,7 @@ def test_partial_fit_binary(klass): assert_array_equal(y_pred, true_result) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_multiclass(klass): third = X2.shape[0] // 3 clf = klass(alpha=0.01) @@ -1061,7 +985,7 @@ def test_partial_fit_multiclass(klass): assert id1, id2 -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_partial_fit_multiclass_average(klass): third = X2.shape[0] // 3 clf = klass(alpha=0.01, average=X2.shape[0]) @@ -1076,27 +1000,30 @@ def test_partial_fit_multiclass_average(klass): assert clf.intercept_.shape == (3,) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_fit_then_partial_fit(klass): # Partial_fit should work after initial fit in the multiclass case. # Non-regression test for #2496; fit would previously produce a # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
clf = klass() clf.fit(X2, Y2) - clf.partial_fit(X2, Y2) # no exception here + clf.partial_fit(X2, Y2) # no exception here -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) def test_partial_fit_equal_fit_classif(klass, lr): for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): - clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, + learning_rate=lr, shuffle=False) clf.fit(X_, Y_) y_pred = clf.decision_function(T_) t = clf.t_ classes = np.unique(Y_) - clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, + shuffle=False) for i in range(2): clf.partial_fit(X_, Y_, classes=classes) y_pred2 = clf.decision_function(T_) @@ -1105,26 +1032,18 @@ def test_partial_fit_equal_fit_classif(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_regression_losses(klass): random_state = np.random.RandomState(1) - clf = klass( - alpha=0.01, - learning_rate="constant", - eta0=0.1, - loss="epsilon_insensitive", - random_state=random_state, - ) + clf = klass(alpha=0.01, learning_rate="constant", + eta0=0.1, loss="epsilon_insensitive", + random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass( - alpha=0.01, - learning_rate="constant", - eta0=0.1, - loss="squared_epsilon_insensitive", - random_state=random_state, - ) + clf = klass(alpha=0.01, learning_rate="constant", + eta0=0.1, loss="squared_epsilon_insensitive", + random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) @@ -1132,23 +1051,18 @@ def test_regression_losses(klass): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass( - alpha=0.01, - learning_rate="constant", - eta0=0.01, - loss="squared_loss", - random_state=random_state, - ) + clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01, + loss="squared_loss", random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_warm_start_multiclass(klass): _test_warm_start(klass, X2, Y2, "optimal") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_multiple_fit(klass): # Test multiple calls of fit w/ different shaped inputs. clf = klass(alpha=0.01, shuffle=False) @@ -1163,8 +1077,7 @@ def test_multiple_fit(klass): ############################################################################### # Regression Test Case - -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_reg(klass): # Check that SGD gives any results. 
clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) @@ -1172,12 +1085,12 @@ def test_sgd_reg(klass): assert clf.coef_[0] == clf.coef_[1] -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_averaged_computed_correctly(klass): # Tests the average regressor matches the naive implementation - eta = 0.001 - alpha = 0.01 + eta = .001 + alpha = .01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) @@ -1187,29 +1100,26 @@ def test_sgd_averaged_computed_correctly(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass( - loss="squared_loss", - learning_rate="constant", - eta0=eta, - alpha=alpha, - fit_intercept=True, - max_iter=1, - average=True, - shuffle=False, - ) + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) clf.fit(X, y) average_weights, average_intercept = asgd(klass, X, y, eta, alpha) - assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_averaged_partial_fit(klass): # Tests whether the partial fit yields the same average as the fit - eta = 0.001 - alpha = 0.01 + eta = .001 + alpha = .01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) @@ -1219,53 +1129,47 @@ def test_sgd_averaged_partial_fit(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass( - loss="squared_loss", - learning_rate="constant", - eta0=eta, - alpha=alpha, - fit_intercept=True, - max_iter=1, - average=True, - shuffle=False, - ) - - clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) - clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)]) + clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):]) average_weights, average_intercept = asgd(klass, X, y, eta, alpha) - assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_average_sparse(klass): # Checks the average weights on data with 0s - eta = 0.001 - alpha = 0.01 - clf = klass( - loss="squared_loss", - learning_rate="constant", - eta0=eta, - alpha=alpha, - fit_intercept=True, - max_iter=1, - average=True, - shuffle=False, - ) + eta = .001 + alpha = .01 + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) n_samples = Y3.shape[0] - clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) - clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) + clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):]) average_weights, 
average_intercept = asgd(klass, X3, Y3, eta, alpha) - assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_least_squares_fit(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1275,7 +1179,8 @@ def test_sgd_least_squares_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss="squared_loss", alpha=0.1, max_iter=20, fit_intercept=False) + clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1283,13 +1188,14 @@ def test_sgd_least_squares_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss="squared_loss", alpha=0.1, max_iter=20, fit_intercept=False) + clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_epsilon_insensitive(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1299,13 +1205,9 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass( - loss="epsilon_insensitive", - epsilon=0.01, - alpha=0.1, - max_iter=20, - fit_intercept=False, - ) + clf = klass(loss='epsilon_insensitive', epsilon=0.01, + alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1313,19 +1215,15 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass( - loss="epsilon_insensitive", - epsilon=0.01, - alpha=0.1, - max_iter=20, - fit_intercept=False, - ) + clf = klass(loss='epsilon_insensitive', epsilon=0.01, + alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_sgd_huber_fit(klass): xmin, xmax = -5, 5 n_samples = 100 @@ -1335,7 +1233,8 @@ def test_sgd_huber_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1343,13 +1242,14 @@ def test_sgd_huber_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, + fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_elasticnet_convergence(klass): # Check that the SGD output is consistent with coordinate descent @@ -1364,35 +1264,30 @@ def test_elasticnet_convergence(klass): # XXX: alpha = 0.1 seems to cause convergence problems for alpha in [0.01, 
0.001]: for l1_ratio in [0.5, 0.8, 1.0]: - cd = linear_model.ElasticNet( - alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False - ) + cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=False) cd.fit(X, y) - sgd = klass( - penalty="elasticnet", - max_iter=50, - alpha=alpha, - l1_ratio=l1_ratio, - fit_intercept=False, - ) + sgd = klass(penalty='elasticnet', max_iter=50, + alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=False) sgd.fit(X, y) - err_msg = ( - "cd and sgd did not converge to comparable " - "results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio) - ) - assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) + err_msg = ("cd and sgd did not converge to comparable " + "results for alpha=%f and l1_ratio=%f" + % (alpha, l1_ratio)) + assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, + err_msg=err_msg) @ignore_warnings -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_partial_fit(klass): third = X.shape[0] // 3 clf = klass(alpha=0.01) clf.partial_fit(X[:third], Y[:third]) - assert clf.coef_.shape == (X.shape[1],) + assert clf.coef_.shape == (X.shape[1], ) assert clf.intercept_.shape == (1,) - assert clf.predict([[0, 0]]).shape == (1,) + assert clf.predict([[0, 0]]).shape == (1, ) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) @@ -1401,15 +1296,18 @@ def test_partial_fit(klass): assert id1, id2 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) def test_partial_fit_equal_fit(klass, lr): - clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, + learning_rate=lr, shuffle=False) clf.fit(X, Y) y_pred = clf.predict(T) t = clf.t_ - clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, + learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X, Y) y_pred2 = clf.predict(T) @@ -1418,50 +1316,38 @@ def test_partial_fit_equal_fit(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) def test_loss_function_epsilon(klass): clf = klass(epsilon=0.9) clf.set_params(epsilon=0.1) - assert clf.loss_functions["huber"][1] == 0.1 + assert clf.loss_functions['huber'][1] == 0.1 def test_l1_ratio(): # Test if l1 ratio extremes match L1 and L2 penalty settings. 
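+    # The elastic-net penalty in SGD mixes an L1 term weighted by l1_ratio
+    # with an L2 term weighted by (1 - l1_ratio), so l1_ratio ~= 1 should
+    # be indistinguishable from penalty='l1' and l1_ratio ~= 0 from
+    # penalty='l2', which the two comparisons below exercise.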
- X, y = datasets.make_classification( - n_samples=1000, n_features=100, n_informative=20, random_state=1234 - ) + X, y = datasets.make_classification(n_samples=1000, + n_features=100, n_informative=20, + random_state=1234) # test if elasticnet with l1_ratio near 1 gives same result as pure l1 - est_en = SGDClassifier( - alpha=0.001, - penalty="elasticnet", - tol=None, - max_iter=6, - l1_ratio=0.9999999999, - random_state=42, - ).fit(X, y) - est_l1 = SGDClassifier( - alpha=0.001, penalty="l1", max_iter=6, random_state=42, tol=None - ).fit(X, y) + est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, + max_iter=6, l1_ratio=0.9999999999, + random_state=42).fit(X, y) + est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6, + random_state=42, tol=None).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l1.coef_) # test if elasticnet with l1_ratio near 0 gives same result as pure l2 - est_en = SGDClassifier( - alpha=0.001, - penalty="elasticnet", - tol=None, - max_iter=6, - l1_ratio=0.0000000001, - random_state=42, - ).fit(X, y) - est_l2 = SGDClassifier( - alpha=0.001, penalty="l2", max_iter=6, random_state=42, tol=None - ).fit(X, y) + est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, + max_iter=6, l1_ratio=0.0000000001, + random_state=42).fit(X, y) + est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6, + random_state=42, tol=None).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l2.coef_) def test_underflow_or_overlow(): - with np.errstate(all="raise"): + with np.errstate(all='raise'): # Generate some weird data with hugely unscaled features rng = np.random.RandomState(0) n_samples = 100 @@ -1479,57 +1365,41 @@ def test_underflow_or_overlow(): # Define a ground truth on the scaled data ground_truth = rng.normal(size=n_features) - y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32) + y = (np.dot(X_scaled, ground_truth) > 0.).astype(np.int32) assert_array_equal(np.unique(y), [0, 1]) - model = SGDClassifier(alpha=0.1, loss="squared_hinge", max_iter=500) + model = SGDClassifier(alpha=0.1, loss='squared_hinge', max_iter=500) # smoke test: model is stable on scaled data model.fit(X_scaled, y) assert np.isfinite(model.coef_).all() # model is numerically unstable on unscaled data - msg_regxp = ( - r"Floating-point under-/overflow occurred at epoch #.*" - " Scaling input data with StandardScaler or MinMaxScaler" - " might help." 
- ) + msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*" + " Scaling input data with StandardScaler or MinMaxScaler" + " might help.") assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y) def test_numerical_stability_large_gradient(): # Non regression test case for numerical stability on scaled problems # where the gradient can still explode with some losses - model = SGDClassifier( - loss="squared_hinge", - max_iter=10, - shuffle=True, - penalty="elasticnet", - l1_ratio=0.3, - alpha=0.01, - eta0=0.001, - random_state=0, - tol=None, - ) - with np.errstate(all="raise"): + model = SGDClassifier(loss='squared_hinge', max_iter=10, shuffle=True, + penalty='elasticnet', l1_ratio=0.3, alpha=0.01, + eta0=0.001, random_state=0, tol=None) + with np.errstate(all='raise'): model.fit(iris.data, iris.target) assert np.isfinite(model.coef_).all() -@pytest.mark.parametrize("penalty", ["l2", "l1", "elasticnet"]) +@pytest.mark.parametrize('penalty', ['l2', 'l1', 'elasticnet']) def test_large_regularization(penalty): # Non regression tests for numerical stability issues caused by large # regularization parameters - model = SGDClassifier( - alpha=1e5, - learning_rate="constant", - eta0=0.1, - penalty=penalty, - shuffle=False, - tol=None, - max_iter=6, - ) - with np.errstate(all="raise"): + model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, + penalty=penalty, shuffle=False, + tol=None, max_iter=6) + with np.errstate(all='raise'): model.fit(iris.data, iris.target) assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) @@ -1578,14 +1448,9 @@ def test_loss_hinge(): loss = sgd_fast.Hinge(1.0) cases = [ # (p, y, expected_loss, expected_dloss) - (1.1, 1.0, 0.0, 0.0), - (-2.0, -1.0, 0.0, 0.0), - (1.0, 1.0, 0.0, -1.0), - (-1.0, -1.0, 0.0, 1.0), - (0.5, 1.0, 0.5, -1.0), - (2.0, -1.0, 3.0, 1.0), - (-0.5, -1.0, 0.5, 1.0), - (0.0, 1.0, 1, -1.0), + (1.1, 1.0, 0.0, 0.0), (-2.0, -1.0, 0.0, 0.0), + (1.0, 1.0, 0.0, -1.0), (-1.0, -1.0, 0.0, 1.0), (0.5, 1.0, 0.5, -1.0), + (2.0, -1.0, 3.0, 1.0), (-0.5, -1.0, 0.5, 1.0), (0.0, 1.0, 1, -1.0) ] _test_loss_common(loss, cases) @@ -1593,14 +1458,9 @@ def test_loss_hinge(): loss = sgd_fast.Hinge(0.0) cases = [ # (p, y, expected_loss, expected_dloss) - (1.0, 1.0, 0.0, 0.0), - (-0.1, -1.0, 0.0, 0.0), - (0.0, 1.0, 0.0, -1.0), - (0.0, -1.0, 0.0, 1.0), - (0.5, -1.0, 0.5, 1.0), - (2.0, -1.0, 2.0, 1.0), - (-0.5, 1.0, 0.5, -1.0), - (-1.0, 1.0, 1.0, -1.0), + (1.0, 1.0, 0.0, 0.0), (-0.1, -1.0, 0.0, 0.0), + (0.0, 1.0, 0.0, -1.0), (0.0, -1.0, 0.0, 1.0), (0.5, -1.0, 0.5, 1.0), + (2.0, -1.0, 2.0, 1.0), (-0.5, 1.0, 0.5, -1.0), (-1.0, 1.0, 1.0, -1.0), ] _test_loss_common(loss, cases) @@ -1610,12 +1470,8 @@ def test_gradient_squared_hinge(): loss = sgd_fast.SquaredHinge(1.0) cases = [ # (p, y, expected_loss, expected_dloss) - (1.0, 1.0, 0.0, 0.0), - (-2.0, -1.0, 0.0, 0.0), - (1.0, -1.0, 4.0, 4.0), - (-1.0, 1.0, 4.0, -4.0), - (0.5, 1.0, 0.25, -1.0), - (0.5, -1.0, 2.25, 3.0), + (1.0, 1.0, 0.0, 0.0), (-2.0, -1.0, 0.0, 0.0), (1.0, -1.0, 4.0, 4.0), + (-1.0, 1.0, 4.0, -4.0), (0.5, 1.0, 0.25, -1.0), (0.5, -1.0, 2.25, 3.0) ] _test_loss_common(loss, cases) @@ -1629,16 +1485,15 @@ def test_loss_log(): (1.0, -1.0, np.log(1.0 + exp(1.0)), 1.0 / (np.exp(-1.0) + 1.0)), (-1.0, -1.0, np.log(1.0 + exp(-1.0)), 1.0 / (np.exp(1.0) + 1.0)), (-1.0, 1.0, np.log(1.0 + exp(1.0)), -1.0 / (np.exp(-1.0) + 1.0)), - (0.0, 1.0, 0, -0.5), - (0.0, -1.0, 0, 0.5), - (17.9, -1.0, 17.9, 1.0), - (-17.9, 1.0, 17.9, -1.0), + (0.0, 1.0, 0, -0.5), (0.0, -1.0, 0, 0.5), + (17.9, -1.0, 17.9, 
1.0), (-17.9, 1.0, 17.9, -1.0), ] _test_loss_common(loss, cases) assert_almost_equal(loss.py_dloss(18.1, 1.0), np.exp(-18.1) * -1.0, 16) - assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1), 16) + assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1) , 16) assert_almost_equal(loss.py_dloss(-18.1, -1.0), np.exp(-18.1) * 1.0, 16) - assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1, 16) + assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1 , 16) + def test_loss_squared_loss(): @@ -1646,11 +1501,8 @@ def test_loss_squared_loss(): loss = sgd_fast.SquaredLoss() cases = [ # (p, y, expected_loss, expected_dloss) - (0.0, 0.0, 0.0, 0.0), - (1.0, 1.0, 0.0, 0.0), - (1.0, 0.0, 0.5, 1.0), - (0.5, -1.0, 1.125, 1.5), - (-2.5, 2.0, 10.125, -4.5), + (0.0, 0.0, 0.0, 0.0), (1.0, 1.0, 0.0, 0.0), (1.0, 0.0, 0.5, 1.0), + (0.5, -1.0, 1.125, 1.5), (-2.5, 2.0, 10.125, -4.5) ] _test_loss_common(loss, cases) @@ -1660,12 +1512,8 @@ def test_loss_huber(): loss = sgd_fast.Huber(0.1) cases = [ # (p, y, expected_loss, expected_dloss) - (0.0, 0.0, 0.0, 0.0), - (0.1, 0.0, 0.005, 0.1), - (0.0, 0.1, 0.005, -0.1), - (3.95, 4.0, 0.0125, -0.05), - (5.0, 2.0, 0.295, 0.1), - (-1.0, 5.0, 0.595, -0.1), + (0.0, 0.0, 0.0, 0.0), (0.1, 0.0, 0.005, 0.1), (0.0, 0.1, 0.005, -0.1), + (3.95, 4.0, 0.0125, -0.05), (5.0, 2.0, 0.295, 0.1), (-1.0, 5.0, 0.595, -0.1) ] _test_loss_common(loss, cases) @@ -1675,14 +1523,9 @@ def test_loss_modified_huber(): loss = sgd_fast.ModifiedHuber() cases = [ # (p, y, expected_loss, expected_dloss) - (1.0, 1.0, 1.0, 0.0), - (-1.0, -1.0, 0.0, 0.0), - (2.0, 1.0, 0.0, 0.0), - (0.0, 1.0, 1.0, -2.0), - (-1.0, 1.0, 4.0, -4.0), - (0.5, -1.0, 2.25, 3.0), - (-2.0, 1.0, 8, -4.0), - (-3.0, 1.0, 12, -4.0), + (1.0, 1.0, 1.0, 0.0), (-1.0, -1.0, 0.0, 0.0), (2.0, 1.0, 0.0, 0.0), + (0.0, 1.0, 1.0, -2.0), (-1.0, 1.0, 4.0, -4.0), (0.5, -1.0, 2.25, 3.0), + (-2.0, 1.0, 8, -4.0), (-3.0, 1.0, 12, -4.0) ] _test_loss_common(loss, cases) @@ -1692,14 +1535,9 @@ def test_loss_epsilon_insensitive(): loss = sgd_fast.EpsilonInsensitive(0.1) cases = [ # (p, y, expected_loss, expected_dloss) - (0.0, 0.0, 0.0, 0.0), - (0.1, 0.0, 0.0, 0.0), - (-2.05, -2.0, 0.0, 0.0), - (3.05, 3.0, 0.0, 0.0), - (2.2, 2.0, 0.1, 1.0), - (2.0, -1.0, 2.9, 1.0), - (2.0, 2.2, 0.1, -1.0), - (-2.0, 1.0, 2.9, -1.0), + (0.0, 0.0, 0.0, 0.0), (0.1, 0.0, 0.0, 0.0), (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), (2.2, 2.0, 0.1, 1.0), (2.0, -1.0, 2.9, 1.0), + (2.0, 2.2, 0.1, -1.0), (-2.0, 1.0, 2.9, -1.0) ] _test_loss_common(loss, cases) @@ -1709,14 +1547,9 @@ def test_loss_squared_epsilon_insensitive(): loss = sgd_fast.SquaredEpsilonInsensitive(0.1) cases = [ # (p, y, expected_loss, expected_dloss) - (0.0, 0.0, 0.0, 0.0), - (0.1, 0.0, 0.0, 0.0), - (-2.05, -2.0, 0.0, 0.0), - (3.05, 3.0, 0.0, 0.0), - (2.2, 2.0, 0.01, 0.2), - (2.0, -1.0, 8.41, 5.8), - (2.0, 2.2, 0.01, -0.2), - (-2.0, 1.0, 8.41, -5.8), + (0.0, 0.0, 0.0, 0.0), (0.1, 0.0, 0.0, 0.0), (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), (2.2, 2.0, 0.01, 0.2), (2.0, -1.0, 8.41, 5.8), + (2.0, 2.2, 0.01, -0.2), (-2.0, 1.0, 8.41, -5.8) ] _test_loss_common(loss, cases) @@ -1724,15 +1557,9 @@ def test_loss_squared_epsilon_insensitive(): def test_multi_thread_multi_class_and_early_stopping(): # This is a non-regression test for a bad interaction between # early stopping internal attribute and thread-based parallelism. 
- clf = SGDClassifier( - alpha=1e-3, - tol=1e-3, - max_iter=1000, - early_stopping=True, - n_iter_no_change=100, - random_state=0, - n_jobs=2, - ) + clf = SGDClassifier(alpha=1e-3, tol=1e-3, max_iter=1000, + early_stopping=True, n_iter_no_change=100, + random_state=0, n_jobs=2) clf.fit(iris.data, iris.target) assert clf.n_iter_ > clf.n_iter_no_change assert clf.n_iter_ < clf.n_iter_no_change + 20 @@ -1744,17 +1571,20 @@ def test_multi_core_gridsearch_and_early_stopping(): # early stopping internal attribute and process-based multi-core # parallelism. param_grid = { - "alpha": np.logspace(-4, 4, 9), - "n_iter_no_change": [5, 10, 50], + 'alpha': np.logspace(-4, 4, 9), + 'n_iter_no_change': [5, 10, 50], } - clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, random_state=0) - search = RandomizedSearchCV(clf, param_grid, n_iter=3, n_jobs=2, random_state=0) + clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, + random_state=0) + search = RandomizedSearchCV(clf, param_grid, n_iter=3, n_jobs=2, + random_state=0) search.fit(iris.data, iris.target) assert search.best_score_ > 0.8 -@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"]) +@pytest.mark.parametrize("backend", + ["loky", "multiprocessing", "threading"]) def test_SGDClassifier_fit_for_all_backends(backend): # This is a non-regression smoke test. In the multi-class case, # SGDClassifier.fit fits each class in a one-versus-all fashion using @@ -1770,24 +1600,28 @@ def test_SGDClassifier_fit_for_all_backends(backend): # a segmentation fault when trying to write in a readonly memory mapped # buffer. - if parse_version(joblib.__version__) < parse_version("0.12") and backend == "loky": - pytest.skip("loky backend does not exist in joblib <0.12") + if (parse_version(joblib.__version__) < parse_version('0.12') + and backend == 'loky'): + pytest.skip('loky backend does not exist in joblib <0.12') random_state = np.random.RandomState(42) # Create a classification problem with 50000 features and 20 classes. Using # loky or multiprocessing this make the clf.coef_ exceed the threshold # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). 
- X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) + X = sp.random(500, 2000, density=0.02, format='csr', + random_state=random_state) y = random_state.choice(20, 500) # Begin by fitting a SGD classifier sequentially - clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, random_state=42) + clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, + random_state=42) clf_sequential.fit(X, y) # Fit a SGDClassifier using the specified backend, and make sure the # coefficients are equal to those obtained using a sequential fit - clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, random_state=42) + clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, + random_state=42) with joblib.parallel_backend(backend=backend): clf_parallel.fit(X, y) assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) From 02f0c0cfb5f6812821dba57e86e7630ef9040e4f Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 26 Nov 2020 23:58:29 +0100 Subject: [PATCH 4/8] fix typos --- sklearn/linear_model/tests/test_sgd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index b8d8961e897f3..724c646e74f2c 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1481,10 +1481,10 @@ def test_loss_log(): loss = sgd_fast.Log() cases = [ # (p, y, expected_loss, expected_dloss) - (1.0, 1.0, np.log(1.0 + exp(-1.0)), -1.0 / (np.exp(1.0) + 1.0)), - (1.0, -1.0, np.log(1.0 + exp(1.0)), 1.0 / (np.exp(-1.0) + 1.0)), - (-1.0, -1.0, np.log(1.0 + exp(-1.0)), 1.0 / (np.exp(1.0) + 1.0)), - (-1.0, 1.0, np.log(1.0 + exp(1.0)), -1.0 / (np.exp(-1.0) + 1.0)), + (1.0, 1.0, np.log(1.0 + np.exp(-1.0)), -1.0 / (np.exp(1.0) + 1.0)), + (1.0, -1.0, np.log(1.0 + np.exp(1.0)), 1.0 / (np.exp(-1.0) + 1.0)), + (-1.0, -1.0, np.log(1.0 + np.exp(-1.0)), 1.0 / (np.exp(1.0) + 1.0)), + (-1.0, 1.0, np.log(1.0 + np.exp(1.0)), -1.0 / (np.exp(-1.0) + 1.0)), (0.0, 1.0, 0, -0.5), (0.0, -1.0, 0, 0.5), (17.9, -1.0, 17.9, 1.0), (-17.9, 1.0, 17.9, -1.0), ] @@ -1523,7 +1523,7 @@ def test_loss_modified_huber(): loss = sgd_fast.ModifiedHuber() cases = [ # (p, y, expected_loss, expected_dloss) - (1.0, 1.0, 1.0, 0.0), (-1.0, -1.0, 0.0, 0.0), (2.0, 1.0, 0.0, 0.0), + (1.0, 1.0, 0.0, 0.0), (-1.0, -1.0, 0.0, 0.0), (2.0, 1.0, 0.0, 0.0), (0.0, 1.0, 1.0, -2.0), (-1.0, 1.0, 4.0, -4.0), (0.5, -1.0, 2.25, 3.0), (-2.0, 1.0, 8, -4.0), (-3.0, 1.0, 12, -4.0) ] From 53b0b081d441969ec9e16a1ed738ddd8cbf2fb0a Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 27 Nov 2020 00:05:09 +0100 Subject: [PATCH 5/8] fix lint --- sklearn/linear_model/tests/test_sgd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 724c646e74f2c..ff0363df9b2aa 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1492,8 +1492,7 @@ def test_loss_log(): assert_almost_equal(loss.py_dloss(18.1, 1.0), np.exp(-18.1) * -1.0, 16) assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1) , 16) assert_almost_equal(loss.py_dloss(-18.1, -1.0), np.exp(-18.1) * 1.0, 16) - assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1 , 16) - + assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1, 16) def test_loss_squared_loss(): @@ -1513,7 +1512,8 @@ def test_loss_huber(): cases = [ # (p, y, expected_loss, expected_dloss) (0.0, 0.0, 0.0, 0.0), (0.1, 0.0, 0.005, 0.1), (0.0, 0.1, 0.005, -0.1), - 
(3.95, 4.0, 0.0125, -0.05), (5.0, 2.0, 0.295, 0.1), (-1.0, 5.0, 0.595, -0.1) + (3.95, 4.0, 0.0125, -0.05), (5.0, 2.0, 0.295, 0.1), + (-1.0, 5.0, 0.595, -0.1) ] _test_loss_common(loss, cases) From 93ef98eab1230272006c6a4dfb5f443e21b9cba5 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 27 Nov 2020 00:08:00 +0100 Subject: [PATCH 6/8] fix lint v2 --- sklearn/linear_model/tests/test_sgd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index ff0363df9b2aa..214287aa6990b 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1490,7 +1490,7 @@ def test_loss_log(): ] _test_loss_common(loss, cases) assert_almost_equal(loss.py_dloss(18.1, 1.0), np.exp(-18.1) * -1.0, 16) - assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1) , 16) + assert_almost_equal(loss.py_loss(18.1, 1.0), np.exp(-18.1), 16) assert_almost_equal(loss.py_dloss(-18.1, -1.0), np.exp(-18.1) * 1.0, 16) assert_almost_equal(loss.py_loss(-18.1, 1.0), 18.1, 16) From a70d37bc3bd64562b0b666e107c9f080508a9e7a Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 27 Nov 2020 09:00:02 +0100 Subject: [PATCH 7/8] fix typos --- sklearn/linear_model/tests/test_sgd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 214287aa6990b..ff8c3a911da58 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1485,7 +1485,7 @@ def test_loss_log(): (1.0, -1.0, np.log(1.0 + np.exp(1.0)), 1.0 / (np.exp(-1.0) + 1.0)), (-1.0, -1.0, np.log(1.0 + np.exp(-1.0)), 1.0 / (np.exp(1.0) + 1.0)), (-1.0, 1.0, np.log(1.0 + np.exp(1.0)), -1.0 / (np.exp(-1.0) + 1.0)), - (0.0, 1.0, 0, -0.5), (0.0, -1.0, 0, 0.5), + (0.0, 1.0, np.log(2), -0.5), (0.0, -1.0, np.log(2), 0.5), (17.9, -1.0, 17.9, 1.0), (-17.9, 1.0, 17.9, -1.0), ] _test_loss_common(loss, cases) @@ -1512,7 +1512,7 @@ def test_loss_huber(): cases = [ # (p, y, expected_loss, expected_dloss) (0.0, 0.0, 0.0, 0.0), (0.1, 0.0, 0.005, 0.1), (0.0, 0.1, 0.005, -0.1), - (3.95, 4.0, 0.0125, -0.05), (5.0, 2.0, 0.295, 0.1), + (3.95, 4.0, 0.00125, -0.05), (5.0, 2.0, 0.295, 0.1), (-1.0, 5.0, 0.595, -0.1) ] _test_loss_common(loss, cases) From dc8cf7da6af7dcd89c3aa307820c2a5a8ed6133f Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 3 Dec 2020 14:15:34 +0100 Subject: [PATCH 8/8] fix typo comment test --- sklearn/linear_model/tests/test_sgd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index ff8c3a911da58..8e8d3f94b6c99 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1435,7 +1435,7 @@ def test_tol_parameter(): def _test_loss_common(loss_function, cases): - # Test gradient of different loss functions + # Test the different loss functions # cases is a list of (p, y, expected) for p, y, expected_loss, expected_dloss in cases: assert_almost_equal(loss_function.py_loss(p, y), expected_loss)
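
As a quick cross-check of the expected values corrected in PATCH 4/8 and
PATCH 7/8, the losses can be recomputed in closed form with plain NumPy,
independently of the Cython classes that the tests reach through the
py_loss wrappers. This is only an illustrative sketch: the helper names
(log_loss, modified_huber, huber) are ad-hoc re-implementations of the
textbook formulas, not part of the scikit-learn API.

import numpy as np

def log_loss(p, y):
    # logistic loss on the margin z = p * y; log1p stays accurate when
    # exp(-z) is tiny, i.e. for large positive margins
    return np.log1p(np.exp(-p * y))

def modified_huber(p, y):
    # quadratically smoothed hinge: 0 for z >= 1, (1 - z)**2 on [-1, 1),
    # and the linear tail -4 * z below -1
    z = p * y
    if z >= 1.0:
        return 0.0
    if z >= -1.0:
        return (1.0 - z) ** 2
    return -4.0 * z

def huber(p, y, epsilon=0.1):
    # squared loss within epsilon of the target, linear outside
    r = abs(p - y)
    if r <= epsilon:
        return 0.5 * r ** 2
    return epsilon * r - 0.5 * epsilon ** 2

assert np.isclose(log_loss(0.0, 1.0), np.log(2))  # 0 -> np.log(2) in PATCH 7/8
assert np.isclose(modified_huber(1.0, 1.0), 0.0)  # 1.0 -> 0.0 in PATCH 4/8
assert np.isclose(huber(3.95, 4.0), 0.00125)      # 0.0125 -> 0.00125 in PATCH 7/8

All three corrected entries agree with these closed forms. The remaining
patches in the series (5/8, 6/8 and 8/8) only reflow long lines, fix
whitespace, and reword a test comment, so they change no expected values.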