From 3ad001636b1d33e45906af6ee77019259ebee36f Mon Sep 17 00:00:00 2001 From: cgsavard Date: Mon, 2 Dec 2019 12:54:37 -0700 Subject: [PATCH 01/85] DOC fixed default values in dbscan (#15753) --- sklearn/cluster/_dbscan.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 3b3ccb1fbe6dc..a464e3951673a 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -156,18 +156,18 @@ class DBSCAN(ClusterMixin, BaseEstimator): Parameters ---------- - eps : float, optional + eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples : int, optional + min_samples : int, default=5 The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. - metric : string, or callable + metric : string, or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by :func:`sklearn.metrics.pairwise_distances` for @@ -179,27 +179,27 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. - metric_params : dict, optional + metric_params : dict, default=None Additional keyword arguments for the metric function. .. versionadded:: 0.19 - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : float, optional + p : float, default=None The power of the Minkowski metric to be used to calculate distance between points. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` From 577e92f56d892a37f0f8c98aeb7eb5ddb4a045d6 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Wed, 4 Dec 2019 00:08:36 -0700 Subject: [PATCH 02/85] DOC fix incorrect branch reference in contributing doc (#15779) --- doc/developers/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 863ecfb7741b3..8d12187ade00b 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -251,7 +251,7 @@ modifying code and submitting a PR: to record your changes in Git, then push the changes to your GitHub account with:: - $ git push -u origin my-feature + $ git push -u origin my_feature 10. 
Follow `these `_ From 1b4bb1c0f11d29280207e1b14f5f29c67c8eae05 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 4 Dec 2019 18:09:57 +1100 Subject: [PATCH 03/85] DOC relabel Feature -> Efficiency in change log (#15770) --- doc/whats_new/v0.22.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f3f69a8299b0a..7b0c031f9196b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -416,12 +416,6 @@ Changelog :mod:`sklearn.gaussian_process` ............................... -- |Feature| :func:`gaussian_process.GaussianProcessClassifier.log_marginal_likelihood` - and :func:`gaussian_process.GaussianProcessRegressor.log_marginal_likelihood` now - accept a ``clone_kernel=True`` keyword argument. When set to ``False``, - the kernel attribute is modified, but may result in a performance improvement. - :pr:`14378` by :user:`Masashi Shibata `. - - |Feature| Gaussian process models on structured data: :class:`gaussian_process.GaussianProcessRegressor` and :class:`gaussian_process.GaussianProcessClassifier` can now accept a list of generic objects (e.g. strings, trees, graphs, etc.) as the ``X`` argument @@ -431,6 +425,12 @@ Changelog to notify the GPR/GPC model that it handles non-vectorial samples. :pr:`15557` by :user:`Yu-Hang Tang `. +- |Efficiency| :func:`gaussian_process.GaussianProcessClassifier.log_marginal_likelihood` + and :func:`gaussian_process.GaussianProcessRegressor.log_marginal_likelihood` now + accept a ``clone_kernel=True`` keyword argument. When set to ``False``, + the kernel attribute is modified, but may result in a performance improvement. + :pr:`14378` by :user:`Masashi Shibata `. + - |API| From version 0.24 :meth:`gaussian_process.kernels.Kernel.get_params` will raise an ``AttributeError`` rather than return ``None`` for parameters that are in the estimator's constructor but not stored as attributes on the instance. From b0824990e065c0e1b6725cbfd0f0813df1a8df90 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Wed, 4 Dec 2019 03:24:00 -0700 Subject: [PATCH 04/85] DOC fixed Birch default value (#15780) --- sklearn/cluster/_birch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 349ec19c6ff9c..0a16586caae9a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -334,20 +334,20 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): Parameters ---------- - threshold : float, default 0.5 + threshold : float, default=0.5 The radius of the subcluster obtained by merging a new sample and the closest subcluster should be lesser than the threshold. Otherwise a new subcluster is started. Setting this value to be very low promotes splitting and vice-versa. - branching_factor : int, default 50 + branching_factor : int, default=50 Maximum number of CF subclusters in each node. If a new samples enters such that the number of subclusters exceed the branching_factor then that node is split into two nodes with the subclusters redistributed in each. The parent subcluster of that node is removed and two new subclusters are added as parents of the 2 split nodes. - n_clusters : int, instance of sklearn.cluster model, default 3 + n_clusters : int, instance of sklearn.cluster model, default=3 Number of clusters after the final clustering step, which treats the subclusters from the leaves as new samples. 
@@ -361,10 +361,10 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): - `int` : the model fit is :class:`AgglomerativeClustering` with `n_clusters` set to be equal to the int. - compute_labels : bool, default True + compute_labels : bool, default=True Whether or not to compute labels for each fit. - copy : bool, default True + copy : bool, default=True Whether or not to make a copy of the given data. If set to False, the initial data will be overwritten. From 4be82f384c87d15c825ab3c3e4fcb061eadaf3d3 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 12:25:56 -0500 Subject: [PATCH 05/85] STY Minior change on code padding in website theme (#15768) --- doc/themes/scikit-learn-modern/static/css/theme.css | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 9fb35c73c27bd..590d2679223b7 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -37,6 +37,7 @@ code { background-color: #ecf0f3; border-radius: 0.2rem; white-space: nowrap; + padding: 0.15rem; } nav { From 2b0463c8f35d99b473d0ae2b15de4171713f9d26 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 5 Dec 2019 23:39:18 +0100 Subject: [PATCH 06/85] DOC Fix yticklabels order in permutation importances example (#15799) * Fix yticklabels order in permutation importances example * Trigger ci --- .../inspection/plot_permutation_importance_multicollinear.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index eb8b591bb4f2b..5f832ffbd4228 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -60,11 +60,11 @@ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) ax1.barh(tree_indices, clf.feature_importances_[tree_importance_sorted_idx], height=0.7) -ax1.set_yticklabels(data.feature_names) +ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx]) ax1.set_yticks(tree_indices) ax1.set_ylim((0, len(clf.feature_importances_))) ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False, - labels=data.feature_names) + labels=data.feature_names[perm_sorted_idx]) fig.tight_layout() plt.show() From 628d96f7bb81601f102a2dfd831c7825f58b0813 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 5 Dec 2019 18:02:53 -0500 Subject: [PATCH 07/85] STY Update wrapper width (#15793) --- doc/themes/scikit-learn-modern/static/css/theme.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 590d2679223b7..782800eb31915 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -513,7 +513,7 @@ div.sk-sidebar-toc-logo { div.sk-sidebar-toc-wrapper { font-size: 0.9rem; - width: 120%; + width: 252px; overflow-x: hidden; overflow-y: scroll; height: 100vh; From 71870969e7a5e493a48553c41d1dfd03b4004620 Mon Sep 17 00:00:00 2001 From: Matt Hall Date: Fri, 6 Dec 2019 08:15:37 +0000 Subject: [PATCH 08/85] DOC Long sentence was hard to parse and ambiguous in _classification.py (#15769) --- sklearn/metrics/_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 8a975a6f59802..10f91934f79da 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1916,10 +1916,10 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, The reported averages include macro average (averaging the unweighted mean per label), weighted average (averaging the support-weighted mean - per label), sample average (only for multilabel classification) and - micro average (averaging the total true positives, false negatives and - false positives) it is only shown for multi-label or multi-class - with a subset of classes because it is accuracy otherwise. + per label), and sample average (only for multilabel classification). + Micro average (averaging the total true positives, false negatives and + false positives) is only shown for multi-label or multi-class + with a subset of classes, because it corresponds to accuracy otherwise. See also :func:`precision_recall_fscore_support` for more details on averages. From 65e5b05b6f8317b220139332716c0976037840cd Mon Sep 17 00:00:00 2001 From: Kathryn Poole Date: Fri, 6 Dec 2019 12:11:37 +0000 Subject: [PATCH 09/85] DOC Removed duplicate 'classes_' attribute in Naive Bayes classifiers (#15811) --- sklearn/naive_bayes.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 8ebf19125dbf4..585ba69fbb1ce 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -157,9 +157,6 @@ class labels known to the classifier epsilon_ : float absolute additive value to variances - classes_ : array-like, shape (n_classes,) - Unique class labels. - Examples -------- >>> import numpy as np @@ -718,9 +715,6 @@ class MultinomialNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. - classes_ : array-like, shape (n_classes,) - Unique class labels. - Examples -------- >>> import numpy as np @@ -828,9 +822,6 @@ class ComplementNB(_BaseDiscreteNB): Number of samples encountered for each feature during fitting. This value is weighted by the sample weight when provided. - classes_ : array of shape (n_classes,) - The classes labels. - Examples -------- >>> import numpy as np @@ -939,9 +930,6 @@ class BernoulliNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. - classes_ : array of shape (n_classes,) - The classes labels. - See Also ---------- MultinomialNB: The multinomial Naive Bayes classifier is \ From be0323b2c7dab66c406c009b4ab7952150d40f27 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 07:54:12 -0500 Subject: [PATCH 10/85] BUG Fixes pandas dataframe bug with boolean dtypes (#15797) --- doc/whats_new/v0.22.rst | 19 +++++++++++++++++++ sklearn/utils/tests/test_validation.py | 21 +++++++++++++++++++++ sklearn/utils/validation.py | 9 +++++++-- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7b0c031f9196b..af08b832e9f6f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -2,6 +2,25 @@ .. currentmodule:: sklearn +.. _changes_0_22_1: + +Version 0.22.1 +============== + +**In Development** + +This is a bug-fix release to primarily resolve some packaging issues in version +0.22.0. It also includes minor documentation improvements and some bug fixes. + +Changelog +--------- + +:mod:`sklearn.utils` +.................... 
+ +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 56efb98a8b2d8..bdd31f9c4859f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -826,6 +826,27 @@ def test_check_dataframe_warns_on_dtype(): assert len(record) == 0 +def test_check_dataframe_mixed_float_dtypes(): + # pandas dataframe will coerce a boolean into a object, this is a mismatch + # with np.result_type which will return a float + # check_array needs to explicitly check for bool dtype in a dataframe for + # this situation + # https://github.com/scikit-learn/scikit-learn/issues/15787 + + pd = importorskip("pandas") + df = pd.DataFrame({ + 'int': [1, 2, 3], + 'float': [0, 0.1, 2.1], + 'bool': [True, False, True]}, columns=['int', 'float', 'bool']) + + array = check_array(df, dtype=(np.float64, np.float32, np.float16)) + expected_array = np.array( + [[1.0, 0.0, 1.0], + [2.0, 0.1, 0.0], + [3.0, 2.1, 1.0]], dtype=np.float) + assert_allclose_dense_sparse(array, expected_array) + + class DummyMemory: def cache(self, func): return func diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 424cf4b5180a3..fb34f3b3cccbd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -454,9 +454,14 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - dtypes_orig = np.array(array.dtypes) + dtypes_orig = list(array.dtypes) + # pandas boolean dtype __array__ interface coerces bools to objects + for i, dtype_iter in enumerate(dtypes_orig): + if dtype_iter.kind == 'b': + dtypes_orig[i] = np.object + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): - dtype_orig = np.result_type(*array.dtypes) + dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From e547c90814bf5da385544bfeb7429a3a239fbafa Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 08:18:56 -0500 Subject: [PATCH 11/85] BUG Returns only public estimators in all_estimators (#15380) --- sklearn/tests/test_common.py | 2 - sklearn/utils/__init__.py | 40 ++++++++++++-------- sklearn/utils/tests/test_estimator_checks.py | 9 +++++ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 221dd52834c90..9fc3075c5fe28 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -78,8 +78,6 @@ def _tested_estimators(): for name, Estimator in all_estimators(): if issubclass(Estimator, BiclusterMixin): continue - if name.startswith("_"): - continue try: estimator = _construct_instance(Estimator) except SkipTest: diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4d4ef606341ca..c20ea5ab11d31 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ import pkgutil import inspect +from importlib import import_module from operator import itemgetter from collections.abc import Sequence from contextlib import contextmanager @@ -12,6 +13,7 @@ import platform import struct import timeit +from pathlib import Path import warnings import numpy as np @@ -1155,7 +1157,6 @@ def all_estimators(include_meta_estimators=None, 
and ``class`` is the actuall type of the class. """ # lazy import to avoid circular imports from sklearn.base - import sklearn from ._testing import ignore_warnings from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin) @@ -1183,20 +1184,29 @@ def is_abstract(c): DeprecationWarning) all_classes = [] - # get parent folder - path = sklearn.__path__ - for importer, modname, ispkg in pkgutil.walk_packages( - path=path, prefix='sklearn.', onerror=lambda x: None): - if ".tests." in modname or "externals" in modname: - continue - if IS_PYPY and ('_svmlight_format' in modname or - 'feature_extraction._hashing' in modname): - continue - # Ignore deprecation warnings triggered at import time. - with ignore_warnings(category=FutureWarning): - module = __import__(modname, fromlist="dummy") - classes = inspect.getmembers(module, inspect.isclass) - all_classes.extend(classes) + modules_to_ignore = {"tests", "externals", "setup", "conftest"} + root = str(Path(__file__).parent.parent) # sklearn package + # Ignore deprecation warnings triggered at import time and from walking + # packages + with ignore_warnings(category=FutureWarning): + for importer, modname, ispkg in pkgutil.walk_packages( + path=[root], prefix='sklearn.'): + mod_parts = modname.split(".") + if (any(part in modules_to_ignore for part in mod_parts) + or '._' in modname): + continue + module = import_module(modname) + classes = inspect.getmembers(module, inspect.isclass) + classes = [(name, est_cls) for name, est_cls in classes + if not name.startswith("_")] + + # TODO: Remove when FeatureHasher is implemented in PYPY + # Skips FeatureHasher for PYPY + if IS_PYPY and 'feature_extraction' in modname: + classes = [(name, est_cls) for name, est_cls in classes + if name == "FeatureHasher"] + + all_classes.extend(classes) all_classes = set(all_classes) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index f0c014829483f..15b423d6e0ce8 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -34,6 +34,7 @@ from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils import all_estimators class CorrectNotFittedError(ValueError): @@ -572,6 +573,14 @@ def test_check_class_weight_balanced_linear_classifier(): BadBalancedWeightsClassifier) +def test_all_estimators_all_public(): + # all_estimator should not fail when pytest is not installed and return + # only public estimators + estimators = all_estimators() + for est in estimators: + assert not est.__class__.__name__.startswith("_") + + if __name__ == '__main__': # This module is run as a script to check that we have no dependency on # pytest for estimator checks. 
From 1237e40569b2c64370275134e1c10ad360c8b524 Mon Sep 17 00:00:00 2001 From: lucyleeow Date: Fri, 6 Dec 2019 17:28:30 +0100 Subject: [PATCH 12/85] DOC improve doc for multiclass and types_of_target (#15333) --- doc/modules/multiclass.rst | 197 +++++++++++++++++++++++++++++-------- 1 file changed, 155 insertions(+), 42 deletions(-) diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 195ecc0adcf6f..5613fc2334e73 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -14,45 +14,138 @@ Multiclass and multilabel algorithms The :mod:`sklearn.multiclass` module implements *meta-estimators* to solve ``multiclass`` and ``multilabel`` classification problems -by decomposing such problems into binary classification problems. Multitarget +by decomposing such problems into binary classification problems. ``multioutput`` regression is also supported. -- **Multiclass classification** means a classification task with more than - two classes; e.g., classify a set of images of fruits which may be oranges, - apples, or pears. Multiclass classification makes the assumption that each - sample is assigned to one and only one label: a fruit can be either an - apple or a pear but not both at the same time. - -- **Multilabel classification** assigns to each sample a set of target - labels. This can be thought as predicting properties of a data-point - that are not mutually exclusive, such as topics that are relevant for a - document. A text might be about any of religion, politics, finance or - education at the same time or none of these. - -- **Multioutput regression** assigns each sample a set of target - values. This can be thought of as predicting several properties - for each data-point, such as wind direction and magnitude at a - certain location. - -- **Multioutput-multiclass classification** and **multi-task classification** - means that a single estimator has to handle several joint classification - tasks. This is both a generalization of the multi-label classification - task, which only considers binary classification, as well as a - generalization of the multi-class classification task. *The output format - is a 2d numpy array or sparse matrix.* - - The set of labels can be different for each output variable. - For instance, a sample could be assigned "pear" for an output variable that - takes possible values in a finite set of species such as "pear", "apple"; - and "blue" or "green" for a second output variable that takes possible values - in a finite set of colors such as "green", "red", "blue", "yellow"... - - This means that any classifiers handling multi-output - multiclass or multi-task classification tasks, - support the multi-label classification task as a special case. - Multi-task classification is similar to the multi-output - classification task with different model formulations. For - more information, see the relevant estimator documentation. +- **Multiclass classification**: classification task with more than two classes. + Each sample can only be labelled as one class. + + For example, classification using features extracted from a set of images of + fruit, where each image may either be of an orange, an apple, or a pear. + Each image is one sample and is labelled as one of the 3 possible classes. + Multiclass classification makes the assumption that each sample is assigned + to one and only one label - one sample cannot, for example, be both a pear + and an apple. 
+ + Valid :term:`multiclass` representations for + :func:`~utils.multiclass.type_of_target` (`y`) are: + + - 1d or column vector containing more than two discrete values. An + example of a vector ``y`` for 3 samples: + + >>> import numpy as np + >>> y = np.array(['apple', 'pear', 'apple']) + >>> print(y) + ['apple' 'pear' 'apple'] + + - sparse :term:`binary` matrix of shape ``(n_samples, n_classes)`` with a + single element per row, where each column represents one class. An + example of a sparse :term:`binary` matrix ``y`` for 3 samples, where + the columns, in order, are orange, apple and pear: + + >>> from scipy import sparse + >>> row_ind = np.array([0, 1, 2]) + >>> col_ind = np.array([1, 2, 1]) + >>> y_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind))) + >>> print(y_sparse) + (0, 1) 1.0 + (1, 2) 1.0 + (2, 1) 1.0 + + +- **Multilabel classification**: classification task labelling each sample with + ``x`` labels from ``n_classes`` possible classes, where ``x`` can be 0 to + ``n_classes`` inclusive. This can be thought of as predicting properties of a + sample that are not mutually exclusive. Formally, a binary output is assigned + to each class, for every sample. Positive classes are indicated with 1 and + negative classes with 0 or -1. It is thus comparable to running ``n_classes`` + binary classification tasks, for example with + :class:`sklearn.multioutput.MultiOutputClassifier`. This approach treats + each label independently whereas multilabel classifiers *may* treat the + multiple classes simultaneously, accounting for correlated behaviour amoung + them. + + For example, prediction of the topics relevant to a text document or video. + The document or video may be about one of 'religion', 'politics', 'finance' + or 'education', several of the topic classes or all of the topic classes. + + Valid representation of :term:`multilabel` `y` is either dense (or sparse) + :term:`binary` matrix of shape ``(n_samples, n_classes)``. Each column + represents a class. The ``1``'s in each row denote the positive classes a + sample has been labelled with. An example of a dense matrix ``y`` for 3 + samples: + + >>> y = np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]) + >>> print(y) + [[1 0 0 1] + [0 0 1 1] + [0 0 0 0]] + + An example of the same ``y`` in sparse matrix form: + + >>> y_sparse = sparse.csr_matrix(y) + >>> print(y_sparse) + (0, 0) 1 + (0, 3) 1 + (1, 2) 1 + (1, 3) 1 + + +- **Multioutput regression**: predicts multiple numerical properties for each + sample. Each property is a numerical variable and the number of properties + to be predicted for each sample is greater than or equal to 2. Some estimators + that support multioutput regression are faster than just running ``n_output`` + estimators. + + For example, prediction of both wind speed and wind direction, in degrees, + using data obtained at a certain location. Each sample would be data + obtained at one location and both wind speed and directtion would be + output for each sample. + + Valid representation of :term:`multilabel` `y` is dense matrix of shape + ``(n_samples, n_classes)`` of floats. A column wise concatenation of + :term:`continuous` variables. An example of ``y`` for 3 samples: + + >>> y = np.array([[31.4, 94], [40.5, 109], [25.0, 30]]) + >>> print(y) + [[ 31.4 94. ] + [ 40.5 109. ] + [ 25. 30. ]] + + +- **Multioutput-multiclass classification** + (also known as **multitask classification**): + classification task which labels each sample with a set of **non-binary** + properties. 
Both the number of properties and the number of + classes per property is greater than 2. A single estimator thus + handles several joint classification tasks. This is both a generalization of + the multi\ *label* classification task, which only considers binary + attributes, as well as a generalization of the multi\ *class* classification + task, where only one property is considered. + + For example, classification of the properties "type of fruit" and "colour" + for a set of images of fruit. The property "type of fruit" has the possible + classes: "apple", "pear" and "orange". The property "colour" has the + possible classes: "green", "red", "yellow" and "orange". Each sample is an + image of a fruit, a label is output for both properties and each label is + one of the possible classes of the corresponding property. + + Valid representation of :term:`multilabel` `y` is dense matrix of shape + ``(n_samples, n_classes)`` of floats. A column wise concatenation of 1d + :term:`multiclass` variables. An example of ``y`` for 3 samples: + + >>> y = np.array([['apple', 'green'], ['orange', 'orange'], ['pear', 'green']]) + >>> print(y) + [['apple' 'green'] + ['orange' 'orange'] + ['pear' 'green']] + + Note that any classifiers handling multioutput-multiclass (also known as + multitask classification) tasks, support the multilabel classification task + as a special case. Multitask classification is similar to the multioutput + classification task with different model formulations. For more information, + see the relevant estimator documentation. + All scikit-learn classifiers are capable of multiclass classification, but the meta-estimators offered by :mod:`sklearn.multiclass` @@ -60,6 +153,26 @@ permit changing the way they handle more than two classes because this may have an effect on classifier performance (either in terms of generalization error or required computational resources). 
+**Summary** + ++-----------------+-------------+-------------+------------------------------------------+ +| | Number of | Target | Valid | +| | targets | cardinality | :func:`~utils.multiclass.type_of_target` | ++=================+=============+=============+==========================================+ +| Multiclass | 1 | >2 | - 'multiclass' | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multilabel | >1 | 2 (0 or 1) | - 'multilabel-indicator' | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multioutput | >1 | Continuous | - 'continuous-multioutput' | +| regression | | | | ++-----------------+-------------+-------------+------------------------------------------+ +| Multioutput- | >1 | >2 | - 'multiclass-multioutput' | +| multiclass | | | | +| classification | | | | ++-----------------+-------------+-------------+------------------------------------------+ + Below is a summary of the classifiers supported by scikit-learn grouped by strategy; you don't need the meta-estimators in this class if you're using one of these, unless you want custom multiclass behavior: @@ -94,7 +207,7 @@ if you're using one of these, unless you want custom multiclass behavior: - :class:`sklearn.gaussian_process.GaussianProcessClassifier` (setting multi_class = "one_vs_one") -- **Multiclass as One-Vs-All:** +- **Multiclass as One-Vs-The-Rest:** - :class:`sklearn.ensemble.GradientBoostingClassifier` - :class:`sklearn.gaussian_process.GaussianProcessClassifier` (setting multi_class = "one_vs_rest") @@ -167,7 +280,7 @@ This strategy, also known as **one-vs-all**, is implemented in per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only `n_classes` classifiers are needed), one advantage of this approach is its -interpretability. Since each class is represented by one and only one classifier, +interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice. @@ -431,7 +544,7 @@ averaged together. Regressor Chain ================ -Regressor chains (see :class:`RegressorChain`) is analogous to -ClassifierChain as a way of combining a number of regressions -into a single multi-target model that is capable of exploiting +Regressor chains (see :class:`RegressorChain`) is analogous to +ClassifierChain as a way of combining a number of regressions +into a single multi-target model that is capable of exploiting correlations among targets. 
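
Note on the multiclass documentation patch above: the four target types listed in its summary table ('multiclass', 'multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') are the strings that :func:`~utils.multiclass.type_of_target` reports for the ``y`` layouts the patch describes. A minimal sketch follows; the sample arrays are illustrative values modelled on the doctests added by the patch, not taken from it verbatim:

    # Sketch: how type_of_target classifies the y layouts described above.
    # The four returned strings match the patch's summary table.
    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    y_multiclass = np.array(['apple', 'pear', 'orange'])           # one label per sample, >2 classes
    y_multilabel = np.array([[1, 0, 0, 1], [0, 0, 1, 1]])          # binary indicator matrix
    y_multioutput_reg = np.array([[31.4, 94.0], [40.5, 109.0]])    # several continuous targets
    y_multitask = np.array([['apple', 'green'], ['pear', 'red']])  # several non-binary targets

    print(type_of_target(y_multiclass))       # 'multiclass'
    print(type_of_target(y_multilabel))       # 'multilabel-indicator'
    print(type_of_target(y_multioutput_reg))  # 'continuous-multioutput'
    print(type_of_target(y_multitask))        # 'multiclass-multioutput'
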
From 444ab2b843446d1cf7a3e146bc949df3b230040e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 22:54:40 -0600 Subject: [PATCH 13/85] TST Increases tol for check_pca_float_dtype_preservation assertion (#15775) --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 826a8cc082c3a..d2c5452c10461 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -532,7 +532,7 @@ def check_pca_float_dtype_preservation(svd_solver): assert pca_64.transform(X_64).dtype == np.float64 assert pca_32.transform(X_32).dtype == np.float32 - assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4) + assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4) def check_pca_int_dtype_upcast_to_double(svd_solver): From f0f812a341faf22701e05e4976012dc26fdd72ef Mon Sep 17 00:00:00 2001 From: JJmistry Date: Sun, 8 Dec 2019 18:23:59 +0000 Subject: [PATCH 14/85] update _alpha_grid class in _coordinate_descent.py (#15835) --- sklearn/linear_model/_coordinate_descent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index efe5612845157..30ccb0c9f702f 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -43,26 +43,26 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, y : ndarray, shape (n_samples,) Target values - Xy : array-like, optional + Xy : array-like, default=None Xy = np.dot(X.T, y) that can be precomputed. - l1_ratio : float + l1_ratio : float, default=1.0 The elastic net mixing parameter, with ``0 < l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not supported) ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. - eps : float, optional + eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3`` - n_alphas : int, optional + n_alphas : int, default=100 Number of alphas along the regularization path - fit_intercept : boolean, default True + fit_intercept : boolean, default=True Whether to fit an intercept or not - normalize : boolean, optional, default False + normalize : boolean, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. @@ -70,7 +70,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True + copy_X : boolean, optional, default=True If ``True``, X will be copied; else, it may be overwritten. """ if l1_ratio == 0: From 1fa5f22d16d8e51bbb11b642a76e05e7fe656cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Mon, 9 Dec 2019 08:45:54 +0100 Subject: [PATCH 15/85] FIX Explicit conversion of ndarray to object dtype. 
(#15832) --- sklearn/neighbors/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1ff45332b1e70..3a1fdadfb94b7 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -276,8 +276,8 @@ def _radius_neighbors_from_graph(graph, radius, return_distance): indices = indices.astype(np.intp, copy=no_filter_needed) if return_distance: - neigh_dist = np.array(np.split(data, indptr[1:-1])) - neigh_ind = np.array(np.split(indices, indptr[1:-1])) + neigh_dist = np.array(np.split(data, indptr[1:-1]), dtype=object) + neigh_ind = np.array(np.split(indices, indptr[1:-1]), dtype=object) if return_distance: return neigh_dist, neigh_ind From f244e51bc2f5d7e722d46644bd4bb2c6d8a0d0e3 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Dec 2019 07:17:43 -0600 Subject: [PATCH 16/85] BLD Parallelize sphinx builds on circle ci (#15745) --- doc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/Makefile b/doc/Makefile index 11c5d58749bec..1cbce7dba9662 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. -SPHINXOPTS = +SPHINXOPTS = -j auto SPHINXBUILD ?= sphinx-build PAPER = BUILDDIR = _build From 42a4bfdc110d60e8510613ab14bb935f005a0bd4 Mon Sep 17 00:00:00 2001 From: SylvainLan Date: Tue, 10 Dec 2019 13:58:42 +0100 Subject: [PATCH 17/85] DOC correct url for preprocessing (#15853) --- sklearn/utils/optimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 3534d85f1edef..fa682e8c2d97d 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -239,7 +239,7 @@ def _check_optimize_result(solver, result, max_iter=None, "Increase the number of iterations (max_iter) " "or scale the data as shown in:\n" " https://scikit-learn.org/stable/modules/" - "preprocessing.html." 
+ "preprocessing.html" ).format(solver, result.status, result.message.decode("latin1")) if extra_warning_msg is not None: warning_msg += "\n" + extra_warning_msg From 6887f4591b3702c0e9558f765c305d19db28498b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 10 Dec 2019 08:03:36 -0500 Subject: [PATCH 18/85] MNT avoid generating too many cross links in examples (#15844) --- doc/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 7959a0862f547..0386f7676e0be 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -304,7 +304,9 @@ def __call__(self, directory): 'branch': binder_branch, 'dependencies': './binder/requirements.txt', 'use_jupyter_lab': True - } + }, + # avoid generating too many cross links + 'inspect_global_variables': False, } From 94876f493ca49c4b624748bc10337a3535699007 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 10 Dec 2019 21:07:22 +0800 Subject: [PATCH 19/85] DOC Correct wrong doc in precision_recall_fscore_support (#15833) --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 10f91934f79da..67f88177df4bd 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1426,7 +1426,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, fbeta_score : float (if average is not None) or array of float, shape =\ [n_unique_labels] - support : int (if average is not None) or array of int, shape =\ + support : None (if average is not None) or array of int, shape =\ [n_unique_labels] The number of occurrences of each label in ``y_true``. From db288e12670aa30428fa87503e537e8456137534 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 10 Dec 2019 05:14:00 -0800 Subject: [PATCH 20/85] DOC add comment in check_pca_float_dtype_preservation (#15819) Documenting the changes in https://github.com/scikit-learn/scikit-learn/pull/15775 --- sklearn/decomposition/tests/test_pca.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index d2c5452c10461..65624215b1158 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -532,6 +532,9 @@ def check_pca_float_dtype_preservation(svd_solver): assert pca_64.transform(X_64).dtype == np.float64 assert pca_32.transform(X_32).dtype == np.float32 + # the rtol is set such that the test passes on all platforms tested on + # conda-forge: PR#15775 + # see: https://github.com/conda-forge/scikit-learn-feedstock/pull/113 assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4) From 50b3304ed2d8f2f888093d97af604872f3c363d8 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 10 Dec 2019 21:46:57 +0800 Subject: [PATCH 21/85] DOC correct indents in docstring _split.py (#15843) --- sklearn/model_selection/_split.py | 66 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index ff3a3ba5bf365..9a85b4049a3c3 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -135,10 +135,10 @@ class LeaveOneOut(BaseCrossValidator): >>> print(loo) LeaveOneOut() >>> for train_index, test_index in loo.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [1] TEST: [0] [[3 4]] [[1 2]] [2] [1] TRAIN: [0] TEST: [1] @@ -222,9 +222,9 @@ class LeavePOut(BaseCrossValidator): >>> print(lpo) LeavePOut(p=2) >>> for train_index, test_index in lpo.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [2 3] TEST: [0 1] TRAIN: [1 3] TEST: [0 2] TRAIN: [1 2] TEST: [0 3] @@ -398,9 +398,9 @@ class KFold(_BaseKFold): >>> print(kf) KFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in kf.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [2 3] TEST: [0 1] TRAIN: [0 1] TEST: [2 3] @@ -604,9 +604,9 @@ class StratifiedKFold(_BaseKFold): >>> print(skf) StratifiedKFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in skf.split(X, y): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [1 3] TEST: [0 2] TRAIN: [0 2] TEST: [1 3] @@ -769,9 +769,9 @@ class TimeSeriesSplit(_BaseKFold): >>> print(tscv) TimeSeriesSplit(max_train_size=None, n_splits=5) >>> for train_index, test_index in tscv.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [0] TEST: [1] TRAIN: [0 1] TEST: [2] TRAIN: [0 1 2] TEST: [3] @@ -861,10 +861,10 @@ class LeaveOneGroupOut(BaseCrossValidator): >>> print(logo) LeaveOneGroupOut() >>> for train_index, test_index in logo.split(X, y, groups): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [2 3] TEST: [0 1] [[5 6] [7 8]] [[1 2] @@ -980,10 +980,10 @@ class LeavePGroupsOut(BaseCrossValidator): >>> print(lpgo) LeavePGroupsOut(n_groups=2) >>> for train_index, test_index in lpgo.split(X, y, groups): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... 
print(X_train, X_test, y_train, y_test) + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) TRAIN: [2] TEST: [0 1] [[5 6]] [[1 2] [3 4]] [1] [1 2] @@ -1405,7 +1405,7 @@ class ShuffleSplit(BaseShuffleSplit): >>> print(rs) ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) >>> for train_index, test_index in rs.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) + ... print("TRAIN:", train_index, "TEST:", test_index) TRAIN: [1 3 0 4] TEST: [5 2] TRAIN: [4 0 2 5] TEST: [1 3] TRAIN: [1 2 4 0] TEST: [3 5] @@ -1414,7 +1414,7 @@ class ShuffleSplit(BaseShuffleSplit): >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, ... random_state=0) >>> for train_index, test_index in rs.split(X): - ... print("TRAIN:", train_index, "TEST:", test_index) + ... print("TRAIN:", train_index, "TEST:", test_index) TRAIN: [1 3 0] TEST: [5 2] TRAIN: [4 0 2] TEST: [1 3] TRAIN: [1 2 4] TEST: [3 5] @@ -1508,7 +1508,7 @@ class GroupShuffleSplit(ShuffleSplit): >>> gss.get_n_splits() 2 >>> for train_idx, test_idx in gss.split(X, y, groups): - ... print("TRAIN:", train_idx, "TEST:", test_idx) + ... print("TRAIN:", train_idx, "TEST:", test_idx) TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' @@ -1620,9 +1620,9 @@ class StratifiedShuffleSplit(BaseShuffleSplit): >>> print(sss) StratifiedShuffleSplit(n_splits=5, random_state=0, ...) >>> for train_index, test_index in sss.split(X, y): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [5 2 3] TEST: [4 1 0] TRAIN: [5 1 4] TEST: [0 2 3] TRAIN: [5 0 2] TEST: [4 3 1] @@ -1837,9 +1837,9 @@ class PredefinedSplit(BaseCrossValidator): >>> print(ps) PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) >>> for train_index, test_index in ps.split(): - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] TRAIN: [1 2 3] TEST: [0] TRAIN: [0 2] TEST: [1 3] """ From 548b190b2c53161b59827a46b54dfc6918003bd4 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 08:56:17 -0700 Subject: [PATCH 22/85] DOC fix docstring of KMeans based on sklearn guideline (#15754) --- sklearn/cluster/_k_means.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py index 52f2b5fee4dac..f470d61423b2c 100644 --- a/sklearn/cluster/_k_means.py +++ b/sklearn/cluster/_k_means.py @@ -654,11 +654,12 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Parameters ---------- - n_clusters : int, optional, default: 8 + n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. 
- init : {'k-means++', 'random' or an ndarray} + init : {'k-means++', 'random'} or ndarray of shape \ + (n_clusters, n_features), default='k-means++' Method for initialization, defaults to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean @@ -671,19 +672,19 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - n_init : int, default: 10 + n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. - max_iter : int, default: 300 + max_iter : int, default=300 Maximum number of iterations of the k-means algorithm for a single run. - tol : float, default: 1e-4 + tol : float, default=1e-4 Relative tolerance with regards to inertia to declare convergence. - precompute_distances : {'auto', True, False} + precompute_distances : 'auto' or bool, default='auto' Precompute distances (faster but takes more memory). 'auto' : do not precompute distances if n_samples * n_clusters > 12 @@ -694,15 +695,15 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): False : never precompute distances. - verbose : int, default 0 + verbose : int, default=0 Verbosity mode. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. - copy_x : bool, optional + copy_x : bool, default=True When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True (default), then the original data is not modified, ensuring X is C-contiguous. If False, the original data @@ -711,7 +712,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): the data mean, in this case it will also not ensure that data is C-contiguous which may cause a significant slowdown. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. @@ -719,7 +720,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. - algorithm : "auto", "full" or "elkan", default="auto" + algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". The "elkan" variation is more efficient by using the triangle inequality, but currently doesn't support sparse data. "auto" chooses @@ -727,12 +728,12 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Attributes ---------- - cluster_centers_ : array, [n_clusters, n_features] + cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. If the algorithm stops before fully converging (see ``tol`` and ``max_iter``), these will not be consistent with ``labels_``. 
- labels_ : array, shape (n_samples,) + labels_ : ndarray of shape (n_samples,) Labels of each point inertia_ : float From f1feacaf58bff08aa70a31a29e83445d32ae2774 Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Tue, 10 Dec 2019 09:05:18 -0700 Subject: [PATCH 23/85] DOC fix docstring of AgglomerativeClustering based on sklearn guideline (#15764) --- sklearn/cluster/_hierarchical.py | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index eb3b989c7c815..56195209f125c 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -663,23 +663,23 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): Parameters ---------- - n_clusters : int or None, optional (default=2) + n_clusters : int or None, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default: "euclidean" + affinity : str or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed". If linkage is "ward", only "euclidean" is accepted. If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - connectivity : array-like or callable, optional + connectivity : array-like or callable, default=None Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms @@ -687,17 +687,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. - compute_full_tree : bool or 'auto' (optional) - Stop early the construction of the tree at n_clusters. This is - useful to decrease computation time if the number of clusters is - not small compared to the number of samples. This option is - useful only when specifying a connectivity matrix. Note also that - when varying the number of clusters and using caching, it may - be advantageous to compute the full tree. It must be ``True`` if - ``distance_threshold`` is not ``None``. - - linkage : {"ward", "complete", "average", "single"}, optional \ - (default="ward") + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at n_clusters. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of samples. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is + equivalent to `False`. + + linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. 
The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -710,7 +712,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. - distance_threshold : float, optional (default=None) + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. @@ -724,7 +726,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): ``distance_threshold=None``, it will be equal to the given ``n_clusters``. - labels_ : array [n_samples] + labels_ : ndarray of shape (n_samples) cluster labels for each point n_leaves_ : int @@ -733,7 +735,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): n_connected_components_ : int The estimated number of connected components in the graph. - children_ : array-like, shape (n_samples-1, 2) + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. A node `i` greater than or equal to `n_samples` is a non-leaf From b085f25c40b3aec42c4f3fcdeb4ff67df9c87389 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 09:07:39 -0700 Subject: [PATCH 24/85] DOC fix docstring of AffinityPropagation based on sklearn guideline (#15777) --- sklearn/cluster/_affinity_propagation.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 3393e0686bd02..eaba9ccf1ec20 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -242,51 +242,51 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): Parameters ---------- - damping : float, optional, default: 0.5 + damping : float, default=0.5 Damping factor (between 0.5 and 1) is the extent to which the current value is maintained relative to incoming values (weighted 1 - damping). This in order to avoid numerical oscillations when updating these values (messages). - max_iter : int, optional, default: 200 + max_iter : int, default=200 Maximum number of iterations. - convergence_iter : int, optional, default: 15 + convergence_iter : int, default=15 Number of iterations with no change in the number of estimated clusters that stops the convergence. - copy : boolean, optional, default: True + copy : bool, default=True Make a copy of input data. - preference : array-like, shape (n_samples,) or float, optional + preference : array-like of shape (n_samples,) or float, default=None Preferences for each point - points with larger values of preferences are more likely to be chosen as exemplars. The number of exemplars, ie of clusters, is influenced by the input preferences value. If the preferences are not passed as arguments, they will be set to the median of the input similarities. - affinity : string, optional, default=``euclidean`` - Which affinity to use. At the moment ``precomputed`` and - ``euclidean`` are supported. ``euclidean`` uses the + affinity : {'euclidean', 'precomputed'}, default='euclidean' + Which affinity to use. At the moment 'precomputed' and + ``euclidean`` are supported. 'euclidean' uses the negative squared euclidean distance between points. 
- verbose : boolean, optional, default: False + verbose : bool, default=False Whether to be verbose. Attributes ---------- - cluster_centers_indices_ : array, shape (n_clusters,) + cluster_centers_indices_ : ndarray of shape (n_clusters,) Indices of cluster centers - cluster_centers_ : array, shape (n_clusters, n_features) + cluster_centers_ : ndarray of shape (n_clusters, n_features) Cluster centers (if affinity != ``precomputed``). - labels_ : array, shape (n_samples,) + labels_ : ndarray of shape (n_samples,) Labels of each point - affinity_matrix_ : array, shape (n_samples, n_samples) + affinity_matrix_ : ndarray of shape (n_samples, n_samples) Stores the affinity matrix used in ``fit``. n_iter_ : int From e22cb6f65bb6c90f38cd7242318d6bae739da9d8 Mon Sep 17 00:00:00 2001 From: cgsavard Date: Tue, 10 Dec 2019 09:09:45 -0700 Subject: [PATCH 25/85] DOC fixed SpectralCoclustering and SpectralBiclustering docstrings following sklearn guideline (#15778) --- sklearn/cluster/_bicluster.py | 70 ++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 5bfd335549012..cced1674e167b 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -191,10 +191,10 @@ class SpectralCoclustering(BaseSpectral): Parameters ---------- - n_clusters : integer, optional, default: 3 + n_clusters : int, default=3 The number of biclusters to find. - svd_method : string, optional, default: 'randomized' + svd_method : {'randomized', 'arpack'}, default='randomized' Selects the algorithm for finding singular vectors. May be 'randomized' or 'arpack'. If 'randomized', use :func:`sklearn.utils.extmath.randomized_svd`, which may be faster @@ -202,20 +202,21 @@ class SpectralCoclustering(BaseSpectral): :func:`scipy.sparse.linalg.svds`, which is more accurate, but possibly slower in some cases. - n_svd_vecs : int, optional, default: None + n_svd_vecs : int, default=None Number of vectors to use in calculating the SVD. Corresponds to `ncv` when `svd_method=arpack` and `n_oversamples` when `svd_method` is 'randomized`. - mini_batch : bool, optional, default: False + mini_batch : bool, default=False Whether to use mini-batch k-means, which is faster but may get different results. - init : {'k-means++', 'random' or an ndarray} - Method for initialization of k-means algorithm; defaults to - 'k-means++'. + init : {'k-means++', 'random', or ndarray of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. - n_init : int, optional, default: 10 + n_init : int, default=10 Number of random initializations that are tried with the k-means algorithm. @@ -223,7 +224,7 @@ class SpectralCoclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by breaking down the pairwise matrix into n_jobs even slices and computing them in parallel. @@ -232,24 +233,24 @@ class SpectralCoclustering(BaseSpectral): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. 
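To make the ``damping`` and ``preference`` parameters described above concrete, here is a minimal sketch; the data and the parameter values are illustrative assumptions only::

    import numpy as np
    from sklearn.cluster import AffinityPropagation

    X = np.array([[1, 2], [1, 4], [1, 0],
                  [4, 2], [4, 4], [4, 0]])
    # a more negative preference favours fewer exemplars (clusters)
    ap = AffinityPropagation(damping=0.9, preference=-50).fit(X)
    print(ap.cluster_centers_indices_)  # indices of the chosen exemplars
    print(ap.labels_)                   # cluster label of each sample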
Use an int to make the randomness deterministic. See :term:`Glossary `. Attributes ---------- - rows_ : array-like, shape (n_row_clusters, n_rows) + rows_ : array-like of shape (n_row_clusters, n_rows) Results of the clustering. `rows[i, r]` is True if cluster `i` contains row `r`. Available only after calling ``fit``. - columns_ : array-like, shape (n_column_clusters, n_columns) + columns_ : array-like of shape (n_column_clusters, n_columns) Results of the clustering, like `rows`. - row_labels_ : array-like, shape (n_rows,) + row_labels_ : array-like of shape (n_rows,) The bicluster label of each row. - column_labels_ : array-like, shape (n_cols,) + column_labels_ : array-like of shape (n_cols,) The bicluster label of each column. Examples @@ -319,26 +320,28 @@ class SpectralBiclustering(BaseSpectral): Parameters ---------- - n_clusters : integer or tuple (n_row_clusters, n_column_clusters) + n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3 The number of row and column clusters in the checkerboard structure. - method : string, optional, default: 'bistochastic' + method : {'bistochastic', 'scale', 'log'}, default='bistochastic' Method of normalizing and converting singular vectors into biclusters. May be one of 'scale', 'bistochastic', or 'log'. The authors recommend using 'log'. If the data is sparse, however, log normalization will not work, which is why the - default is 'bistochastic'. CAUTION: if `method='log'`, the - data must not be sparse. + default is 'bistochastic'. - n_components : integer, optional, default: 6 + .. warning:: + if `method='log'`, the data must be sparse. + + n_components : int, default=6 Number of singular vectors to check. - n_best : integer, optional, default: 3 + n_best : int, default=3 Number of best singular vectors to which to project the data for clustering. - svd_method : string, optional, default: 'randomized' + svd_method : {'randomized', 'arpack'}, default='randomized' Selects the algorithm for finding singular vectors. May be 'randomized' or 'arpack'. If 'randomized', uses :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster @@ -346,20 +349,21 @@ class SpectralBiclustering(BaseSpectral): `scipy.sparse.linalg.svds`, which is more accurate, but possibly slower in some cases. - n_svd_vecs : int, optional, default: None + n_svd_vecs : int, default=None Number of vectors to use in calculating the SVD. Corresponds to `ncv` when `svd_method=arpack` and `n_oversamples` when `svd_method` is 'randomized`. - mini_batch : bool, optional, default: False + mini_batch : bool, default=False Whether to use mini-batch k-means, which is faster but may get different results. - init : {'k-means++', 'random' or an ndarray} - Method for initialization of k-means algorithm; defaults to - 'k-means++'. + init : {'k-means++', 'random'} or ndarray of (n_clusters, n_features), \ + default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. - n_init : int, optional, default: 10 + n_init : int, default=10 Number of random initializations that are tried with the k-means algorithm. @@ -367,7 +371,7 @@ class SpectralBiclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of jobs to use for the computation. This works by breaking down the pairwise matrix into n_jobs even slices and computing them in parallel. 
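A short usage sketch for the co-clustering estimator documented above; the random data matrix and the ``n_clusters`` value are assumptions made only to exercise the attributes listed in the hunk::

    import numpy as np
    from sklearn.cluster import SpectralCoclustering

    rng = np.random.RandomState(0)
    X = rng.rand(20, 15)                      # small random data matrix
    model = SpectralCoclustering(n_clusters=3, svd_method='randomized',
                                 random_state=0)
    model.fit(X)
    print(model.row_labels_)     # bicluster label of each row
    print(model.column_labels_)  # bicluster label of each column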
@@ -376,24 +380,24 @@ class SpectralBiclustering(BaseSpectral): ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. Attributes ---------- - rows_ : array-like, shape (n_row_clusters, n_rows) + rows_ : array-like of shape (n_row_clusters, n_rows) Results of the clustering. `rows[i, r]` is True if cluster `i` contains row `r`. Available only after calling ``fit``. - columns_ : array-like, shape (n_column_clusters, n_columns) + columns_ : array-like of shape (n_column_clusters, n_columns) Results of the clustering, like `rows`. - row_labels_ : array-like, shape (n_rows,) + row_labels_ : array-like of shape (n_rows,) Row partition labels. - column_labels_ : array-like, shape (n_cols,) + column_labels_ : array-like of shape (n_cols,) Column partition labels. Examples From 25030f5dab4db84f78b7f6f04c61103d624e4034 Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Tue, 10 Dec 2019 09:36:39 -0700 Subject: [PATCH 26/85] DOC fix FeatureAgglomeration and MiniBatchKMeans docstring following sklearn guideline (#15809) --- sklearn/cluster/_hierarchical.py | 44 +++++++++++++++++--------------- sklearn/cluster/_k_means.py | 29 +++++++++++---------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index 56195209f125c..d39fb7ba64f5d 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -696,8 +696,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. By default `compute_full_tree` is "auto", which is equivalent to `True` when `distance_threshold` is not `None` or that `n_clusters` - is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is - equivalent to `False`. + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. The linkage criterion determines which @@ -911,21 +911,21 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): Parameters ---------- - n_clusters : int or None, optional (default=2) + n_clusters : int, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default "euclidean" + affinity : str or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - connectivity : array-like or callable, optional + connectivity : array-like or callable, default=None Connectivity matrix. Defines for each feature the neighboring features following a given structure of the data. 
This can be a connectivity matrix itself or a callable that transforms @@ -933,17 +933,19 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. - compute_full_tree : bool or 'auto', optional, default "auto" - Stop early the construction of the tree at n_clusters. This is - useful to decrease computation time if the number of clusters is - not small compared to the number of features. This option is - useful only when specifying a connectivity matrix. Note also that - when varying the number of clusters and using caching, it may - be advantageous to compute the full tree. It must be ``True`` if - ``distance_threshold`` is not ``None``. + compute_full_tree : 'auto' or bool, optional, default='auto' + Stop early the construction of the tree at n_clusters. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of features. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. - linkage : {"ward", "complete", "average", "single"}, optional\ - (default="ward") + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -956,12 +958,12 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): - single uses the minimum of the distances between all observations of the two sets. - pooling_func : callable, default np.mean + pooling_func : callable, default=np.mean This combines the values of agglomerated features into a single value, and should accept an array of shape [M, N] and the keyword argument `axis=1`, and reduce it to an array of size [M]. - distance_threshold : float, optional (default=None) + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. @@ -975,7 +977,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): ``distance_threshold=None``, it will be equal to the given ``n_clusters``. - labels_ : array-like, (n_features,) + labels_ : array-like of (n_features,) cluster labels for each feature. n_leaves_ : int @@ -984,7 +986,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_connected_components_ : int The estimated number of connected components in the graph. - children_ : array-like, shape (n_nodes-1, 2) + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. 
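For reference, a minimal sketch of the feature-agglomeration transformer whose parameters and attributes are documented in this hunk; the digits data and ``n_clusters=8`` are illustrative assumptions, not part of the patch::

    from sklearn.datasets import load_digits
    from sklearn.cluster import FeatureAgglomeration

    X, _ = load_digits(return_X_y=True)        # 64 pixel features per sample
    agglo = FeatureAgglomeration(n_clusters=8)
    X_reduced = agglo.fit_transform(X)         # features merged into 8 groups
    print(X.shape, X_reduced.shape)
    X_restored = agglo.inverse_transform(X_reduced)
    print(X_restored.shape)  # original column count, values pooled per cluster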
A node `i` greater than or equal to `n_features` is a non-leaf @@ -992,7 +994,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_features + i` - distances_ : array-like, shape (n_nodes-1,) + distances_ : array-like of shape (n_nodes-1,) Distances between nodes in the corresponding place in `children_`. Only computed if distance_threshold is not None. diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py index f470d61423b2c..71be86a087629 100644 --- a/sklearn/cluster/_k_means.py +++ b/sklearn/cluster/_k_means.py @@ -1336,12 +1336,13 @@ class MiniBatchKMeans(KMeans): Parameters ---------- - n_clusters : int, optional, default: 8 + n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. - init : {'k-means++', 'random' or an ndarray}, default: 'k-means++' - Method for initialization, defaults to 'k-means++': + init : {'k-means++', 'random'} or ndarray of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section @@ -1353,26 +1354,26 @@ class MiniBatchKMeans(KMeans): If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - max_iter : int, optional + max_iter : int, default=100 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, optional, default: 100 + batch_size : int, default=100 Size of the mini batches. - verbose : bool, optional + verbose : int, default=0 Verbosity mode. compute_labels : bool, default=True Compute label assignment and inertia for the complete dataset once the minibatch optimization has converged in fit. - random_state : int, RandomState instance or None (default) + random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization and random reassignment. Use an int to make the randomness deterministic. See :term:`Glossary `. - tol : float, default: 0.0 + tol : float, default=0.0 Control early stopping based on the relative center changes as measured by a smoothed, variance-normalized of the mean center squared position changes. This early stopping heuristics is @@ -1383,25 +1384,27 @@ class MiniBatchKMeans(KMeans): To disable convergence detection based on normalized center change, set tol to 0.0 (default). - max_no_improvement : int, default: 10 + max_no_improvement : int, default=10 Control early stopping based on the consecutive number of mini batches that does not yield an improvement on the smoothed inertia. To disable convergence detection based on inertia, set max_no_improvement to None. - init_size : int, optional, default: 3 * batch_size + init_size : int, default=None Number of samples to randomly sample for speeding up the initialization (sometimes at the expense of accuracy): the only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. + If `None`, `init_size= 3 * batch_size`. + n_init : int, default=3 Number of random initializations that are tried. In contrast to KMeans, the algorithm is only run once, using the best of the ``n_init`` initializations as measured by inertia. 
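To ground the batch-related parameters documented above, a brief sketch; the cluster count, batch size and synthetic data are merely illustrative::

    from sklearn.cluster import MiniBatchKMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)
    mbk = MiniBatchKMeans(n_clusters=4, batch_size=100,
                          max_no_improvement=10, random_state=0)
    mbk.fit(X)
    print(mbk.cluster_centers_.shape)  # (4, n_features)
    print(mbk.inertia_)                # sum of squared distances to centers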
- reassignment_ratio : float, default: 0.01 + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more easily reassigned, which means that the @@ -1411,10 +1414,10 @@ class MiniBatchKMeans(KMeans): Attributes ---------- - cluster_centers_ : array, [n_clusters, n_features] + cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers - labels_ : + labels_ : int Labels of each point (if compute_labels is set to True). inertia_ : float From 3c4d3f045a669df8e9df066edcfa172bd9aa35f0 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 11 Dec 2019 18:33:53 +0800 Subject: [PATCH 27/85] TST Specify random_state in test_cv_iterable_wrapper (#15829) --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5d3c41900472e..eb5eb192a6921 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1338,7 +1338,7 @@ def test_cv_iterable_wrapper(): list(kf_iter_wrapped.split(X, y))) # If the splits are randomized, successive calls to split yields different # results - kf_randomized_iter = KFold(shuffle=True).split(X, y) + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) # numpy's assert_array_equal properly compares nested lists np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)), From 697b605d376b5c4a46ba08d2b65c1f6adb694d83 Mon Sep 17 00:00:00 2001 From: Sambhav Kothari Date: Fri, 13 Dec 2019 00:44:59 +0000 Subject: [PATCH 28/85] DOC Include LinearSV{C, R} in models that support sample_weights (#15871) --- doc/modules/svm.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 03020cfd2252c..c5da9f09fa720 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -267,10 +267,11 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``. :scale: 75 -:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR` and -:class:`OneClassSVM` implement also weights for individual samples in method -``fit`` through keyword ``sample_weight``. Similar to ``class_weight``, these -set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``. +:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`, :class:`LinearSVC`, +:class:`LinearSVR` and :class:`OneClassSVM` implement also weights for +individual samples in method ``fit`` through keyword ``sample_weight``. Similar +to ``class_weight``, these set the parameter ``C`` for the i-th example to +``C * sample_weight[i]``. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_weighted_samples_001.png From 84feae2448f1eeebeff58b1794cca5034c61bb03 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Fri, 13 Dec 2019 11:51:17 +0800 Subject: [PATCH 29/85] DOC correct some indents (#15875) --- sklearn/exceptions.py | 4 ++-- sklearn/utils/_testing.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 590fa416dd807..c3da8e8d7986f 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -62,8 +62,8 @@ class ConvergenceWarning(UserWarning): ... [1, 0], ... [1, 0]]) # last point is duplicated >>> with warnings.catch_warnings(record=True) as w: - ... 
km = KMeans(n_clusters=4).fit(X) - ... print(w[-1].message) + ... km = KMeans(n_clusters=4).fit(X) + ... print(w[-1].message) Number of distinct clusters (3) found smaller than n_clusters (4). Possibly due to duplicate points in X. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 806f302b78288..7c00d56930273 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -276,8 +276,8 @@ def ignore_warnings(obj=None, category=Warning): ... warnings.warn('buhuhuhu') >>> def nasty_warn(): - ... warnings.warn('buhuhuhu') - ... print(42) + ... warnings.warn('buhuhuhu') + ... print(42) >>> ignore_warnings(nasty_warn)() 42 From be0fafb9c49917744aa61edcfbcd6f3bacbea008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Fri, 13 Dec 2019 23:10:17 +0100 Subject: [PATCH 30/85] DOC Fix documentation of default values in tree classes (#15870) --- sklearn/tree/_classes.py | 194 +++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 5bc66d59acc07..736049fafbd56 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -244,9 +244,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: - raise ValueError( - 'Invalid value for max_features. Allowed string ' - 'values are "auto", "sqrt" or "log2".') + raise ValueError("Invalid value for max_features. " + "Allowed string values are 'auto', " + "'sqrt' or 'log2'.") elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, numbers.Integral): @@ -401,12 +401,12 @@ def predict(self, X, check_input=True): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. @@ -445,8 +445,7 @@ def predict(self, X, check_input=True): return proba[:, :, 0] def apply(self, X, check_input=True): - """ - Return the index of the leaf that each sample is predicted as. + """Return the index of the leaf that each sample is predicted as. .. versionadded:: 0.17 @@ -457,13 +456,13 @@ def apply(self, X, check_input=True): ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- - X_leaves : array_like, shape = [n_samples,] + X_leaves : array-like of shape (n_samples,) For each datapoint x in X, return the index of the leaf x ends up in. Leaves are numbered within ``[0; self.tree_.node_count)``, possibly with gaps in the @@ -485,14 +484,14 @@ def decision_path(self, X, check_input=True): ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. 
Returns ------- - indicator : sparse csr array, shape = [n_samples, n_nodes] - Return a node indicator matrix where non zero elements + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator CSR matrix where non zero elements indicates that the samples goes through the nodes. """ X = self._validate_X_predict(X, check_input) @@ -570,7 +569,7 @@ def feature_importances_(self): Returns ------- - feature_importances_ : array, shape = [n_features] + feature_importances_ : ndarray of shape (n_features,) Normalized total reduction of critera by feature (Gini importance). """ check_is_fitted(self) @@ -589,21 +588,21 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): Parameters ---------- - criterion : str, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. - splitter : str, optional (default="best") + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -614,7 +613,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -629,12 +628,12 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default=None) + max_features : int, float or {"auto", "sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -650,18 +649,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. 
Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -689,9 +688,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - class_weight : dict, list of dicts, "balanced" or None, default=None + class_weight : dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. For + If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. @@ -715,7 +714,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. deprecated:: 0.22 - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -725,7 +724,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): Attributes ---------- - classes_ : array of shape (n_classes,) or a list of such arrays + classes_ : ndarray of shape (n_classes,) or list of ndarray The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). @@ -735,10 +734,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. - max_features_ : int, + max_features_ : int The inferred value of max_features. - n_classes_ : int or list + n_classes_ : int or list of int The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). @@ -749,7 +748,7 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -838,7 +837,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. @@ -853,11 +852,12 @@ def fit(self, X, y, sample_weight=None, check_input=True, ignored if they would result in any single class carrying a negative weight in either child node. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : array-like of shape (n_samples, n_features), optional + X_idx_sorted : array-like of shape (n_samples, n_features), \ + default=None The indexes of the sorted training input samples. 
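As a hedged illustration of the classifier parameters and attributes documented above (the iris data, ``max_depth=3`` and ``class_weight='balanced'`` are example choices, not prescriptions)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier(max_depth=3, class_weight='balanced',
                                 random_state=0)
    clf.fit(X_train, y_train)
    print(clf.classes_)              # class labels seen during fit
    print(clf.feature_importances_)  # impurity-based importances
    print(clf.score(X_test, y_test))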
If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. @@ -865,7 +865,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Returns ------- - self : object + self : DecisionTreeClassifier Fitted estimator. """ @@ -882,24 +882,21 @@ def predict_proba(self, X, check_input=True): The predicted class probability is the fraction of samples of the same class in a leaf. - check_input : boolean, (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. - check_input : bool - Run check_array on X. + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. Returns ------- - proba : array of shape (n_samples, n_classes), or a list of n_outputs \ - such arrays if n_outputs > 1. + proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ + such arrays if n_outputs > 1 The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -932,15 +929,15 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- - proba : array of shape (n_samples, n_classes), or a list of n_outputs \ - such arrays if n_outputs > 1. + proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \ + such arrays if n_outputs > 1 The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -963,7 +960,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : str, optional (default="mse") + criterion : {"mse", "friedman_mse", "mae"}, default="mse" The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss @@ -975,17 +972,17 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - splitter : str, optional (default="best") + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -996,7 +993,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. 
- min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1011,12 +1008,12 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default=None) + max_features : int, float or {"auto", "sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1032,18 +1029,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1076,7 +1073,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. deprecated:: 0.22 - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1093,7 +1090,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. - max_features_ : int, + max_features_ : int The inferred value of max_features. n_features_ : int @@ -1102,7 +1099,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1189,7 +1186,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Parameters ---------- - X : {array-like or sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. 
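A minimal regression sketch matching the parameters documented above; the toy 1-D data and the depth are purely illustrative assumptions::

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    X = np.sort(5 * rng.rand(80, 1), axis=0)   # a single sorted 1-D feature
    y = np.sin(X).ravel()
    # criterion is left at its default ("mse" in this release, per the docstring)
    reg = DecisionTreeRegressor(max_depth=4, random_state=0)
    reg.fit(X, y)
    print(reg.predict([[2.5]]))                # predicted target for x = 2.5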
@@ -1203,11 +1200,12 @@ def fit(self, X, y, sample_weight=None, check_input=True, that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. - check_input : bool, (default=True) + check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : array-like of shape (n_samples, n_features), optional + X_idx_sorted : array-like of shape (n_samples, n_features), \ + default=None The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. @@ -1215,7 +1213,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, Returns ------- - self : object + self : DecisionTreeRegressor Fitted estimator. """ @@ -1259,21 +1257,21 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Parameters ---------- - criterion : str, optional (default="gini") + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. - splitter : str, optional (default="random") + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1284,7 +1282,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1299,12 +1297,12 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default="auto") + max_features : int, float, {"auto", "sqrt", "log2"} or None, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1320,18 +1318,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. 
- random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1359,9 +1357,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier): ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - class_weight : dict, list of dicts, "balanced" or None, default=None + class_weight : dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. For + If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. @@ -1380,7 +1378,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1390,14 +1388,14 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Attributes ---------- - classes_ : array of shape (n_classes,) or a list of such arrays + classes_ : ndarray of shape (n_classes,) or list of ndarray The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). - max_features_ : int, + max_features_ : int The inferred value of max_features. - n_classes_ : int or list + n_classes_ : int or list of int The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). @@ -1412,7 +1410,7 @@ class ExtraTreeClassifier(DecisionTreeClassifier): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1420,8 +1418,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier): See Also -------- - ExtraTreeRegressor, sklearn.ensemble.ExtraTreesClassifier, - sklearn.ensemble.ExtraTreesRegressor + ExtraTreeRegressor : An extremely randomized tree regressor. + sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier. + sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor. Notes ----- @@ -1483,7 +1482,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Parameters ---------- - criterion : str, optional (default="mse") + criterion : {"mse", "friedman_mse", "mae"}, default="mse" The function to measure the quality of a split. 
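Since the ``See Also`` above points to the ensemble variants, here is a sketch of one common way a single extra-tree is used, namely inside a bagging ensemble; the dataset and ensemble size are illustrative assumptions::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import ExtraTreeClassifier

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    extra_tree = ExtraTreeClassifier(random_state=0)
    clf = BaggingClassifier(extra_tree, n_estimators=10, random_state=0)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))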
Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean @@ -1492,17 +1491,17 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. - splitter : str, optional (default="random") + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. - min_samples_split : int, float, optional (default=2) + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. @@ -1513,7 +1512,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_samples_leaf : int, float, optional (default=1) + min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and @@ -1528,12 +1527,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionchanged:: 0.18 Added float values for fractions. - min_weight_fraction_leaf : float, optional (default=0.) + min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float, str or None, optional (default="auto") + max_features : int, float, {"auto", "sqrt", "log2"} or None, default="auto" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. @@ -1549,13 +1548,13 @@ class ExtraTreeRegressor(DecisionTreeRegressor): valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int or RandomState, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - min_impurity_decrease : float, optional (default=0.) + min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. @@ -1583,12 +1582,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): ``min_impurity_split`` will change from 1e-7 to 0 in 0.23 and it will be removed in 0.25. Use ``min_impurity_decrease`` instead. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - ccp_alpha : non-negative float, optional (default=0.0) + ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. 
The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See @@ -1598,7 +1597,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Attributes ---------- - max_features_ : int, + max_features_ : int The inferred value of max_features. n_features_ : int @@ -1607,7 +1606,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): n_outputs_ : int The number of outputs when ``fit`` is performed. - tree_ : Tree object + tree_ : Tree The underlying Tree object. Please refer to ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` @@ -1615,8 +1614,9 @@ class ExtraTreeRegressor(DecisionTreeRegressor): See Also -------- - ExtraTreeClassifier, sklearn.ensemble.ExtraTreesClassifier, - sklearn.ensemble.ExtraTreesRegressor + ExtraTreeClassifier : An extremely randomized tree classifier. + sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier. + sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor. Notes ----- From 0277f3a1ac5f170de8c8e35787448e799208b395 Mon Sep 17 00:00:00 2001 From: wenliwyan <12013376+wenliwyan@users.noreply.github.com> Date: Sat, 14 Dec 2019 19:52:54 +0800 Subject: [PATCH 31/85] DOC fix typo in docstring (#15887) --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3a1fdadfb94b7..258440d20c836 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -558,7 +558,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Examples -------- - In the following example, we construct a NeighborsClassifier + In the following example, we construct a NearestNeighbors class from an array representing our data set and ask who's the closest point to [1,1,1] From 2f7afa3249605ce1d6b92d3f2146e5d280bc4da5 Mon Sep 17 00:00:00 2001 From: shivamgargsya Date: Sun, 15 Dec 2019 01:37:31 +0530 Subject: [PATCH 32/85] DOC FIX default value for xticks_rotation in plot_confusion_matrix (#15890) --- sklearn/metrics/_plot/confusion_matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index be59c8dd9a847..fd9cd48489029 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -56,7 +56,7 @@ def plot(self, include_values=True, cmap='viridis', Colormap recognized by matplotlib. xticks_rotation : {'vertical', 'horizontal'} or float, \ - default='vertical' + default='horizontal' Rotation of xtick labels. values_format : str, default=None @@ -160,7 +160,7 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, Includes values in confusion matrix. xticks_rotation : {'vertical', 'horizontal'} or float, \ - default='vertical' + default='horizontal' Rotation of xtick labels. 
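To see the corrected default in context, a brief sketch using the 0.22-era plotting helper touched by this patch; the toy dataset and classifier are illustrative assumptions::

    import matplotlib.pyplot as plt
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.metrics import plot_confusion_matrix

    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = SVC(random_state=0).fit(X_train, y_train)
    # xticks_rotation defaults to 'horizontal', as the fixed docstring states
    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()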
values_format : str, default=None From b09f905154594bc77dc96eff86b4bcda627d2d41 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 17 Dec 2019 21:09:49 -0500 Subject: [PATCH 33/85] Fix imports in pip3 ubuntu by suffixing affected files (#15891) --- sklearn/_build_utils/deprecated_modules.py | 8 ++++---- sklearn/cluster/__init__.py | 6 +++--- sklearn/cluster/{_hierarchical.py => _agglomerative.py} | 6 +++--- sklearn/cluster/{_k_means.py => _kmeans.py} | 0 sklearn/cluster/_spectral.py | 2 +- sklearn/cluster/tests/test_hierarchical.py | 5 +++-- sklearn/cluster/tests/test_k_means.py | 8 ++++---- sklearn/decomposition/__init__.py | 2 +- sklearn/decomposition/{_online_lda.py => _lda.py} | 0 sklearn/decomposition/tests/test_online_lda.py | 4 ++-- sklearn/feature_extraction/__init__.py | 2 +- sklearn/feature_extraction/{_hashing.py => _hash.py} | 0 sklearn/feature_extraction/text.py | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) rename sklearn/cluster/{_hierarchical.py => _agglomerative.py} (99%) rename sklearn/cluster/{_k_means.py => _kmeans.py} (100%) rename sklearn/decomposition/{_online_lda.py => _lda.py} (100%) rename sklearn/feature_extraction/{_hashing.py => _hash.py} (100%) diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 9ff7c7f224710..5f11ac1714022 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -47,9 +47,9 @@ 'SpectralBiclustering'), ('_birch', 'sklearn.cluster.birch', 'sklearn.cluster', 'Birch'), ('_dbscan', 'sklearn.cluster.dbscan_', 'sklearn.cluster', 'DBSCAN'), - ('_hierarchical', 'sklearn.cluster.hierarchical', 'sklearn.cluster', + ('_agglomerative', 'sklearn.cluster.hierarchical', 'sklearn.cluster', 'FeatureAgglomeration'), - ('_k_means', 'sklearn.cluster.k_means_', 'sklearn.cluster', 'KMeans'), + ('_kmeans', 'sklearn.cluster.k_means_', 'sklearn.cluster', 'KMeans'), ('_mean_shift', 'sklearn.cluster.mean_shift_', 'sklearn.cluster', 'MeanShift'), ('_optics', 'sklearn.cluster.optics_', 'sklearn.cluster', 'OPTICS'), @@ -101,7 +101,7 @@ ('_kernel_pca', 'sklearn.decomposition.kernel_pca', 'sklearn.decomposition', 'KernelPCA'), ('_nmf', 'sklearn.decomposition.nmf', 'sklearn.decomposition', 'NMF'), - ('_online_lda', 'sklearn.decomposition.online_lda', + ('_lda', 'sklearn.decomposition.online_lda', 'sklearn.decomposition', 'LatentDirichletAllocation'), ('_online_lda_fast', 'sklearn.decomposition.online_lda_fast', 'sklearn.decomposition', 'mean_change'), @@ -140,7 +140,7 @@ ('_dict_vectorizer', 'sklearn.feature_extraction.dict_vectorizer', 'sklearn.feature_extraction', 'DictVectorizer'), - ('_hashing', 'sklearn.feature_extraction.hashing', + ('_hash', 'sklearn.feature_extraction.hashing', 'sklearn.feature_extraction', 'FeatureHasher'), ('_stop_words', 'sklearn.feature_extraction.stop_words', 'sklearn.feature_extraction.text', 'ENGLISH_STOP_WORDS'), diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 2cdf4b074e1c3..5f3cc58507576 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -7,9 +7,9 @@ from ._mean_shift import (mean_shift, MeanShift, estimate_bandwidth, get_bin_seeds) from ._affinity_propagation import affinity_propagation, AffinityPropagation -from ._hierarchical import (ward_tree, AgglomerativeClustering, linkage_tree, - FeatureAgglomeration) -from ._k_means import k_means, KMeans, MiniBatchKMeans +from ._agglomerative import (ward_tree, AgglomerativeClustering, + linkage_tree, FeatureAgglomeration) 
+from ._kmeans import k_means, KMeans, MiniBatchKMeans from ._dbscan import dbscan, DBSCAN from ._optics import (OPTICS, cluster_optics_dbscan, compute_optics_graph, cluster_optics_xi) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_agglomerative.py similarity index 99% rename from sklearn/cluster/_hierarchical.py rename to sklearn/cluster/_agglomerative.py index d39fb7ba64f5d..e4fca1c0a572c 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_agglomerative.py @@ -23,7 +23,7 @@ from ._feature_agglomeration import AgglomerationTransform from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false -from ..utils import deprecated + ############################################################################### # For non fully-connected graphs @@ -247,8 +247,8 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): else: if n_clusters > n_samples: raise ValueError('Cannot provide more clusters than samples. ' - '%i n_clusters was asked, and there are %i samples.' - % (n_clusters, n_samples)) + '%i n_clusters was asked, and there are %i ' + 'samples.' % (n_clusters, n_samples)) n_nodes = 2 * n_samples - n_clusters # create inertia matrix diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_kmeans.py similarity index 100% rename from sklearn/cluster/_k_means.py rename to sklearn/cluster/_kmeans.py diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 78cdcc5073ccc..b6c5586e75b47 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,7 +15,7 @@ from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding -from ._k_means import k_means +from ._kmeans import k_means def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index bb93cb20395fd..50ef2f97114a1 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -22,8 +22,9 @@ from sklearn.cluster import ward_tree from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration -from sklearn.cluster._hierarchical import (_hc_cut, _TREE_BUILDERS, - linkage_tree, _fix_connectivity) +from sklearn.cluster._agglomerative import (_hc_cut, _TREE_BUILDERS, + linkage_tree, + _fix_connectivity) from sklearn.feature_extraction.image import grid_to_graph from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index ea7e5b7825437..50c91382d3117 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -22,8 +22,8 @@ from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._k_means import _labels_inertia -from sklearn.cluster._k_means import _mini_batch_step +from sklearn.cluster._kmeans import _labels_inertia +from sklearn.cluster._kmeans import _mini_batch_step from sklearn.datasets import make_blobs from io import StringIO from sklearn.metrics.cluster import homogeneity_score @@ -734,7 +734,7 @@ def test_k_means_function(): def test_x_squared_norms_init_centroids(): # Test that x_squared_norms can be None in _init_centroids - from sklearn.cluster._k_means import 
_init_centroids + from sklearn.cluster._kmeans import _init_centroids X_norms = np.sum(X**2, axis=1) precompute = _init_centroids( @@ -921,7 +921,7 @@ def test_sample_weight_length(): def test_check_normalize_sample_weight(): - from sklearn.cluster._k_means import _check_normalize_sample_weight + from sklearn.cluster._kmeans import _check_normalize_sample_weight sample_weight = None checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) assert _num_samples(X) == _num_samples(checked_sample_weight) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index 93a51b04f38d2..c8b93ea2e4f9c 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -16,7 +16,7 @@ MiniBatchDictionaryLearning, SparseCoder) from ._factor_analysis import FactorAnalysis from ..utils.extmath import randomized_svd -from ._online_lda import LatentDirichletAllocation +from ._lda import LatentDirichletAllocation __all__ = ['DictionaryLearning', 'FastICA', diff --git a/sklearn/decomposition/_online_lda.py b/sklearn/decomposition/_lda.py similarity index 100% rename from sklearn/decomposition/_online_lda.py rename to sklearn/decomposition/_lda.py diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index fdf993f2759bf..ca8392616e761 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -8,8 +8,8 @@ import pytest from sklearn.decomposition import LatentDirichletAllocation -from sklearn.decomposition._online_lda import (_dirichlet_expectation_1d, - _dirichlet_expectation_2d) +from sklearn.decomposition._lda import (_dirichlet_expectation_1d, + _dirichlet_expectation_2d) from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal diff --git a/sklearn/feature_extraction/__init__.py b/sklearn/feature_extraction/__init__.py index 2103fc67589c3..4591bfc6980c8 100644 --- a/sklearn/feature_extraction/__init__.py +++ b/sklearn/feature_extraction/__init__.py @@ -5,7 +5,7 @@ """ from ._dict_vectorizer import DictVectorizer -from ._hashing import FeatureHasher +from ._hash import FeatureHasher from .image import img_to_graph, grid_to_graph from . 
import text diff --git a/sklearn/feature_extraction/_hashing.py b/sklearn/feature_extraction/_hash.py similarity index 100% rename from sklearn/feature_extraction/_hashing.py rename to sklearn/feature_extraction/_hash.py diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index afc8ee4118cdc..07bf4d20b2b54 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -27,7 +27,7 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import normalize -from ._hashing import FeatureHasher +from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils import _IS_32BIT, deprecated From f035bac5a02232a73d7d1db5e8be7fe1988eb4ba Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 18 Dec 2019 15:40:03 +0800 Subject: [PATCH 34/85] MNT Raise erorr when normalize is invalid in confusion_matrix (#15888) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/metrics/_classification.py | 4 ++++ sklearn/metrics/_plot/confusion_matrix.py | 4 ---- sklearn/metrics/tests/test_classification.py | 7 +++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index af08b832e9f6f..19a8327783b20 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,6 +15,13 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- +:mod:`sklearn.metrics` +...................... + +- |Fix| :func:`metrics.plot_confusion_matrix` now raises error when `normalize` + is invalid. Previously, it runs fine with no normalization. + :pr:`15888` by `Hanmin Qin`_. + :mod:`sklearn.utils` .................... 
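For context, a minimal sketch of the behaviour this patch enforces, assuming a scikit-learn build that already contains the `confusion_matrix` change shown in the next diff (the validation now lives in `confusion_matrix` itself, so the plotting helper inherits it)::

    from sklearn.metrics import confusion_matrix

    y_true = [0, 0, 0, 0, 1, 1, 1, 1]
    y_pred = [0, 0, 0, 0, 0, 0, 0, 0]

    # The accepted values are 'true', 'pred', 'all' and None.
    confusion_matrix(y_true, y_pred, normalize='true')

    # Anything else (for instance the boolean True) now raises a ValueError
    # instead of silently returning unnormalized counts.
    try:
        confusion_matrix(y_true, y_pred, normalize=True)
    except ValueError as exc:
        print(exc)  # normalize must be one of {'true', 'pred', 'all', None}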
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 67f88177df4bd..c5ebf6f4cec10 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -283,6 +283,10 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, check_consistent_length(y_true, y_pred, sample_weight) + if normalize not in ['true', 'pred', 'all', None]: + raise ValueError("normalize must be one of {'true', 'pred', " + "'all', None}") + n_labels = labels.size label_to_ind = {y: x for x, y in enumerate(labels)} # convert yt, yp into index diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index fd9cd48489029..1c3fc2715ffb3 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -184,10 +184,6 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, if not is_classifier(estimator): raise ValueError("plot_confusion_matrix only supports classifiers") - if normalize not in {'true', 'pred', 'all', None}: - raise ValueError("normalize must be one of {'true', 'pred', " - "'all', None}") - y_pred = estimator.predict(X) cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 05aab00a1ce4f..e26e4e32c5e72 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -527,6 +527,13 @@ def test_confusion_matrix_normalize(normalize, cm_dtype, expected_results): assert cm.dtype.kind == cm_dtype +def test_confusion_matrix_normalize_wrong_option(): + y_test = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 0, 0, 0, 0, 0, 0, 0] + with pytest.raises(ValueError, match='normalize must be one of'): + confusion_matrix(y_test, y_pred, normalize=True) + + def test_confusion_matrix_normalize_single_class(): y_test = [0, 0, 0, 0, 1, 1, 1, 1] y_pred = [0, 0, 0, 0, 0, 0, 0, 0] From a7aedd5af5be40b0b4c6f199a40bdd472a32e660 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 18 Dec 2019 07:21:50 -0500 Subject: [PATCH 35/85] [MRG] DOC Increases search results for API object results (#15574) --- .../static/js/searchtools.js | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/themes/scikit-learn-modern/static/js/searchtools.js b/doc/themes/scikit-learn-modern/static/js/searchtools.js index ca53abe4f0038..0d4ca2328b079 100644 --- a/doc/themes/scikit-learn-modern/static/js/searchtools.js +++ b/doc/themes/scikit-learn-modern/static/js/searchtools.js @@ -11,7 +11,9 @@ * - Removes ajax call to get context for each result * - Adjusts Search.query to remove duplicates in search results. * - Adjusts Scorer to rank objects higher. - * - Adds Search._total_results to limit the number of search results. + * - Adds Search._total_non_object_results to limit the number of search non + * object results. Object results do not perform another GET resquest, so they + * are cheap to display. 
*/ if (!Scorer) { @@ -63,10 +65,10 @@ var Search = { _index: null, _queued_query: null, _pulse_status: -1, - _total_results: 10, + _total_non_object_results: 10, htmlToText: function (htmlString) { - var htmlString = htmlString.replace(//g, ""); + var htmlString = htmlString.replace(//g, ""); var htmlElement = document.createElement("span"); htmlElement.innerHTML = htmlString; $(htmlElement) @@ -218,22 +220,23 @@ var Search = { objectterms.slice(i + 1, objectterms.length) ); - if (results.length < this._total_results) { - results = $u.uniq(results.concat( - this.performObjectSearch(objectterms[i], others) - ), false, function (item) {return item[1]}); - } + results = $u.uniq(results.concat( + this.performObjectSearch(objectterms[i], others) + ), false, function (item) {return item[1]}); } - if (results.length < this._total_results) { - // lookup as search terms in fulltext - results = results.concat( - this.performTermsSearch(searchterms, excluded, terms, titleterms) - ); - } + var total_object_results = results.length; + + // lookup as search terms in fulltext + results = results.concat( + this.performTermsSearch(searchterms, excluded, terms, titleterms) + ); - if (results.length > this._total_results) { - results = results.slice(0, this._total_results); + // Only have _total_non_object_results results above the number of + // total number of object results + var results_limit = total_object_results + this._total_non_object_results + if (results.length > results_limit) { + results = results.slice(0, results_limit); } // let the scorer override scores with a custom scoring function From 098cd3a46d4ef16087a8e0befa12adc96309960a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 18 Dec 2019 15:23:02 -0500 Subject: [PATCH 36/85] MNT Ignores warning in pyamg for deprecated scipy.random (#15914) --- sklearn/cluster/tests/test_spectral.py | 4 ++++ sklearn/manifold/tests/test_spectral_embedding.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index dc79f427afcdf..f5591c7348ebe 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,6 +191,10 @@ def test_discretize(n_samples): assert adjusted_rand_score(y_true, y_pred) > 0.8 +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_clustering_with_arpack_amg_solvers(): # Test that spectral_clustering is the same for arpack and amg solver # Based on toy example from plot_segmentation_toy.py diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 295367a422f04..f99eae3783c05 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -181,6 +181,10 @@ def test_spectral_embedding_callable_affinity(X, seed=36): assert _check_with_col_sign_flipping(embed_rbf, embed_callable, 0.05) +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_embedding_amg_solver(seed=36): # Test spectral embedding with amg solver pytest.importorskip('pyamg') @@ -211,6 +215,10 @@ def test_spectral_embedding_amg_solver(seed=36): assert 
_check_with_col_sign_flipping(embed_amg, embed_arpack, 1e-5) +# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand +# https://github.com/scikit-learn/scikit-learn/issues/15913 +@pytest.mark.filterwarnings( + "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*") def test_spectral_embedding_amg_solver_failure(seed=36): # Test spectral embedding with amg solver failure, see issue #13393 pytest.importorskip('pyamg') From ce49cb48f87fb18daed9ff28e54c58f2360fc9bc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Dec 2019 21:29:16 +0100 Subject: [PATCH 37/85] DOC Instructions to troubleshoot Windows path length limit (#15916) --- doc/install.rst | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/install.rst b/doc/install.rst index 1e6ed734e1085..886ed272a65ba 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -126,7 +126,7 @@ If you have not installed NumPy or SciPy yet, you can also install these using conda or pip. When using pip, please ensure that *binary wheels* are used, and NumPy and SciPy are not recompiled from source, which can happen when using particular configurations of operating system and hardware (such as Linux on -a Raspberry Pi). +a Raspberry Pi). If you must install scikit-learn and its dependencies with pip, you can install it as ``scikit-learn[alldeps]``. @@ -255,3 +255,38 @@ WinPython for Windows The `WinPython `_ project distributes scikit-learn as an additional plugin. + + +Troubleshooting +=============== + +.. _windows_longpath: + +Error caused by file path length limit on Windows +------------------------------------------------- + +It can happen that pip fails to install packages when reaching the default path +size limit of Windows if Python is installed in a nested location such as the +`AppData` folder structure under the user home directory, for instance:: + + C:\Users\username>C:\Users\username\AppData\Local\Microsoft\WindowsApps\python.exe -m pip install scikit-learn + Collecting scikit-learn + ... + Installing collected packages: scikit-learn + ERROR: Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'C:\\Users\\username\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python37\\site-packages\\sklearn\\datasets\\tests\\data\\openml\\292\\api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz' + +In this case it is possible to lift that limit in the Windows registry by +using the ``regedit`` tool: + +#. Type "regedit" in the Windows start menu to launch ``regedit``. + +#. Go to the + ``Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem`` + key. + +#. Edit the value of the ``LongPathsEnabled`` property of that key and set + it to 1. + +#. 
Reinstall scikit-learn (ignoring the previous broken installation):: + + pip install --exists-action=i scikit-learn From 4676789a64db4506c57d2f893b580e3c9097a30b Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 19 Dec 2019 08:07:03 -0500 Subject: [PATCH 38/85] DOC add versionadded directive to some estimators (#15849) --- sklearn/cross_decomposition/_pls.py | 6 ++++++ sklearn/feature_extraction/image.py | 3 +++ sklearn/model_selection/_split.py | 2 ++ sklearn/pipeline.py | 2 ++ sklearn/preprocessing/_data.py | 2 ++ 5 files changed, 15 insertions(+) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 72ee5d4af6ba6..125c5946b1562 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -530,6 +530,8 @@ class PLSRegression(_PLS): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, (default 2) @@ -668,6 +670,8 @@ class PLSCanonical(_PLS): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, (default 2). @@ -810,6 +814,8 @@ class PLSSVD(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.8 + Parameters ---------- n_components : int, default 2 diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 2cec6739e7f98..d0da784c526d7 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -298,6 +298,7 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1): patches = as_strided(arr, shape=shape, strides=strides) return patches + @deprecated("The function feature_extraction.image.extract_patches has been " "deprecated in 0.22 and will be removed in 0.24.") def extract_patches(arr, patch_shape=8, extraction_step=1): @@ -483,6 +484,8 @@ class PatchExtractor(BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.9 + Parameters ---------- patch_size : tuple of ints (patch_height, patch_width) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9a85b4049a3c3..bb0643e8c8edb 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1816,6 +1816,8 @@ class PredefinedSplit(BaseCrossValidator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.16 + Parameters ---------- test_fold : array-like, shape (n_samples,) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e4ac9007fe8e5..453ce2228d50d 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -47,6 +47,8 @@ class Pipeline(_BaseComposition): Read more in the :ref:`User Guide `. + .. versionadded:: 0.5 + Parameters ---------- steps : list diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index ef8b9c6db9e3b..2b360af9ad40a 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2695,6 +2695,8 @@ class PowerTransformer(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. 
versionadded:: 0.20 + Parameters ---------- method : str, (default='yeo-johnson') From e93260a8fba455747aedf100ac97ebac194ee08b Mon Sep 17 00:00:00 2001 From: Oliver Urs Lenz Date: Thu, 19 Dec 2019 14:44:44 +0100 Subject: [PATCH 39/85] DOC clarify doc-string of roc_auc_score and add references (#15293) --- doc/modules/model_evaluation.rst | 16 +++-- sklearn/metrics/_ranking.py | 86 +++++++++++++++++---------- sklearn/metrics/tests/test_ranking.py | 2 +- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7af1e46578de6..24bf9541ebab4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1348,8 +1348,8 @@ the one-vs-rest algorithm computes the average of the ROC AUC scores for each class against all other classes. In both cases, the predicted labels are provided in an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular -class. The OvO and OvR algorithms supports weighting uniformly -(``average='macro'``) and weighting by the prevalence (``average='weighted'``). +class. The OvO and OvR algorithms support weighting uniformly +(``average='macro'``) and by prevalence (``average='weighted'``). **One-vs-one Algorithm**: Computes the average AUC of all possible pairwise combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted @@ -1380,10 +1380,10 @@ the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to ``'weighted'``. The ``'weighted'`` option returns a prevalence-weighted average as described in [FC2009]_. -**One-vs-rest Algorithm**: Computes the AUC of each class against the rest. -The algorithm is functionally the same as the multilabel case. To enable this -algorithm set the keyword argument ``multiclass`` to ``'ovr'``. Similar to -OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and +**One-vs-rest Algorithm**: Computes the AUC of each class against the rest +[PD2000]_. The algorithm is functionally the same as the multilabel case. To +enable this algorithm set the keyword argument ``multiclass`` to ``'ovr'``. +Like OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and ``'weighted'`` [F2001]_. In applications where a high false positive rate is not tolerable the parameter @@ -1422,6 +1422,10 @@ to the given limit. `_ Pattern Recognition Letters. 30. 27-38. + .. [PD2000] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper #IS-00-04, + Stern School of Business, New York University. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. `_ Pattern Recognition Letters, 27(8), pp. 861-874. diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 71731025e5649..e525539c0d706 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -248,27 +248,32 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). Read more in the :ref:`User Guide `. 
Parameters ---------- - y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. - The multiclass case expects shape = [n_samples] and labels - with values in ``range(n_classes)``. - - y_score : array, shape = [n_samples] or [n_samples, n_classes] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. The multiclass case expects shape = [n_samples, n_classes] - where the scores correspond to probability estimates. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. In the binary and multilabel cases, these can be either + probability estimates or non-thresholded decision values (as returned + by `decision_function` on some classifiers). In the multiclass case, + these must be probability estimates which sum to 1. The binary + case expects a shape (n_samples,), and the scores must be the scores of + the class with the greater label. The multiclass and multilabel + cases expect a shape (n_samples, n_classes). In the multiclass case, + the order of the class scores must correspond to the order of + ``labels``, if provided, or else to the numerical or lexicographical + order of the labels in ``y_true``. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: Note: multiclass ROC AUC currently only handles the 'macro' and @@ -291,26 +296,32 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, sample_weight : array-like of shape (n_samples,), default=None Sample weights. - max_fpr : float > 0 and <= 1, optional - If not ``None``, the standardized partial AUC [3]_ over the range + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. - multi_class : string, 'ovr' or 'ovo', optional(default='raise') - Determines the type of multiclass configuration to use. - ``multi_class`` must be provided when ``y_true`` is multiclass. + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Multiclass only. Determines the type of configuration to use. The + default value raises an error, so either ``'ovr'`` or ``'ovo'`` must be + passed explicitly. ``'ovr'``: - Calculate metrics for the multiclass case using the one-vs-rest - approach. + Computes the AUC of each class against the rest [3]_ [4]_. This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. ``'ovo'``: - Calculate metrics for the multiclass case using the one-vs-one - approach. 
+ Computes the average AUC of all possible pairwise combinations of + classes [5]_. Insensitive to class imbalance when + ``average == 'macro'``. - labels : array, shape = [n_classes] or None, optional (default=None) - List of labels to index ``y_score`` used for multiclass. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. + labels : array-like of shape (n_classes,), default=None + Multiclass only. List of labels that index the classes in ``y_score``. + If ``None``, the numerical or lexicographical order of the labels in + ``y_true`` is used. Returns ------- @@ -321,12 +332,22 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, .. [1] `Wikipedia entry for the Receiver operating characteristic `_ - .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition - Letters, 2006, 27(8):861-874. - - .. [3] `Analyzing a portion of the ROC curve. McClish, 1989 + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 `_ + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. + `_ + See also -------- average_precision_score : Area under the precision-recall curve @@ -341,7 +362,6 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> roc_auc_score(y_true, y_scores) 0.75 - """ y_type = type_of_target(y_true) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 7ce7cf3e3814e..4542b8e2a2964 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -554,7 +554,7 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): result_unweighted) # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm - # on the same input (Provost & Domingos, 2001) + # on the same input (Provost & Domingos, 2000) result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 assert_almost_equal( roc_auc_score( From accb491a14f5e667118e94a94f20be7d15470f02 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 09:51:22 -0500 Subject: [PATCH 40/85] MNT Adds skip lint to azure pipeline CI (#15904) --- azure-pipelines.yml | 10 ++++++++-- doc/developers/contributing.rst | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5f84a1ae94857..bf0cd41e1d366 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,8 +12,14 @@ jobs: - bash: conda create --name flake8_env --yes flake8 displayName: Install flake8 - bash: | - source activate flake8_env - ./build_tools/circle/linting.sh + if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then + # skip linting + echo "Skipping linting" + exit 0 + else + source activate flake8_env + ./build_tools/circle/linting.sh + fi displayName: Run linting diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 8d12187ade00b..6cca31e81813b 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -437,6 +437,7 @@ message, the following actions are taken. 
---------------------- ------------------- [scipy-dev] Add a Travis build with our dependencies (numpy, scipy, etc ...) development builds [ci skip] CI is skipped completely + [lint skip] Azure pipeline skips linting [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots [doc build] Docs built including example gallery plots From 2627d9e434dc571ad59cce222ca336ee27c5bec1 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 09:56:12 -0500 Subject: [PATCH 41/85] BLD Fixes bug when building with NO_MATHJAX=1 (#15892) --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index 0386f7676e0be..c4d7e578216fd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,6 +51,7 @@ if os.environ.get('NO_MATHJAX'): extensions.append('sphinx.ext.imgmath') imgmath_image_format = 'svg' + mathjax_path = '' else: extensions.append('sphinx.ext.mathjax') mathjax_path = ('https://cdn.jsdelivr.net/npm/mathjax@3/es5/' From 4dceec7085765c0b3728198e57cee873bac6db90 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 19 Dec 2019 12:05:17 -0500 Subject: [PATCH 42/85] [MRG] BUG Checks to number of axes in passed in ax more generically (#15760) --- doc/whats_new/v0.22.rst | 7 +++++ sklearn/inspection/_partial_dependence.py | 18 ++++++----- .../tests/test_plot_partial_dependence.py | 31 +++++++++++-------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 19a8327783b20..ae9cbbd74e313 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -28,6 +28,13 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`inspection.plot_partial_dependence` and + :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks + the number of axes passed in. :pr:`15760` by `Thomas Fan`_. + .. 
_changes_0_22: Version 0.22.0 diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 40a7f073ca818..12233a766969c 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -655,10 +655,12 @@ def convert_feature(fx): features = tmp_features - if isinstance(ax, list): - if len(ax) != len(features): - raise ValueError("Expected len(ax) == len(features), " - "got len(ax) = {}".format(len(ax))) + # Early exit if the axes does not have the correct number of axes + if ax is not None and not isinstance(ax, plt.Axes): + axes = np.asarray(ax, dtype=object) + if axes.size != len(features): + raise ValueError("Expected ax to have {} axes, got {}".format( + len(features), axes.size)) for i in chain.from_iterable(features): if i >= len(feature_names): @@ -886,16 +888,16 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): axes_ravel[i] = self.figure_.add_subplot(spec) else: # array-like - ax = check_array(ax, dtype=object, ensure_2d=False) + ax = np.asarray(ax, dtype=object) + if ax.size != n_features: + raise ValueError("Expected ax to have {} axes, got {}" + .format(n_features, ax.size)) if ax.ndim == 2: n_cols = ax.shape[1] else: n_cols = None - if ax.ndim == 1 and ax.shape[0] != n_features: - raise ValueError("Expected len(ax) == len(features), " - "got len(ax) = {}".format(len(ax))) self.bounding_ax_ = None self.figure_ = ax.ravel()[0].figure self.axes_ = ax diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 222ab0fc45ccd..abae91d4d2642 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -222,26 +222,31 @@ def test_plot_partial_dependence_passing_numpy_axes(pyplot, clf_boston, assert len(disp2.axes_[0, 1].get_lines()) == 2 +@pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)]) def test_plot_partial_dependence_incorrent_num_axes(pyplot, clf_boston, - boston): - grid_resolution = 25 - fig, (ax1, ax2, ax3) = pyplot.subplots(1, 3) + boston, nrows, ncols): + grid_resolution = 5 + fig, axes = pyplot.subplots(nrows, ncols) + axes_formats = [list(axes.ravel()), tuple(axes.ravel()), axes] - msg = r"Expected len\(ax\) == len\(features\), got len\(ax\) = 3" - with pytest.raises(ValueError, match=msg): - plot_partial_dependence(clf_boston, boston.data, - ['CRIM', ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names, - ax=[ax1, ax2, ax3]) + msg = "Expected ax to have 2 axes, got {}".format(nrows * ncols) disp = plot_partial_dependence(clf_boston, boston.data, - ['CRIM', ('CRIM', 'ZN')], + ['CRIM', 'ZN'], grid_resolution=grid_resolution, feature_names=boston.feature_names) - with pytest.raises(ValueError, match=msg): - disp.plot(ax=[ax1, ax2, ax3]) + for ax_format in axes_formats: + with pytest.raises(ValueError, match=msg): + plot_partial_dependence(clf_boston, boston.data, + ['CRIM', 'ZN'], + grid_resolution=grid_resolution, + feature_names=boston.feature_names, + ax=ax_format) + + # with axes object + with pytest.raises(ValueError, match=msg): + disp.plot(ax=ax_format) def test_plot_partial_dependence_with_same_axes(pyplot, clf_boston, boston): From 27f36212ba2270699222b96938ddf4254c3b15a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 19 Dec 2019 21:52:06 +0100 Subject: [PATCH 43/85] EXA Minor fixes in plot_sparse_logistic_regression_20newsgroups.py (#15925) 
--- .../plot_sparse_logistic_regression_20newsgroups.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py index 78fdc64684550..7bfad99d991c5 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py +++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py @@ -1,7 +1,7 @@ """ -===================================================== -Multiclass sparse logisitic regression on newgroups20 -===================================================== +==================================================== +Multiclass sparse logistic regression on 20newgroups +==================================================== Comparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression to classify documents from the newgroups20 dataset. Multinomial logistic @@ -42,7 +42,6 @@ # Turn down for faster run time n_samples = 10000 -# Memorized fetch_rcv1 for faster access X, y = fetch_20newsgroups_vectorized('all', return_X_y=True) X = X[:n_samples] y = y[:n_samples] From 50d2664a22d45c14698b967cef2973a432789ff9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 19 Dec 2019 22:10:56 +0100 Subject: [PATCH 44/85] BUG Do not shadow public functions with deprecated modules (#15846) --- sklearn/decomposition/__init__.py | 38 +++++++++++++------- sklearn/inspection/__init__.py | 21 ++++++++--- sklearn/tests/test_import_deprecations.py | 6 ++++ sklearn/utils/tests/test_deprecated_utils.py | 35 ++++++++++++++++++ 4 files changed, 84 insertions(+), 16 deletions(-) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index c8b93ea2e4f9c..42f661171eafe 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -4,19 +4,33 @@ this module can be regarded as dimensionality reduction techniques. """ -from ._nmf import NMF, non_negative_factorization -from ._pca import PCA -from ._incremental_pca import IncrementalPCA -from ._kernel_pca import KernelPCA -from ._sparse_pca import SparsePCA, MiniBatchSparsePCA -from ._truncated_svd import TruncatedSVD -from ._fastica import FastICA, fastica -from ._dict_learning import (dict_learning, dict_learning_online, +# TODO: remove me in 0.24 (as well as the noqa markers) and +# import the dict_learning func directly from the ._dict_learning +# module instead. +# Pre-cache the import of the deprecated module so that import +# sklearn.decomposition.dict_learning returns the function as in +# 0.21, instead of the module. 
+# https://github.com/scikit-learn/scikit-learn/issues/15842 +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from .dict_learning import dict_learning + + +from ._nmf import NMF, non_negative_factorization # noqa +from ._pca import PCA # noqa +from ._incremental_pca import IncrementalPCA # noqa +from ._kernel_pca import KernelPCA # noqa +from ._sparse_pca import SparsePCA, MiniBatchSparsePCA # noqa +from ._truncated_svd import TruncatedSVD # noqa +from ._fastica import FastICA, fastica # noqa +from ._dict_learning import (dict_learning_online, sparse_encode, DictionaryLearning, - MiniBatchDictionaryLearning, SparseCoder) -from ._factor_analysis import FactorAnalysis -from ..utils.extmath import randomized_svd -from ._lda import LatentDirichletAllocation + MiniBatchDictionaryLearning, SparseCoder) # noqa +from ._factor_analysis import FactorAnalysis # noqa +from ..utils.extmath import randomized_svd # noqa +from ._lda import LatentDirichletAllocation # noqa + __all__ = ['DictionaryLearning', 'FastICA', diff --git a/sklearn/inspection/__init__.py b/sklearn/inspection/__init__.py index 04d9d84ecaf02..904d16d74b016 100644 --- a/sklearn/inspection/__init__.py +++ b/sklearn/inspection/__init__.py @@ -1,8 +1,21 @@ """The :mod:`sklearn.inspection` module includes tools for model inspection.""" -from ._partial_dependence import partial_dependence -from ._partial_dependence import plot_partial_dependence -from ._partial_dependence import PartialDependenceDisplay -from ._permutation_importance import permutation_importance + +# TODO: remove me in 0.24 (as well as the noqa markers) and +# import the partial_dependence func directly from the +# ._partial_dependence module instead. +# Pre-cache the import of the deprecated module so that import +# sklearn.inspection.partial_dependence returns the function as in +# 0.21, instead of the module +# https://github.com/scikit-learn/scikit-learn/issues/15842 +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from .partial_dependence import partial_dependence + +from ._partial_dependence import plot_partial_dependence # noqa +from ._partial_dependence import PartialDependenceDisplay # noqa +from ._permutation_importance import permutation_importance # noqa + __all__ = [ 'partial_dependence', diff --git a/sklearn/tests/test_import_deprecations.py b/sklearn/tests/test_import_deprecations.py index 13b31e89b2862..29c4259fe1e5a 100644 --- a/sklearn/tests/test_import_deprecations.py +++ b/sklearn/tests/test_import_deprecations.py @@ -24,6 +24,12 @@ def test_import_is_deprecated(deprecated_path, importee): # TODO: remove in 0.24 + # Special case for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + if deprecated_path in ("sklearn.decomposition.dict_learning", + "sklearn.inspection.partial_dependence"): + pytest.skip("No warning can be raised for " + deprecated_path) + expected_message = ( "The {deprecated_path} module is deprecated in version " "0.22 and will be removed in version 0.24. 
" diff --git a/sklearn/utils/tests/test_deprecated_utils.py b/sklearn/utils/tests/test_deprecated_utils.py index da41e7e44ddb3..08bd95aacc284 100644 --- a/sklearn/utils/tests/test_deprecated_utils.py +++ b/sklearn/utils/tests/test_deprecated_utils.py @@ -1,7 +1,10 @@ import pytest +import types import numpy as np +import warnings from sklearn.dummy import DummyClassifier +from sklearn.utils import all_estimators from sklearn.utils.estimator_checks import choose_check_classifiers_labels from sklearn.utils.estimator_checks import NotAnArray from sklearn.utils.estimator_checks import enforce_estimator_tags_y @@ -94,3 +97,35 @@ def test_safe_indexing(): with pytest.warns(FutureWarning, match="removed in version 0.24"): safe_indexing([1, 2], 0) + + +# TODO: remove in 0.24 +def test_partial_dependence_no_shadowing(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from sklearn.inspection.partial_dependence import partial_dependence as _ # noqa + + # Calling all_estimators() also triggers a recursive import of all + # submodules, including deprecated ones. + all_estimators() + + from sklearn.inspection import partial_dependence + assert isinstance(partial_dependence, types.FunctionType) + + +# TODO: remove in 0.24 +def test_dict_learning_no_shadowing(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15842 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + from sklearn.decomposition.dict_learning import dict_learning as _ # noqa + + # Calling all_estimators() also triggers a recursive import of all + # submodules, including deprecated ones. + all_estimators() + + from sklearn.decomposition import dict_learning + assert isinstance(dict_learning, types.FunctionType) From 188e80d89e113e7d3202ed6dddb9c355dcb58d70 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 20 Dec 2019 01:23:42 +0100 Subject: [PATCH 45/85] Import sklearn._distributor_init first (#15929) --- sklearn/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index aee8858c1558e..806a317bd03b5 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -70,12 +70,18 @@ # We are not importing the rest of scikit-learn during the build # process, as it may not be compiled yet else: - from . import __check_build + # `_distributor_init` allows distributors to run custom init code. + # For instance, for the Windows wheel, this is used to pre-load the + # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs + # sub-folder. + # It is necessary to do this prior to importing show_versions as the + # later is linked to the OpenMP runtime to make it possible to introspect + # it and importing it first would fail if the OpenMP dll cannot be found. + from . import _distributor_init # noqa: F401 + from . import __check_build # noqa: F401 from .base import clone from .utils._show_versions import show_versions - __check_build # avoid flakes unused variable error - __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'experimental', 'externals', 'feature_extraction', @@ -90,9 +96,6 @@ 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] - # Allow distributors to run custom init code - from . 
import _distributor_init # noqa: F401 - def setup_module(module): """Fixture for the tests to assure globally controllable seeding of RNGs""" From 8e2ea2b5b666e505bc455fc39e865579c1fc8009 Mon Sep 17 00:00:00 2001 From: Brian Wignall Date: Thu, 19 Dec 2019 19:26:02 -0500 Subject: [PATCH 46/85] DOC Fix typos, via a Levenshtein-style corrector (#15923) --- azure-pipelines.yml | 2 +- benchmarks/bench_plot_randomized_svd.py | 2 +- benchmarks/bench_text_vectorizers.py | 2 +- build_tools/azure/install.sh | 2 +- doc/developers/advanced_installation.rst | 2 +- doc/developers/contributing.rst | 2 +- doc/developers/maintainer.rst | 2 +- doc/glossary.rst | 2 +- doc/modules/clustering.rst | 2 +- doc/themes/scikit-learn-modern/javascript.html | 2 +- examples/manifold/plot_t_sne_perplexity.py | 2 +- examples/model_selection/plot_roc.py | 2 +- examples/plot_changed_only_pprint_parameter.py | 2 +- examples/plot_roc_curve_visualization_api.py | 2 +- sklearn/decomposition/_incremental_pca.py | 2 +- sklearn/decomposition/_lda.py | 2 +- sklearn/dummy.py | 2 +- sklearn/ensemble/_gb.py | 2 +- sklearn/ensemble/_hist_gradient_boosting/loss.py | 2 +- sklearn/ensemble/_stacking.py | 2 +- sklearn/ensemble/tests/test_weight_boosting.py | 2 +- sklearn/externals/joblib/numpy_pickle.py | 2 +- sklearn/metrics/_plot/precision_recall_curve.py | 2 +- sklearn/metrics/_regression.py | 2 +- sklearn/neighbors/_quad_tree.pyx | 2 +- sklearn/pipeline.py | 2 +- sklearn/svm/src/liblinear/liblinear_helper.c | 2 +- sklearn/svm/src/libsvm/svm.cpp | 4 ++-- sklearn/svm/src/libsvm/svm.h | 4 ++-- sklearn/tree/_classes.py | 3 ++- sklearn/utils/_testing.py | 2 +- sklearn/utils/deprecation.py | 2 +- sklearn/utils/graph_shortest_path.pyx | 2 +- 33 files changed, 36 insertions(+), 35 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bf0cd41e1d366..e2ff71802ce72 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,7 +74,7 @@ jobs: # It runs tests requiring pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # FIXME: pinned until SciPy wheels are available for Pyhon 3.8 + # FIXME: pinned until SciPy wheels are available for Python 3.8 PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index 0fe050eaf7e30..e322cda8e87e9 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -104,7 +104,7 @@ # in case the reconstructed (dense) matrix is too large MAX_MEMORY = np.int(2e9) -# The following datasets can be dowloaded manually from: +# The following datasets can be downloaded manually from: # CIFAR 10: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz # SVHN: http://ufldl.stanford.edu/housenumbers/train_32x32.mat CIFAR_FOLDER = "./cifar-10-batches-py/" diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py index 196e677e9b49c..96dbc04312291 100644 --- a/benchmarks/bench_text_vectorizers.py +++ b/benchmarks/bench_text_vectorizers.py @@ -32,7 +32,7 @@ def f(): text = fetch_20newsgroups(subset='train').data[:1000] print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n') -print("Using a subset of the 20 newsrgoups dataset ({} documents)." +print("Using a subset of the 20 newsgroups dataset ({} documents)." 
.format(len(text))) print("This benchmarks runs in ~1 min ...") diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index f4f60df8a9626..aa2707bb03837 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -13,7 +13,7 @@ make_conda() { version_ge() { # The two version numbers are separated with a new line is piped to sort # -rV. The -V activates for version number sorting and -r sorts in - # decending order. If the first argument is the top element of the sort, it + # descending order. If the first argument is the top element of the sort, it # is greater than or equal to the second argument. test "$(printf "${1}\n${2}" | sort -rV | head -n 1)" == "$1" } diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index c58eb14e828d2..d52dfba48c0e1 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -49,7 +49,7 @@ feature, code or documentation improvement). If you plan on submitting a pull-request, you should clone from your fork instead. -#. Install a compiler with OpenMP_ support for your platform. See intructions +#. Install a compiler with OpenMP_ support for your platform. See instructions for :ref:`compiler_windows`, :ref:`compiler_macos`, :ref:`compiler_linux` and :ref:`compiler_freebsd`. diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 6cca31e81813b..16adf4a607d90 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -377,7 +377,7 @@ complies with the following rules before marking a PR as ``[MRG]``. The methods available in scikit-learn. 10. New features often need to be illustrated with narrative documentation in - the user guide, with small code snipets. If relevant, please also add + the user guide, with small code snippets. If relevant, please also add references in the literature, with PDF links when possible. 11. The user guide should also include expected time and space complexity diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index e91f01999b12e..66d5250af1644 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -62,7 +62,7 @@ Making a release 2. On the branch for releasing, update the version number in sklearn/__init__.py, the ``__version__`` variable by removing ``dev*`` only when ready to release. - On master, increment the verson in the same place (when branching for + On master, increment the version in the same place (when branching for release). 3. Create the tag and push it:: diff --git a/doc/glossary.rst b/doc/glossary.rst index 3a01b76a45781..e259fa69745bc 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1161,7 +1161,7 @@ Methods TODO: `This gist `_ - higlights the use of the different formats for multilabel. + highlights the use of the different formats for multilabel. multioutput classification A list of 2d arrays, corresponding to each multiclass decision function. diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 93f87989ab233..a9232ea844747 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -775,7 +775,7 @@ core sample, and is at least ``eps`` in distance from any core sample, is considered an outlier by the algorithm. 
While the parameter ``min_samples`` primarily controls how tolerant the -algorithm is towards noise (on noisy and large data sets it may be desiable +algorithm is towards noise (on noisy and large data sets it may be desirable to increase this parameter), the parameter ``eps`` is *crucial to choose appropriately* for the data set and distance function and usually cannot be left at the default value. It controls the local neighborhood of the points. diff --git a/doc/themes/scikit-learn-modern/javascript.html b/doc/themes/scikit-learn-modern/javascript.html index bdeab8abb9f42..fc0dca1040e03 100644 --- a/doc/themes/scikit-learn-modern/javascript.html +++ b/doc/themes/scikit-learn-modern/javascript.html @@ -114,7 +114,7 @@ prevScrollpos = lastScrollTop; }; - /*** high preformance scroll event listener***/ + /*** high performance scroll event listener***/ var raf = window.requestAnimationFrame || window.webkitRequestAnimationFrame || window.mozRequestAnimationFrame || diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py index cda936cf72142..dd7b4d1f21a09 100644 --- a/examples/manifold/plot_t_sne_perplexity.py +++ b/examples/manifold/plot_t_sne_perplexity.py @@ -6,7 +6,7 @@ An illustration of t-SNE on the two concentric circles and the S-curve datasets for different perplexity values. -We observe a tendency towards clearer shapes as the preplexity value increases. +We observe a tendency towards clearer shapes as the perplexity value increases. The size, the distance and the shape of clusters may vary upon initialization, perplexity values and does not always convey a meaning. diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index d995c5c653ce4..d32ab06f7bf25 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -151,7 +151,7 @@ # ......................................... # The :func:`sklearn.metrics.roc_auc_score` function can be used for # multi-class classification. The multi-class One-vs-One scheme compares every -# unique pairwise combination of classes. In this section, we calcuate the AUC +# unique pairwise combination of classes. In this section, we calculate the AUC # using the OvR and OvO schemes. We report a macro average, and a # prevalence-weighted average. y_prob = classifier.predict_proba(X_test) diff --git a/examples/plot_changed_only_pprint_parameter.py b/examples/plot_changed_only_pprint_parameter.py index 1a687cff046d8..a35471105b6c1 100644 --- a/examples/plot_changed_only_pprint_parameter.py +++ b/examples/plot_changed_only_pprint_parameter.py @@ -5,7 +5,7 @@ This example illustrates the use of the print_changed_only global parameter. -Setting print_changed_only to True will alterate the representation of +Setting print_changed_only to True will alternate the representation of estimators to only show the parameters that have been set to non-default values. This can be used to have more compact representations. """ diff --git a/examples/plot_roc_curve_visualization_api.py b/examples/plot_roc_curve_visualization_api.py index 55dec5649beeb..67592c12ec845 100644 --- a/examples/plot_roc_curve_visualization_api.py +++ b/examples/plot_roc_curve_visualization_api.py @@ -44,7 +44,7 @@ # We train a random forest classifier and create a plot comparing it to the SVC # ROC curve. Notice how `svc_disp` uses # :func:`~sklearn.metrics.RocCurveDisplay.plot` to plot the SVC ROC curve -# without recomputing the values of the roc curve itself. 
Futhermore, we +# without recomputing the values of the roc curve itself. Furthermore, we # pass `alpha=0.8` to the plot functions to adjust the alpha values of the # curves. rfc = RandomForestClassifier(n_estimators=10, random_state=42) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 9fc0936b880cc..fe7c57c61999a 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -270,7 +270,7 @@ def partial_fit(self, X, y=None, check_input=True): self.mean_ = .0 self.var_ = .0 - # Update stats - they are 0 if this is the fisrt step + # Update stats - they are 0 if this is the first step col_mean, col_var, n_total_samples = \ _incremental_mean_and_var( X, last_mean=self.mean_, last_variance=self.var_, diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index c10bad994d9cf..8fcb51896d190 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -193,7 +193,7 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): evaluate_every : int, optional (default=0) How often to evaluate perplexity. Only used in `fit` method. - set it to 0 or negative number to not evalute perplexity in + set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 04322f0fc3bd1..3c7d3286dd85b 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -50,7 +50,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): .. versionchanged:: 0.22 The default value of `strategy` will change to "prior" in version 0.24. Starting from version 0.22, a warning will be raised if - `strategy` is not explicity set. + `strategy` is not explicitly set. .. versionadded:: 0.17 Dummy Classifier now supports prior fitting strategy using diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 6488d5dd0e776..830e27599f4c8 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1636,7 +1636,7 @@ def _make_estimator(self, append=True): raise NotImplementedError() def _raw_predict_init(self, X): - """Check input and compute raw predictions of the init estimtor.""" + """Check input and compute raw predictions of the init estimator.""" self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) if X.shape[1] != self.n_features_: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index bcfec023b5571..dae85f57134e4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -154,7 +154,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, class LeastAbsoluteDeviation(BaseLoss): - """Least asbolute deviation, for regression. + """Least absolute deviation, for regression. For a given sample x_i, the loss is defined as:: diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index c18291b9f4461..2fe284253ccc9 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -63,7 +63,7 @@ def _concatenate_predictions(self, X, predictions): and `self.passthrough` is True, the output of `transform` will be sparse. 
- This helper is in charge of ensuring the preditions are 2D arrays and + This helper is in charge of ensuring the predictions are 2D arrays and it will drop one of the probability column when using probabilities in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1) """ diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index d225f2c78c2b1..c71329be9ec71 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -526,7 +526,7 @@ def test_adaboostregressor_sample_weight(): X[-1] *= 10 y[-1] = 10000 - # random_state=0 ensure that the underlying boostrap will use the outlier + # random_state=0 ensure that the underlying bootstrap will use the outlier regr_no_outlier = AdaBoostRegressor( base_estimator=LinearRegression(), n_estimators=1, random_state=0 ) diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py index 7a4a2885c9f15..e79a0e1c5c056 100644 --- a/sklearn/externals/joblib/numpy_pickle.py +++ b/sklearn/externals/joblib/numpy_pickle.py @@ -1,3 +1,3 @@ -# Import necessary to preserve backward compatibliity of pickles +# Import necessary to preserve backward compatibility of pickles from joblib.numpy_pickle import * diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index d515b9aa86b1d..b16fc96e857cd 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -97,7 +97,7 @@ def plot(self, ax=None, name=None, **kwargs): def plot_precision_recall_curve(estimator, X, y, sample_weight=None, response_method="auto", name=None, ax=None, **kwargs): - """Plot Precision Recall Curve for binary classifers. + """Plot Precision Recall Curve for binary classifiers. Extra keyword arguments will be passed to matplotlib's `plot`. diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6c3c83a0c0c7c..7c115928a1340 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -801,7 +801,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): Gamma deviance is equivalent to the Tweedie deviance with the power parameter `p=2`. It is invariant to scaling of - the target variable, and mesures relative errors. + the target variable, and measures relative errors. Read more in the :ref:`User Guide `. 
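As an aside on the docstring touched up just above, a small sketch of the scale invariance it describes, assuming scikit-learn 0.22 where `mean_gamma_deviance` is available from `sklearn.metrics`::

    import numpy as np
    from sklearn.metrics import mean_gamma_deviance

    y_true = np.array([2.0, 0.5, 1.0, 4.0])
    y_pred = np.array([0.5, 0.5, 2.0, 2.0])

    # Rescaling targets and predictions by the same positive factor leaves the
    # Gamma deviance unchanged, which is why it measures relative errors.
    d1 = mean_gamma_deviance(y_true, y_pred)
    d2 = mean_gamma_deviance(10 * y_true, 10 * y_pred)
    assert np.isclose(d1, d2)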
diff --git a/sklearn/neighbors/_quad_tree.pyx b/sklearn/neighbors/_quad_tree.pyx index af510c317d639..5623799124f7c 100644 --- a/sklearn/neighbors/_quad_tree.pyx +++ b/sklearn/neighbors/_quad_tree.pyx @@ -97,7 +97,7 @@ cdef class _QuadTree: return self._get_cell_ndarray()['is_leaf'][:self.cell_count] def build_tree(self, X): - """Build a tree from an arary of points X.""" + """Build a tree from an array of points X.""" cdef: int i DTYPE_t[3] pt diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 453ce2228d50d..af2feed1a861e 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -307,7 +307,7 @@ def _fit(self, X, y=None, **fit_params): cloned_transformer = clone(transformer) else: cloned_transformer = clone(transformer) - # Fit or load from cache the current transfomer + # Fit or load from cache the current transformer X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, y, None, message_clsname='Pipeline', diff --git a/sklearn/svm/src/liblinear/liblinear_helper.c b/sklearn/svm/src/liblinear/liblinear_helper.c index ffcbe86e01035..86d88e7da9273 100644 --- a/sklearn/svm/src/liblinear/liblinear_helper.c +++ b/sklearn/svm/src/liblinear/liblinear_helper.c @@ -172,7 +172,7 @@ struct problem * csr_set_problem (char *X, int double_precision_X, } -/* Create a paramater struct with and return it */ +/* Create a parameter struct with and return it */ struct parameter *set_parameter(int solver_type, double eps, double C, npy_intp nr_weight, char *weight_label, char *weight, int max_iter, unsigned seed, diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index 8bf3aa42ed488..9321340acaaed 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -923,7 +923,7 @@ int Solver::select_working_set(int &out_i, int &out_j) // return i,j such that // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) // j: minimizes the decrease of obj value - // (if quadratic coefficeint <= 0, replace it with tau) + // (if quadratic coefficient <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) double Gmax = -INF; @@ -1166,7 +1166,7 @@ int Solver_NU::select_working_set(int &out_i, int &out_j) // return i,j such that y_i = y_j and // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) // j: minimizes the decrease of obj value - // (if quadratic coefficeint <= 0, replace it with tau) + // (if quadratic coefficient <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) double Gmaxp = -INF; diff --git a/sklearn/svm/src/libsvm/svm.h b/sklearn/svm/src/libsvm/svm.h index 2187e3df2916f..4002a77c93ac4 100644 --- a/sklearn/svm/src/libsvm/svm.h +++ b/sklearn/svm/src/libsvm/svm.h @@ -79,7 +79,7 @@ struct svm_model int *sv_ind; /* index of support vectors */ double *rho; /* constants in decision functions (rho[k*(k-1)/2]) */ - double *probA; /* pariwise probability information */ + double *probA; /* pairwise probability information */ double *probB; /* for classification only */ @@ -104,7 +104,7 @@ struct svm_csr_model int *sv_ind; /* index of support vectors */ double *rho; /* constants in decision functions (rho[k*(k-1)/2]) */ - double *probA; /* pariwise probability information */ + double *probA; /* pairwise probability information */ double *probB; /* for classification only */ diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 736049fafbd56..07d1ee11e74a3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -570,7 +570,8 @@ def feature_importances_(self): 
Returns ------- feature_importances_ : ndarray of shape (n_features,) - Normalized total reduction of critera by feature (Gini importance). + Normalized total reduction of criteria by feature + (Gini importance). """ check_is_fitted(self) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 7c00d56930273..9695abf78a2b2 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -482,7 +482,7 @@ def all_estimators(include_meta_estimators=None, ------- estimators : list of tuples List of (name, class), where ``name`` is the class name as string - and ``class`` is the actuall type of the class. + and ``class`` is the actual type of the class. """ def is_abstract(c): if not(hasattr(c, '__abstractmethods__')): diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index 2a33f34dfd2b8..c14968cafde32 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -114,7 +114,7 @@ def _update_doc(self, olddoc): def _is_deprecated(func): - """Helper to check if func is wraped by our deprecated decorator""" + """Helper to check if func is wrapped by our deprecated decorator""" closures = getattr(func, '__closure__', []) if closures is None: closures = [] diff --git a/sklearn/utils/graph_shortest_path.pyx b/sklearn/utils/graph_shortest_path.pyx index 30cbec1d5d471..7d2e74127f153 100644 --- a/sklearn/utils/graph_shortest_path.pyx +++ b/sklearn/utils/graph_shortest_path.pyx @@ -215,7 +215,7 @@ cdef np.ndarray dijkstra(dist_matrix, graph, &heap, nodes) else: #use the csr -> csc sparse matrix conversion to quickly get - # both directions of neigbors + # both directions of neighbors dist_matrix_T = dist_matrix.T.tocsr() distances2 = np.asarray(dist_matrix_T.data, From 25bb667ac1028854049a2fadafad1727cb910574 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 20 Dec 2019 13:29:07 +1100 Subject: [PATCH 47/85] =?UTF-8?q?DOC=20in=20canned=20comment,=20mention=20?= =?UTF-8?q?that=20PR=20title=20becomes=20commit=20me=E2=80=A6=20(#15935)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/developers/tips.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index 4c313701b4aa6..ed049285c36f3 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -181,10 +181,10 @@ Issue/Comment: Linking to comments Please use links to comments, which make it a lot easier to see what you are referring to, rather than just linking to the issue. See [this](https://stackoverflow.com/questions/25163598/how-do-i-reference-a-specific-issue-comment-on-github) for more details. -PR-NEW: Better description +PR-NEW: Better description and title :: - Thanks for the pull request! Please make the title of the PR descriptive so that we can easily recall the issue it is resolving. You should state what issue (or PR) it fixes/resolves in the description (see [here](http://scikit-learn.org/dev/developers/contributing.html#contributing-pull-requests)). + Thanks for the pull request! Please make the title of the PR more descriptive. The title will become the commit message when this is merged. You should state what issue (or PR) it fixes/resolves in the description using the syntax described [here](http://scikit-learn.org/dev/developers/contributing.html#contributing-pull-requests). 
PR-NEW: Fix # :: From ce745354061611952f362e213eded66fff94689c Mon Sep 17 00:00:00 2001 From: Ritchie Ng Date: Fri, 20 Dec 2019 16:34:58 +0800 Subject: [PATCH 48/85] DOC/EXA Correct spelling of "Classification" (#15938) --- examples/linear_model/plot_sparse_logistic_regression_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 56b5457c6a27e..ab3749fb5e7f8 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -1,6 +1,6 @@ """ ===================================================== -MNIST classfification using multinomial logistic + L1 +MNIST classification using multinomial logistic + L1 ===================================================== Here we fit a multinomial logistic regression with L1 penalty on a subset of From 199e23d1f9969ec65b54b22eb55608ca62ecb3d8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Dec 2019 17:11:21 +0100 Subject: [PATCH 49/85] BUG fix pip3 ubuntu update by suffixing file (#15928) --- sklearn/_build_utils/deprecated_modules.py | 2 +- sklearn/datasets/__init__.py | 6 +++--- sklearn/datasets/_rcv1.py | 2 +- .../{_svmlight_format.py => _svmlight_format_io.py} | 6 ++++-- sklearn/tests/test_common.py | 2 +- sklearn/tests/test_docstring_parameters.py | 2 +- sklearn/utils/_testing.py | 2 +- 7 files changed, 12 insertions(+), 10 deletions(-) rename sklearn/datasets/{_svmlight_format.py => _svmlight_format_io.py} (99%) diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 5f11ac1714022..045dc3d297be0 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -133,7 +133,7 @@ 'sklearn.datasets', 'make_classification'), ('_species_distributions', 'sklearn.datasets.species_distributions', 'sklearn.datasets', 'fetch_species_distributions'), - ('_svmlight_format', 'sklearn.datasets.svmlight_format', + ('_svmlight_format_io', 'sklearn.datasets.svmlight_format', 'sklearn.datasets', 'load_svmlight_file'), ('_twenty_newsgroups', 'sklearn.datasets.twenty_newsgroups', 'sklearn.datasets', 'strip_newsgroup_header'), diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 2377de2bfd189..e7c93bb180567 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -42,9 +42,9 @@ from ._samples_generator import make_gaussian_quantiles from ._samples_generator import make_biclusters from ._samples_generator import make_checkerboard -from ._svmlight_format import load_svmlight_file -from ._svmlight_format import load_svmlight_files -from ._svmlight_format import dump_svmlight_file +from ._svmlight_format_io import load_svmlight_file +from ._svmlight_format_io import load_svmlight_files +from ._svmlight_format_io import dump_svmlight_file from ._olivetti_faces import fetch_olivetti_faces from ._species_distributions import fetch_species_distributions from ._california_housing import fetch_california_housing diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 53edc9a3407d8..887a8271eae5e 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -23,7 +23,7 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ._base import _refresh_cache -from ._svmlight_format import load_svmlight_files +from ._svmlight_format_io import 
load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch diff --git a/sklearn/datasets/_svmlight_format.py b/sklearn/datasets/_svmlight_format_io.py similarity index 99% rename from sklearn/datasets/_svmlight_format.py rename to sklearn/datasets/_svmlight_format_io.py index d344b310be995..91bb35ff2ec75 100644 --- a/sklearn/datasets/_svmlight_format.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -453,8 +453,10 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, Xval = check_array(X, accept_sparse='csr') if Xval.shape[0] != yval.shape[0]: - raise ValueError("X.shape[0] and y.shape[0] should be the same, got" - " %r and %r instead." % (Xval.shape[0], yval.shape[0])) + raise ValueError( + "X.shape[0] and y.shape[0] should be the same, got" + " %r and %r instead." % (Xval.shape[0], yval.shape[0]) + ) # We had some issues with CSR matrices with unsorted indices (e.g. #1501), # so sort them here, but first make sure we don't modify the user's X. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 9fc3075c5fe28..08388289bd043 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -179,7 +179,7 @@ def test_import_all_consistency(): for modname in submods + ['sklearn']: if ".tests." in modname: continue - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue package = __import__(modname, fromlist="dummy") diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 31c7d268737b9..28af419195813 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -140,7 +140,7 @@ def test_tabs(): for importer, modname, ispkg in walk_packages(sklearn.__path__, prefix='sklearn.'): - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 9695abf78a2b2..0ec21ed36bb45 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -513,7 +513,7 @@ def is_abstract(c): path=path, prefix='sklearn.', onerror=lambda x: None): if ".tests." in modname or "externals" in modname: continue - if IS_PYPY and ('_svmlight_format' in modname or + if IS_PYPY and ('_svmlight_format_io' in modname or 'feature_extraction._hashing_fast' in modname): continue # Ignore deprecation warnings triggered at import time. From d35a8da52ef70eb032092e12ca1280fe86b27de1 Mon Sep 17 00:00:00 2001 From: inderjeet <43402782+inder128@users.noreply.github.com> Date: Fri, 20 Dec 2019 22:05:59 +0530 Subject: [PATCH 50/85] [MRG] Ways to compute center_shift_total were different in "full" and "elkan" algorithms. (#15930) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/cluster/_k_means_elkan.pyx | 4 ++-- sklearn/cluster/tests/test_k_means.py | 15 +++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ae9cbbd74e313..068ef5ed5ecab 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,6 +15,13 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- +:mod:`sklearn.cluster` +...................... 
+ +- |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping + criterion as with the default ``algorithm="full"``. :pr:`15930` by + :user:`inder128`. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index abf2ea8aeac8d..87d32d1e47858 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -246,8 +246,8 @@ def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, print('Iteration %i, inertia %s' % (iteration, np.sum((X_ - centers_[labels]) ** 2 * sample_weight[:,np.newaxis]))) - center_shift_total = np.sum(center_shift) - if center_shift_total ** 2 < tol: + center_shift_total = np.sum(center_shift ** 2) + if center_shift_total < tol: if verbose: print("center shift %e within tolerance %e" % (center_shift_total, tol)) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 50c91382d3117..5a6125d28ced5 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -69,17 +69,19 @@ def test_kmeans_results(representation, algo, dtype): @pytest.mark.parametrize('distribution', ['normal', 'blobs']) -def test_elkan_results(distribution): +@pytest.mark.parametrize('tol', [1e-2, 1e-4, 1e-8]) +def test_elkan_results(distribution, tol): # check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(0) if distribution == 'normal': - X = rnd.normal(size=(50, 10)) + X = rnd.normal(size=(5000, 10)) else: X, _ = make_blobs(random_state=rnd) - km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1) + km_full = KMeans(algorithm='full', n_clusters=5, + random_state=0, n_init=1, tol=tol) km_elkan = KMeans(algorithm='elkan', n_clusters=5, - random_state=0, n_init=1) + random_state=0, n_init=1, tol=tol) km_full.fit(X) km_elkan.fit(X) @@ -87,6 +89,11 @@ def test_elkan_results(distribution): km_full.cluster_centers_) assert_array_equal(km_elkan.labels_, km_full.labels_) + # The number of iterations and inertia should be close but not + # necessarily exactly the same because of rounding errors. 
+ assert km_elkan.n_iter_ == pytest.approx(km_full.n_iter_, rel=0.01) + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold From c09a4e7f54d90ccba6bd4892e1a64f5bf2281f31 Mon Sep 17 00:00:00 2001 From: scibol Date: Fri, 20 Dec 2019 18:43:30 +0100 Subject: [PATCH 51/85] TST Fixes integer test for train and test indices (#15941) --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index eb5eb192a6921..253593968ad24 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -175,7 +175,7 @@ def test_cross_validator_with_default_params(): # Test that train, test indices returned are integers for train, test in cv.split(X, y, groups): assert np.asarray(train).dtype.kind == 'i' - assert np.asarray(train).dtype.kind == 'i' + assert np.asarray(test).dtype.kind == 'i' # Test if the repr works without any errors assert cv_repr == repr(cv) From 126b0a4fa539e1d0ebe40abb12359a385bbbc06d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Dec 2019 19:14:49 +0100 Subject: [PATCH 52/85] BUG ensure that parallel/sequential give the same permutation importances (#15933) --- doc/whats_new/v0.22.rst | 13 ++ sklearn/inspection/_permutation_importance.py | 61 ++++----- .../tests/test_permutation_importance.py | 126 ++++++++++++++++++ 3 files changed, 168 insertions(+), 32 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 068ef5ed5ecab..be5688d3a32ae 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -15,12 +15,25 @@ This is a bug-fix release to primarily resolve some packaging issues in version Changelog --------- + :mod:`sklearn.cluster` ...................... - |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping criterion as with the default ``algorithm="full"``. :pr:`15930` by :user:`inder128`. + +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`inspection.permutation_importance` will return the same + `importances` when a `random_state` is given for both `n_jobs=1` or + `n_jobs>1` both with shared memory backends (thread-safety) and + isolated memory, process-based backends. + Also avoid casting the data as object dtype and avoid read-only error + on large dataframes with `n_jobs>1` as reported in :issue:`15810`. + Follow-up of :pr:`15898` by :user:`Shivam Gargsya `. + :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_. :mod:`sklearn.metrics` ...................... 
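The guarantee described in the changelog entry above can be sketched as follows (illustration only, not part of the patch; it assumes a scikit-learn build that ships `sklearn.inspection.permutation_importance`, i.e. 0.22 or later)::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.inspection import permutation_importance
    from sklearn.linear_model import LinearRegression

    X, y = make_regression(n_samples=200, n_features=5, random_state=0)
    model = LinearRegression().fit(X, y)

    # With a fixed random_state, the sequential and the parallel runs are
    # expected to yield identical permutation importances.
    r_seq = permutation_importance(model, X, y, n_repeats=5,
                                   random_state=0, n_jobs=1)
    r_par = permutation_importance(model, X, y, n_repeats=5,
                                   random_state=0, n_jobs=2)
    np.testing.assert_allclose(r_seq.importances, r_par.importances)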
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d71d5fd3f3a68..80bf4d2e2a62c 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -4,41 +4,36 @@
 from joblib import delayed
 
 from ..metrics import check_scoring
+from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import Bunch
-
-
-def _safe_column_setting(X, col_idx, values):
-    """Set column on X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        X.iloc[:, col_idx] = values
-    else:
-        X[:, col_idx] = values
-
-
-def _safe_column_indexing(X, col_idx):
-    """Return column from X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        return X.iloc[:, col_idx].values
-    else:
-        return X[:, col_idx]
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
-    original_feature = _safe_column_indexing(X, col_idx).copy()
-    temp = original_feature.copy()
+    random_state = check_random_state(random_state)
+
+    # Work on a copy of X to ensure thread-safety in case of threading based
+    # parallelism. Furthermore, making a copy is also useful when the joblib
+    # backend is 'loky' (default) or the old 'multiprocessing': in those cases,
+    # if X is large it will automatically be backed by a readonly memory map
+    # (memmap). X.copy() on the other hand is always guaranteed to return a
+    # writable data-structure whose columns can be shuffled inplace.
+    X_permuted = X.copy()
     scores = np.zeros(n_repeats)
+    shuffling_idx = np.arange(X.shape[0])
     for n_round in range(n_repeats):
-        random_state.shuffle(temp)
-        _safe_column_setting(X, col_idx, temp)
-        feature_score = scorer(estimator, X, y)
+        random_state.shuffle(shuffling_idx)
+        if hasattr(X_permuted, "iloc"):
+            col = X_permuted.iloc[shuffling_idx, col_idx]
+            col.index = X_permuted.index
+            X_permuted.iloc[:, col_idx] = col
+        else:
+            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
+        feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
 
-    _safe_column_setting(X, col_idx, original_feature)
-
     return scores
 
 
@@ -104,20 +99,22 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
          2001. https://doi.org/10.1023/A:1010933404324
     """
-    if hasattr(X, "iloc"):
-        X = X.copy()  # Dataframe
-    else:
-        X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
-                        copy=True)
-
+    if not hasattr(X, "iloc"):
+        X = check_array(X, force_all_finite='allow-nan', dtype=None)
+
+    # Precompute random seed from the random state to be used
+    # to get a fresh independent RandomState instance for each
+    # parallel call to _calculate_permutation_scores, irrespective of
+    # the fact that variables are shared or not depending on the active
+    # joblib backend (sequential, thread-based or process-based).
random_state = check_random_state(random_state) - scorer = check_scoring(estimator, scoring=scoring) + random_seed = random_state.randint(np.iinfo(np.int32).max + 1) + scorer = check_scoring(estimator, scoring=scoring) baseline_score = scorer(estimator, X, y) - scores = np.zeros((X.shape[1], n_repeats)) scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)( - estimator, X, y, col_idx, random_state, n_repeats, scorer + estimator, X, y, col_idx, random_seed, n_repeats, scorer ) for col_idx in range(X.shape[1])) importances = baseline_score - np.array(scores) diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index 671a1e11b1fec..2a31a031f2938 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -6,7 +6,9 @@ from sklearn.compose import ColumnTransformer from sklearn.datasets import load_boston from sklearn.datasets import load_iris +from sklearn.datasets import make_classification from sklearn.datasets import make_regression +from sklearn.dummy import DummyClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression @@ -14,9 +16,13 @@ from sklearn.impute import SimpleImputer from sklearn.inspection import permutation_importance from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import scale +from sklearn.utils import parallel_backend +from sklearn.utils._testing import _convert_container + @pytest.mark.parametrize("n_jobs", [1, 2]) def test_permutation_importance_correlated_feature_regression(n_jobs): @@ -150,3 +156,123 @@ def test_permutation_importance_linear_regresssion(): scoring='neg_mean_squared_error') assert_allclose(expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6) + + +def test_permutation_importance_equivalence_sequential_parallel(): + # regression test to make sure that sequential and parallel calls will + # output the same results. 
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    lr = LinearRegression().fit(X, y)
+
+    importance_sequential = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=1
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_sequential['importances'].min()
+    imp_max = importance_sequential['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Then actually check that parallelism does not impact the results
+    # either with shared memory (threading) or without isolated memory
+    # via process-based parallelism using the default backend
+    # ('loky' or 'multiprocessing') depending on the joblib version:
+
+    # process-based parallelism (by default):
+    importance_processes = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2)
+    assert_allclose(
+        importance_processes['importances'],
+        importance_sequential['importances']
+    )
+
+    # thread-based parallelism:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
+    assert_allclose(
+        importance_threading['importances'],
+        importance_sequential['importances']
+    )
+
+
+@pytest.mark.parametrize("n_jobs", [None, 1, 2])
+def test_permutation_importance_equivalence_array_dataframe(n_jobs):
+    # This test checks that the column shuffling logic has the same behavior
+    # on both a dataframe and a simple numpy array.
+    pd = pytest.importorskip('pandas')
+
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
+    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
+    X_df = pd.DataFrame(X)
+
+    # Add a categorical feature that is statistically linked to y:
+    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
+    cat_column = binner.fit_transform(y.reshape(-1, 1))
+
+    # Concatenate the extra column to the numpy array: integers will be
+    # cast to float values
+    X = np.hstack([X, cat_column])
+    assert X.dtype.kind == "f"
+
+    # Insert extra column as a non-numpy-native dtype (while keeping backward
+    # compat for old pandas versions):
+    if hasattr(pd, "Categorical"):
+        cat_column = pd.Categorical(cat_column.ravel())
+    else:
+        cat_column = cat_column.ravel()
+    new_col_idx = len(X_df.columns)
+    X_df[new_col_idx] = cat_column
+    assert X_df[new_col_idx].dtype == cat_column.dtype
+
+    # Stitch an arbitrary index to the dataframe:
+    X_df.index = np.arange(len(X_df)).astype(str)
+
+    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf.fit(X, y)
+
+    n_repeats = 3
+    importance_array = permutation_importance(
+        rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_array['importances'].min()
+    imp_max = importance_array['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Now check that importances computed on the dataframe match the values
+    # of those computed on the array with the same data.
+ importance_dataframe = permutation_importance( + rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs + ) + assert_allclose( + importance_array['importances'], + importance_dataframe['importances'] + ) + + +@pytest.mark.parametrize("input_type", ["array", "dataframe"]) +def test_permutation_importance_large_memmaped_data(input_type): + # Smoke, non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15810 + n_samples, n_features = int(5e4), 4 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + random_state=0) + assert X.nbytes > 1e6 # trigger joblib memmaping + + X = _convert_container(X, input_type) + clf = DummyClassifier(strategy='prior').fit(X, y) + + # Actual smoke test: should not raise any error: + n_repeats = 5 + r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2) + + # Auxiliary check: DummyClassifier is feature independent: + # permutating feature should not change the predictions + expected_importances = np.zeros((n_features, n_repeats)) + assert_allclose(expected_importances, r.importances) From 476870308402d1a2f47c1546a6d2df4458e8acfc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 21 Dec 2019 00:56:38 +0100 Subject: [PATCH 53/85] Formatting fixes in changelog (#15944) --- doc/whats_new/v0.22.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index be5688d3a32ae..00c957448550f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -19,10 +19,10 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping - criterion as with the default ``algorithm="full"``. :pr:`15930` by +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now uses the same + stopping criterion as with the default ``algorithm="full"``. :pr:`15930` by :user:`inder128`. - + :mod:`sklearn.inspection` ......................... @@ -51,7 +51,7 @@ Changelog :mod:`sklearn.inspection` ......................... -- |Fix| :func:`inspection.plot_partial_dependence` and +- |Fix| :func:`inspection.plot_partial_dependence` and :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks the number of axes passed in. :pr:`15760` by `Thomas Fan`_. From 1739eb9819c2b0aeaaaf2bcd231983c15613e969 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Sat, 21 Dec 2019 22:00:37 +0530 Subject: [PATCH 54/85] MRG FIX: order of values of self.quantiles_ in QuantileTransformer (#15751) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/preprocessing/_data.py | 10 ++++++++++ sklearn/preprocessing/tests/test_data.py | 21 +++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 00c957448550f..d1cfe0d42239c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -817,6 +817,10 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.model_selection` .............................. 
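The fix recorded in the entry above boils down to a cumulative maximum over the estimated percentiles; a minimal sketch of the idea and of the resulting guarantee (illustration only, not part of the patch, assuming scikit-learn 0.22 or later)::

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    # np.maximum.accumulate absorbs tiny floating-point inversions and turns
    # an almost-sorted array of percentiles into a non-decreasing one.
    noisy = np.array([0.1, 0.2, 0.2 - 1e-12, 0.3])
    assert np.all(np.diff(np.maximum.accumulate(noisy)) >= 0)

    # With the fix applied, the fitted `quantiles_` attribute is sorted.
    X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
    X = 0.1 * X.reshape(-1, 1)
    qt = QuantileTransformer(n_quantiles=100).fit(X)
    assert np.all(np.diff(qt.quantiles_[:, 0]) >= 0)

Using a cumulative maximum rather than a full sort keeps the post-processing linear in the number of quantiles and only moves values that violate monotonicity because of rounding.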
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 2b360af9ad40a..7e48f8e0c3bf6 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -2262,6 +2262,11 @@ def _dense_fit(self, X, random_state):
                 col = col.take(subsample_idx, mode='clip')
             self.quantiles_.append(np.nanpercentile(col, references))
         self.quantiles_ = np.transpose(self.quantiles_)
+        # Due to floating-point precision error in `np.nanpercentile`,
+        # make sure that quantiles are monotonically increasing.
+        # Upstream issue in numpy:
+        # https://github.com/numpy/numpy/issues/14685
+        self.quantiles_ = np.maximum.accumulate(self.quantiles_)
 
     def _sparse_fit(self, X, random_state):
         """Compute percentiles for sparse matrices.
@@ -2305,6 +2310,11 @@ def _sparse_fit(self, X, random_state):
                 self.quantiles_.append(
                         np.nanpercentile(column_data, references))
         self.quantiles_ = np.transpose(self.quantiles_)
+        # due to floating-point precision error in `np.nanpercentile`,
+        # make sure the quantiles are monotonically increasing
+        # Upstream issue in numpy:
+        # https://github.com/numpy/numpy/issues/14685
+        self.quantiles_ = np.maximum.accumulate(self.quantiles_)
 
     def fit(self, X, y=None):
         """Compute the quantiles used for transforming.
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 060719200fa99..49297940811d1 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -25,6 +25,7 @@
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_allclose_dense_sparse
 from sklearn.utils._testing import skip_if_32bit
+from sklearn.utils._testing import _convert_container
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
 from sklearn.preprocessing._data import _handle_zeros_in_scale
@@ -1545,6 +1546,26 @@ def test_deprecated_quantile_transform_copy():
                               np.array([[0, 1], [0, 0.5], [1, 0]]))
 
 
+@pytest.mark.parametrize("array_type", ['array', 'sparse'])
+def test_quantile_transformer_sorted_quantiles(array_type):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15733
+    # Taken from upstream bug report:
+    # https://github.com/numpy/numpy/issues/14685
+    X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
+    X = 0.1 * X.reshape(-1, 1)
+    X = _convert_container(X, array_type)
+
+    n_quantiles = 100
+    qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)
+
+    # Check that the estimated quantile thresholds are monotonically
+    # increasing:
+    quantiles = qt.quantiles_[:, 0]
+    assert len(quantiles) == 100
+    assert all(np.diff(quantiles) >= 0)
+
+
 def test_robust_scaler_invalid_range():
     for range_ in [
         (-1, 90),
From 23eb807761bc88482e52074d981cfe834b236c32 Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Sat, 21 Dec 2019 12:07:54 -0500
Subject: [PATCH 55/85] [MRG] BUG Fixes contrast in plot_confusion_matrix (#15936)

---
 doc/whats_new/v0.22.rst                        |  4 ++++
 sklearn/metrics/_plot/confusion_matrix.py      |  2 +-
 .../_plot/tests/test_plot_confusion_matrix.py  | 18 ++++++++++++++++--
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index d1cfe0d42239c..f7b5fda459a09 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -42,6 +42,10 @@ Changelog
   is invalid. Previously, it runs fine with no normalization. :pr:`15888`
   by `Hanmin Qin`_.
+- |Fix| :func:`metrics.plot_confusion_matrix` now colors the label color + correctly to maximize contrast with its background. :pr:`15936` by + `Thomas Fan`_ and :user:`DizietAsahi`. + :mod:`sklearn.utils` .................... diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 1c3fc2715ffb3..f759c4d5c1c3d 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -93,7 +93,7 @@ def plot(self, include_values=True, cmap='viridis', values_format = '.2g' # print text with appropriate color depending on background - thresh = (cm.max() - cm.min()) / 2. + thresh = (cm.max() + cm.min()) / 2.0 for i, j in product(range(n_classes), range(n_classes)): color = cmap_max if cm[i, j] < thresh else cmap_min self.text_[i, j] = ax.text(j, i, diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index cf8d597eb4f44..91ff411ab4f13 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -201,7 +201,7 @@ def test_confusion_matrix_contrast(pyplot): assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) - # oof-diagonal text is white + # off-diagonal text is white assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) @@ -210,10 +210,24 @@ def test_confusion_matrix_contrast(pyplot): assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) - # oof-diagonal text is black + # off-diagonal text is black assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + # Regression test for #15920 + cm = np.array([[19, 34], [32, 58]]) + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.Blues) + min_color = pyplot.cm.Blues(0) + max_color = pyplot.cm.Blues(255) + assert_allclose(disp.text_[0, 0].get_color(), max_color) + assert_allclose(disp.text_[0, 1].get_color(), max_color) + assert_allclose(disp.text_[1, 0].get_color(), max_color) + assert_allclose(disp.text_[1, 1].get_color(), min_color) + + + @pytest.mark.parametrize( "clf", [LogisticRegression(), From 24ecdad84126173097976b498199ba786ef6ccf3 Mon Sep 17 00:00:00 2001 From: Bibhash Chandra Mitra Date: Sat, 21 Dec 2019 22:49:50 +0530 Subject: [PATCH 56/85] BUG use zero_division argument in classification_report (#15879) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/metrics/_classification.py | 3 ++- sklearn/metrics/tests/test_classification.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f7b5fda459a09..e5cf5a29a8d52 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -46,6 +46,10 @@ Changelog correctly to maximize contrast with its background. :pr:`15936` by `Thomas Fan`_ and :user:`DizietAsahi`. +- |Fix| :func:`metrics.classification_report` does no longer ignore the + value of the ``zero_division`` keyword argument. :pr:`15879` + by :user:`Bibhash Chandra Mitra `. + :mod:`sklearn.utils` .................... 
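The ``zero_division`` keyword referred to in the entry above can be exercised as follows (illustration only, not part of the patch; it assumes scikit-learn 0.22 or later, where `classification_report` accepts `zero_division`)::

    from sklearn.metrics import classification_report

    y_true = ["a", "b", "c"]
    y_pred = ["a", "b", "d"]  # "c" is never predicted, "d" never occurs in y_true

    # With zero_division=0, ill-defined precision/recall values are set to 0
    # instead of triggering the "ill-defined" warning; with this fix the
    # averaged scores respect the keyword as well.
    report = classification_report(y_true, y_pred, zero_division=0,
                                   output_dict=True)
    print(report["macro avg"])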
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index c5ebf6f4cec10..2946488d8fe0f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2038,7 +2038,8 @@ class 2 1.00 0.67 0.80 3 # compute averages with specified averaging method avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( y_true, y_pred, labels=labels, - average=average, sample_weight=sample_weight) + average=average, sample_weight=sample_weight, + zero_division=zero_division) avg = [avg_p, avg_r, avg_f1, np.sum(s)] if output_dict: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index e26e4e32c5e72..46da9db6e68ed 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message @@ -155,6 +154,22 @@ def test_classification_report_dictionary_output(): assert type(expected_report['macro avg']['support']) == int +@pytest.mark.parametrize('zero_division', ["warn", 0, 1]) +def test_classification_report_zero_division_warning(zero_division): + y_true, y_pred = ["a", "b", "c"], ["a", "b", "d"] + with warnings.catch_warnings(record=True) as record: + classification_report( + y_true, y_pred, zero_division=zero_division, output_dict=True) + if zero_division == "warn": + assert len(record) > 1 + for item in record: + msg = ("Use `zero_division` parameter to control this " + "behavior.") + assert msg in str(item.message) + else: + assert not record + + def test_multilabel_accuracy_score_subset_accuracy(): # Dense label indicator matrix format y1 = np.array([[0, 1, 1], [1, 0, 1]]) From 70cee110982a87b828dabb5aa9a02b4674cc7e9b Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Mon, 23 Dec 2019 11:29:47 +0100 Subject: [PATCH 57/85] DOC change logreg solver in plot_logistic_path (#15927) --- examples/linear_model/plot_logistic_path.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index 79b5522575eb0..7aead065f3445 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -14,7 +14,7 @@ coefficients are exactly 0. When regularization gets progressively looser, coefficients can get non-zero values one after the other. -Here we choose the SAGA solver because it can efficiently optimize for the +Here we choose the liblinear solver because it can efficiently optimize for the Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. Also note that we set a low value for the tolerance to make sure that the model @@ -55,9 +55,10 @@ print("Computing regularization path ...") start = time() -clf = linear_model.LogisticRegression(penalty='l1', solver='saga', +clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear', tol=1e-6, max_iter=int(1e6), - warm_start=True) + warm_start=True, + intercept_scaling=10000.) 
coefs_ = [] for c in cs: clf.set_params(C=c) From 10c0945d219511431927d977a4642b7b7942a17c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 17:20:41 +0100 Subject: [PATCH 58/85] DOC fix whats new ordering (#15961) --- doc/whats_new/v0.22.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e5cf5a29a8d52..370f02fda49f9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -35,6 +35,10 @@ Changelog Follow-up of :pr:`15898` by :user:`Shivam Gargsya `. :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_. +- |Fix| :func:`inspection.plot_partial_dependence` and + :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks + the number of axes passed in. :pr:`15760` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... @@ -56,13 +60,6 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. -:mod:`sklearn.inspection` -......................... - -- |Fix| :func:`inspection.plot_partial_dependence` and - :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks - the number of axes passed in. :pr:`15760` by `Thomas Fan`_. - .. _changes_0_22: Version 0.22.0 From bcc3c24367e5859e2202a9007d249d0d0333c60a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 09:41:21 +0100 Subject: [PATCH 59/85] COSMIT use np.iinfo to define the max int32 (#15960) --- sklearn/__init__.py | 2 +- sklearn/ensemble/_base.py | 4 +--- sklearn/feature_extraction/_hash.py | 2 +- sklearn/feature_extraction/_hashing_fast.pyx | 2 +- sklearn/feature_extraction/text.py | 2 +- sklearn/tree/_classes.py | 2 +- 6 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 806a317bd03b5..cbea669c69269 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -106,7 +106,7 @@ def setup_module(module): # Check if a random seed exists in the environment, if not create one. 
_random_seed = os.environ.get('SKLEARN_SEED', None) if _random_seed is None: - _random_seed = np.random.uniform() * (2 ** 31 - 1) + _random_seed = np.random.uniform() * np.iinfo(np.int32).max _random_seed = int(_random_seed) print("I: Seeding RNGs with %r" % _random_seed) np.random.seed(_random_seed) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 5db30b9bbc600..9c6d8cbce0206 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -19,8 +19,6 @@ from ..utils import check_random_state from ..utils.metaestimators import _BaseComposition -MAX_RAND_SEED = np.iinfo(np.int32).max - def _parallel_fit_estimator(estimator, X, y, sample_weight=None): """Private function used to fit an estimator within a job.""" @@ -71,7 +69,7 @@ def _set_random_states(estimator, random_state=None): to_set = {} for key in sorted(estimator.get_params(deep=True)): if key == 'random_state' or key.endswith('__random_state'): - to_set[key] = random_state.randint(MAX_RAND_SEED) + to_set[key] = random_state.randint(np.iinfo(np.int32).max) if to_set: estimator.set_params(**to_set) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 83c3cf4857dbd..f5a0ba540ccf9 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -101,7 +101,7 @@ def _validate_params(n_features, input_type): if not isinstance(n_features, numbers.Integral): raise TypeError("n_features must be integral, got %r (%s)." % (n_features, type(n_features))) - elif n_features < 1 or n_features >= 2 ** 31: + elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1: raise ValueError("Invalid number of features (%d)." % n_features) if input_type not in ("dict", "pair", "string"): diff --git a/sklearn/feature_extraction/_hashing_fast.pyx b/sklearn/feature_extraction/_hashing_fast.pyx index 87980db0f435d..d5f8de592b5c6 100644 --- a/sklearn/feature_extraction/_hashing_fast.pyx +++ b/sklearn/feature_extraction/_hashing_fast.pyx @@ -88,7 +88,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype, indices_a = np.frombuffer(indices, dtype=np.int32) indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype) - if indptr[-1] > 2147483648: # = 2**31 + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 if sp_version < (0, 14): raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 07bf4d20b2b54..b5ca1302e5a03 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1150,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab): raise ValueError("empty vocabulary; perhaps the documents only" " contain stop words") - if indptr[-1] > 2147483648: # = 2**31 - 1 + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 if _IS_32BIT: raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 07d1ee11e74a3..1201f4af843c8 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters - max_depth = ((2 ** 31) - 1 if self.max_depth is None + max_depth = (np.iinfo(np.int32).max if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) From 6a6ea374882212900ea3c6fc366fcc732fd8b1eb 
Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:10:17 -0500 Subject: [PATCH 60/85] DOC Apply numpydoc validation to VotingRegressor methods (#15969) Co-authored-by: Tiffany R. Williams --- sklearn/ensemble/_voting.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 3d6d8016cf6ed..23f381ca75750 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -326,7 +326,7 @@ class VotingRegressor(RegressorMixin, _BaseVoting): Parameters ---------- - estimators : list of (string, estimator) tuples + estimators : list of (str, estimator) tuples Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones of those original estimators that will be stored in the class attribute ``self.estimators_``. An estimator can be set to ``'drop'`` using @@ -357,6 +357,10 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.20 + See Also + -------- + VotingClassifier: Soft Voting/Majority Rule classifier. + Examples -------- >>> import numpy as np @@ -370,10 +374,6 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> er = VotingRegressor([('lr', r1), ('rf', r2)]) >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] - - See also - -------- - VotingClassifier: Soft Voting/Majority Rule classifier. """ def __init__(self, estimators, weights=None, n_jobs=None): @@ -382,7 +382,7 @@ def __init__(self, estimators, weights=None, n_jobs=None): self.n_jobs = n_jobs def fit(self, X, y, sample_weight=None): - """ Fit the estimators. + """Fit the estimators. Parameters ---------- @@ -401,6 +401,7 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + Fitted estimator. """ y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) @@ -435,9 +436,8 @@ def transform(self, X): Returns ------- - predictions - array-like of shape (n_samples, n_classifiers), being - values predicted by each regressor. + predictions: array of shape (n_samples, n_classifiers) + Values predicted by each regressor. """ check_is_fitted(self) return self._predict(X) From 882830a0ae9a57098d372c144be81f76b9b8e009 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Thu, 26 Dec 2019 06:37:34 -0500 Subject: [PATCH 61/85] DOC improve naive_bayes.py documentation (#15943) Co-authored-by: Jigna Panchal <40188288+jigna-panchal@users.noreply.github.com> --- sklearn/naive_bayes.py | 127 ++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 585ba69fbb1ce..cebc428e17b12 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -139,23 +139,23 @@ class GaussianNB(_BaseNB): Attributes ---------- - class_prior_ : array, shape (n_classes,) - probability of each class. - class_count_ : array, shape (n_classes,) number of training samples observed in each class. + class_prior_ : array, shape (n_classes,) + probability of each class. 
+ classes_ : array, shape (n_classes,) class labels known to the classifier - theta_ : array, shape (n_classes, n_features) - mean of each feature per class + epsilon_ : float + absolute additive value to variances sigma_ : array, shape (n_classes, n_features) variance of each feature per class - epsilon_ : float - absolute additive value to variances + theta_ : array, shape (n_classes, n_features) + mean of each feature per class Examples -------- @@ -685,33 +685,33 @@ class MultinomialNB(_BaseDiscreteNB): Attributes ---------- + class_count_ : array, shape (n_classes,) + Number of samples encountered for each class during fitting. This + value is weighted by the sample weight when provided. + class_log_prior_ : array, shape (n_classes, ) Smoothed empirical log probability for each class. - intercept_ : array, shape (n_classes, ) - Mirrors ``class_log_prior_`` for interpreting MultinomialNB - as a linear model. - - feature_log_prob_ : array, shape (n_classes, n_features) - Empirical log probability of features - given a class, ``P(x_i|y)``. + classes_ : array, shape (n_classes,) + Class labels known to the classifier coef_ : array, shape (n_classes, n_features) Mirrors ``feature_log_prob_`` for interpreting MultinomialNB as a linear model. - class_count_ : array, shape (n_classes,) - Number of samples encountered for each class during fitting. This - value is weighted by the sample weight when provided. - - classes_ : array, shape (n_classes,) - Class labels known to the classifier - feature_count_ : array, shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape (n_classes, n_features) + Empirical log probability of features + given a class, ``P(x_i|y)``. + + intercept_ : array, shape (n_classes, ) + Mirrors ``class_log_prior_`` for interpreting MultinomialNB + as a linear model. + n_features_ : int Number of features of each sample. @@ -797,31 +797,31 @@ class ComplementNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape (n_classes, ) - Smoothed empirical log probability for each class. Only used in edge - case with a single class in the training set. - - feature_log_prob_ : array, shape (n_classes, n_features) - Empirical weights for class complements. - class_count_ : array, shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. + class_log_prior_ : array, shape (n_classes, ) + Smoothed empirical log probability for each class. Only used in edge + case with a single class in the training set. + classes_ : array, shape (n_classes,) Class labels known to the classifier + feature_all_ : array, shape (n_features,) + Number of samples encountered for each feature during fitting. This + value is weighted by the sample weight when provided. + feature_count_ : array, shape (n_classes, n_features) Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape (n_classes, n_features) + Empirical weights for class complements. + n_features_ : int Number of features of each sample. - feature_all_ : array, shape (n_features,) - Number of samples encountered for each feature during fitting. This - value is weighted by the sample weight when provided. 
- Examples -------- >>> import numpy as np @@ -909,16 +909,13 @@ class BernoulliNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape = [n_classes] - Log probability of each class (smoothed). - - feature_log_prob_ : array, shape = [n_classes, n_features] - Empirical log probability of features given a class, P(x_i|y). - class_count_ : array, shape = [n_classes] Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. + class_log_prior_ : array, shape = [n_classes] + Log probability of each class (smoothed). + classes_ : array, shape (n_classes,) Class labels known to the classifier @@ -927,26 +924,12 @@ class BernoulliNB(_BaseDiscreteNB): during fitting. This value is weighted by the sample weight when provided. + feature_log_prob_ : array, shape = [n_classes, n_features] + Empirical log probability of features given a class, P(x_i|y). + n_features_ : int Number of features of each sample. - See Also - ---------- - MultinomialNB: The multinomial Naive Bayes classifier is \ - suitable for classification with discrete features. - - References - ---------- - C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to - Information Retrieval. Cambridge University Press, pp. 234-265. - https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html - - A. McCallum and K. Nigam (1998). A comparison of event models for naive - Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for - Text Categorization, pp. 41-48. - - V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with - naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). Examples -------- @@ -960,6 +943,19 @@ class BernoulliNB(_BaseDiscreteNB): BernoulliNB() >>> print(clf.predict(X[2:3])) [3] + + References + ---------- + C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to + Information Retrieval. Cambridge University Press, pp. 234-265. + https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html + + A. McCallum and K. Nigam (1998). A comparison of event models for naive + Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for + Text Categorization, pp. 41-48. + + V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with + naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, @@ -1036,22 +1032,25 @@ class CategoricalNB(_BaseDiscreteNB): Attributes ---------- - class_log_prior_ : array, shape (n_classes, ) - Smoothed empirical log probability for each class. - - feature_log_prob_ : list of arrays, len n_features + category_count_ : list of arrays, len n_features Holds arrays of shape (n_classes, n_categories of respective feature) - for each feature. Each array provides the empirical log probability - of categories given the respective feature and class, ``P(x_i|y)``. + for each feature. Each array provides the number of samples + encountered for each class and category of the specific feature. class_count_ : array, shape (n_classes,) Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. - category_count_ : list of arrays, len n_features + class_log_prior_ : array, shape (n_classes, ) + Smoothed empirical log probability for each class. 
+ + classes_ : array, shape (n_classes,) + Class labels known to the classifier + + feature_log_prob_ : list of arrays, len n_features Holds arrays of shape (n_classes, n_categories of respective feature) - for each feature. Each array provides the number of samples - encountered for each class and category of the specific feature. + for each feature. Each array provides the empirical log probability + of categories given the respective feature and class, ``P(x_i|y)``. n_features_ : int Number of features of each sample. From e8b24bb8c3157c6d1ed145a1c94d946d945f46de Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 11:43:10 +0000 Subject: [PATCH 62/85] DOC Fix default values in Perceptron documentation (#15965) --- sklearn/linear_model/_perceptron.py | 36 ++++----- sklearn/linear_model/_ridge.py | 109 ++++++++++++++-------------- 2 files changed, 74 insertions(+), 71 deletions(-) diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 10e4f27f5490e..157083c010390 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -12,25 +12,25 @@ class Perceptron(BaseSGDClassifier): Parameters ---------- - penalty : None, 'l2' or 'l1' or 'elasticnet' - The penalty (aka regularization term) to be used. Defaults to None. + penalty : {'l2','l1','elasticnet'}, default=None + The penalty (aka regularization term) to be used. - alpha : float + alpha : float, default=0.0001 Constant that multiplies the regularization term if regularization is - used. Defaults to 0.0001 + used. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > previous_loss - tol). @@ -39,20 +39,20 @@ class Perceptron(BaseSGDClassifier): shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - verbose : integer, default=0 + verbose : int, default=0 The verbosity level - eta0 : double - Constant by which the updates are multiplied. Defaults to 1. + eta0 : double, default=1 + Constant by which the updates are multiplied. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -80,7 +80,7 @@ class Perceptron(BaseSGDClassifier): .. 
versionadded:: 0.20 - class_weight : dict, {class_label: weight} or "balanced" or None, optional + class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes @@ -97,18 +97,18 @@ class Perceptron(BaseSGDClassifier): Attributes ---------- - coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ - n_features] + coef_ : ndarray of shape = [1, n_features] if n_classes == 2 else \ + [n_classes, n_features] Weights assigned to the features. - intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + intercept_ : ndarray of shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The unique classes labels. t_ : int diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9e1dd7f22085d..a6dad4bb372d9 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -245,11 +245,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Parameters ---------- - X : {array-like, sparse matrix, LinearOperator} of shape \ + X : {ndarray, sparse matrix, LinearOperator} of shape \ (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values alpha : float or array-like of shape (n_targets,) @@ -268,7 +268,8 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.17 - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -308,7 +309,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.19 SAGA solver. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For the 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' and saga solver, the default value is @@ -321,7 +322,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -349,14 +350,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', Returns ------- - coef : array of shape (n_features,) or (n_targets, n_features) + coef : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. - intercept : float or array of shape (n_targets,) + intercept : float or ndarray of shape (n_targets,) The intercept of the model. 
Only returned if `return_intercept` is True and if X is a scipy sparse array. @@ -618,7 +619,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Parameters ---------- - alpha : {float, array-like of shape (n_targets,)}, default=1.0 + alpha : {float, ndarray of shape (n_targets,)}, default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. @@ -643,7 +644,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. @@ -651,7 +652,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): tol : float, default=1e-3 Precision of the solution. - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -688,7 +690,7 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -700,14 +702,14 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (n_features,) or (n_targets, n_features) + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. @@ -747,13 +749,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -798,14 +800,14 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. - max_iter : int, optional + max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. tol : float, default=1e-3 Precision of the solution. 
- class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -813,7 +815,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ + default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. @@ -847,7 +850,7 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - random_state : int, RandomState instance or None, default=None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -856,20 +859,20 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Attributes ---------- - coef_ : array of shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. - n_iter_ : None or array of shape (n_targets,) + n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. See Also @@ -907,13 +910,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1121,7 +1124,7 @@ def _compute_gram(self, X, sqrt_sw): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) The preprocessed design matrix. sqrt_sw : ndarray of shape (n_samples,) @@ -1411,13 +1414,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. Will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to float64 if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. 
If given a float, every sample will have the same weight. @@ -1532,14 +1535,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. If using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) or (n_samples, n_targets) + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. @@ -1621,14 +1624,14 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If None, the negative mean squared error if cv is 'auto' or None (i.e. when using generalized cross-validation), and r2 score otherwise. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1645,7 +1648,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - gcv_mode : {None, 'auto', 'svd', eigen'}, optional + gcv_mode : {'auto', 'svd', eigen'}, default='auto' Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: @@ -1657,7 +1660,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): The 'auto' mode is the default and is intended to pick the cheaper option of the two depending on the shape of the training data. - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1665,17 +1668,17 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_alphas) or \ + cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True``\ and ``cv=None``). After ``fit()`` has been called, this attribute \ will contain the mean squared errors (by default) or the values \ of the ``{loss,score}_func`` function (if provided in the constructor). - coef_ : array of shape (n_features) or (n_targets, n_features) + coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. @@ -1734,12 +1737,12 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. 
- scoring : string, callable or None, default=None + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. - cv : int, cross-validation generator or an iterable, optional + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1751,7 +1754,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1759,7 +1762,7 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - store_cv_values : boolean, default=False + store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using @@ -1767,26 +1770,26 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): Attributes ---------- - cv_values_ : array of shape (n_samples, n_targets, n_alphas), optional + cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). This attribute exists only when ``store_cv_values`` is True. - coef_ : array of shape (1, n_features) or (n_targets, n_features) + coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. - intercept_ : float or array of shape (n_targets,) + intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. alpha_ : float Estimated regularization parameter - classes_ : array of shape (n_classes,) + classes_ : ndarray of shape (n_classes,) The classes labels. Examples @@ -1824,15 +1827,15 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. When using GCV, will be cast to float64 if necessary. - y : array-like of shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. Will be cast to X's dtype if necessary. - sample_weight : float or array-like of shape (n_samples,), default=None + sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. 
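For reference, a minimal doctest-style sketch of the defaults documented in the
Perceptron and Ridge docstrings above (illustrative only, not part of the diff;
the tiny synthetic arrays are made up for the example and assume scikit-learn
>= 0.22):

    >>> import numpy as np
    >>> from sklearn.linear_model import Perceptron, Ridge
    >>> X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
    >>> y = np.array([0, 0, 1, 1])
    >>> clf = Perceptron()   # penalty=None, alpha=0.0001, max_iter=1000, tol=1e-3
    >>> clf.fit(X, y).predict([[0.5, 0.5], [2.5, 2.5]])
    array([0, 1])
    >>> reg = Ridge()        # alpha=1.0, solver='auto', max_iter=None
    >>> reg.fit(X, y).predict([[1.5, 1.5]]).round(1)
    array([0.5])

Both estimators are constructed with their documented defaults; only the data
is hypothetical.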
From 3c2c3d34d129f4c20fff318d270ba1a6523e9dd2 Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 11:46:03 +0000 Subject: [PATCH 63/85] DOC Improve default values in logistic documentation (#15966) Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/_logistic.py | 268 +++++++++++++++--------------- 1 file changed, 135 insertions(+), 133 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index de7ac323833e8..b3eeb98b1353a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -52,18 +52,18 @@ def _intercept_dot(w, X, y): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. Returns ------- - w : ndarray, shape (n_features,) + w : ndarray of shape (n_features,) Coefficient vector without the intercept weight (w[-1]) if the intercept should be fit. Unchanged otherwise. @@ -88,19 +88,19 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -109,7 +109,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): out : float Logistic loss. - grad : ndarray, shape (n_features,) or (n_features + 1,) + grad : ndarray of shape (n_features,) or (n_features + 1,) Logistic gradient. """ n_samples, n_features = X.shape @@ -139,19 +139,19 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -175,25 +175,25 @@ def _logistic_grad_hess(w, X, y, alpha, sample_weight=None): Parameters ---------- - w : ndarray, shape (n_features,) or (n_features + 1,) + w : ndarray of shape (n_features,) or (n_features + 1,) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. 
- y : ndarray, shape (n_samples,) + y : ndarray of shape (n_samples,) Array of labels. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- - grad : ndarray, shape (n_features,) or (n_features + 1,) + grad : ndarray of shape (n_features,) or (n_features + 1,) Logistic gradient. Hs : callable @@ -252,20 +252,20 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns @@ -273,10 +273,10 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): loss : float Multinomial loss. - p : ndarray, shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) Estimated class probabilities. - w : ndarray, shape (n_classes, n_features) + w : ndarray of shape (n_classes, n_features) Reshaped param vector excluding intercept terms. Reference @@ -308,20 +308,20 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns @@ -329,11 +329,11 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): loss : float Multinomial loss. - grad : ndarray, shape (n_classes * n_features,) or - (n_classes * (n_features + 1),) + grad : ndarray of shape (n_classes * n_features,) or \ + (n_classes * (n_features + 1),) Ravelled gradient of the multinomial loss. - p : ndarray, shape (n_samples, n_classes) + p : ndarray of shape (n_samples, n_classes) Estimated class probabilities Reference @@ -362,26 +362,26 @@ def _multinomial_grad_hess(w, X, Y, alpha, sample_weight): Parameters ---------- - w : ndarray, shape (n_classes * n_features,) or + w : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),) Coefficient vector. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - Y : ndarray, shape (n_samples, n_classes) + Y : ndarray of shape (n_samples, n_classes) Transformed labels according to the output of LabelBinarizer. 
alpha : float Regularization parameter. alpha is equal to 1 / C. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) Array of weights that are assigned to individual samples. Returns ------- - grad : array, shape (n_classes * n_features,) or - (n_classes * (n_features + 1),) + grad : ndarray of shape (n_classes * n_features,) or \ + (n_classes * (n_features + 1),) Ravelled gradient of the multinomial loss. hessp : callable @@ -669,46 +669,47 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Input data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Input data, target values. - pos_class : int, None + pos_class : int, default=None The class with respect to which we perform a one-vs-all fit. If None, then it is assumed that the given problem is binary. - Cs : int | array-like, shape (n_cs,) + Cs : int or array-like of shape (n_cs,), default=10 List of values for the regularization parameter or integer specifying the number of regularization parameters that should be used. In this case, the parameters will be chosen in a logarithmic scale between 1e-4 and 1e4. - fit_intercept : bool + fit_intercept : bool, default=True Whether to fit an intercept for the model. In this case the shape of the returned array is (n_cs, n_features + 1). - max_iter : int + max_iter : int, default=100 Maximum number of iterations for the solver. - tol : float + tol : float, default=1e-4 Stopping criterion. For the newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient. - verbose : int + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'} + solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Numerical solver to use. - coef : array-like, shape (n_features,), default None + coef : array-like of shape (n_features,), default=None Initialization value for coefficients of logistic regression. Useless for liblinear solver. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -719,17 +720,17 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - dual : bool + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - penalty : str, 'l1', 'l2', or 'elasticnet' + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - intercept_scaling : float, default 1. + intercept_scaling : float, default=1. Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. 
In this case, x becomes [x, self.intercept_scaling], @@ -755,7 +756,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -763,19 +764,19 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, instance used by `np.random`. Used when ``solver`` == 'sag' or 'liblinear'. - check_input : bool, default True + check_input : bool, default=True If False, the input arrays X and y will not be checked. - max_squared_sum : float, default None + max_squared_sum : float, default=None Maximum squared sum of X over samples. Used only in SAG solver. If None, it will be computed, going through all the samples. The value should be precomputed to speed up cross validation. - sample_weight : array-like, shape(n_samples,) optional + sample_weight : array-like of shape(n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -784,7 +785,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Returns ------- - coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) List of coefficients for the Logistic Regression model. If fit_intercept is set to True then the second dimension will be n_features + 1, where the last item represents the intercept. For @@ -794,7 +795,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, Cs : ndarray Grid of Cs used for cross-validation. - n_iter : array, shape (n_cs,) + n_iter : array of shape (n_cs,) Actual number of iteration for each Cs. Notes @@ -1006,10 +1007,10 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. - y : array-like, shape (n_samples,) or (n_samples, n_targets) + y : array-like of shape (n_samples,) or (n_samples, n_targets) Target labels. train : list of indices @@ -1018,34 +1019,34 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, test : list of indices The indices of the test set. - pos_class : int, None + pos_class : int, default=None The class with respect to which we perform a one-vs-all fit. If None, then it is assumed that the given problem is binary. - Cs : list of floats | int + Cs : int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. If not provided, then a fixed set of values for Cs are used. 
- scoring : callable or None, optional, default: None + scoring : callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. For a list of scoring functions that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is accuracy_score. - fit_intercept : bool + fit_intercept : bool, default=False If False, then the bias term is set to zero. Else the last term of each coef_ gives us the intercept. - max_iter : int + max_iter : int, default=100 Maximum number of iterations for the solver. - tol : float + tol : float, default=1e-4 Tolerance for stopping criteria. - class_weight : dict or 'balanced', optional + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1056,24 +1057,25 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - verbose : int + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'} + solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Decides which solver to use. - penalty : str, 'l1', 'l2', or 'elasticnet' + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - dual : bool + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - intercept_scaling : float, default 1. + intercept_scaling : float, default=1. Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1085,13 +1087,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - multi_class : {'ovr', 'multinomial'} + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is binary*. 'multinomial' is unavailable when solver='liblinear'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -1099,16 +1101,16 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, instance used by `np.random`. Used when ``solver`` == 'sag' and 'liblinear'. - max_squared_sum : float, default None + max_squared_sum : float, default=None Maximum squared sum of X over samples. Used only in SAG solver. If None, it will be computed, going through all the samples. The value should be precomputed to speed up cross validation. 
- sample_weight : array-like, shape(n_samples,) optional + sample_weight : array-like of shape(n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1117,7 +1119,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Returns ------- - coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) List of coefficients for the Logistic Regression model. If fit_intercept is set to True then the second dimension will be n_features + 1, where the last item represents the intercept. @@ -1125,10 +1127,10 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, Cs : ndarray Grid of Cs used for cross-validation. - scores : ndarray, shape (n_cs,) + scores : ndarray of shape (n_cs,) Scores obtained for each Cs. - n_iter : array, shape(n_cs,) + n_iter : ndarray of shape(n_cs,) Actual number of iteration for each Cs. """ X_train = X[train] @@ -1214,7 +1216,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Parameters ---------- - penalty : str, 'l1', 'l2', 'elasticnet' or 'none', optional (default='l2') + penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. If 'none' (not supported by the @@ -1223,24 +1225,24 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) - dual : bool, optional (default=False) + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. - C : float, optional (default=1.0) + C : float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : bool, optional (default=True) + fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1 Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1253,7 +1255,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : dict or 'balanced', optional (default=None) + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1267,7 +1269,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. 
versionadded:: 0.17 *class_weight='balanced'* - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number @@ -1275,8 +1277,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, instance used by `np.random`. Used when ``solver`` == 'sag' or 'liblinear'. - solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - optional (default='lbfgs') + solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Algorithm to use in the optimization problem. @@ -1301,10 +1303,10 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. - max_iter : int, optional (default=100) + max_iter : int, default=100 Maximum number of iterations taken for the solvers to converge. - multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1317,11 +1319,11 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - verbose : int, optional (default=0) + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - warm_start : bool, optional (default=False) + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary `. @@ -1329,7 +1331,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of CPU cores used when parallelizing over classes if multi_class='ovr'". This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or @@ -1337,7 +1339,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - l1_ratio : float or None, optional (default=None) + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'`. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1347,17 +1349,17 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Attributes ---------- - classes_ : array, shape (n_classes, ) + classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. - coef_ : array, shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. In particular, when `multi_class='multinomial'`, `coef_` corresponds to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). 
- intercept_ : array, shape (1,) or (n_classes,) + intercept_ : ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. @@ -1366,7 +1368,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). - n_iter_ : array, shape (n_classes,) or (1, ) + n_iter_ : ndarray of shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum number of iteration across all classes is given. @@ -1460,14 +1462,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target vector relative to X. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -1701,18 +1703,18 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Parameters ---------- - Cs : list of floats or int, optional (default=10) + Cs : int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : bool, optional (default=True) + fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - cv : int or cross-validation generator, optional (default=None) + cv : int or cross-validation generator, default=None The default cross-validation generator used is Stratified K-Folds. If an integer is provided, then it is the number of folds used. See the module :mod:`sklearn.model_selection` module for the @@ -1721,25 +1723,25 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. - dual : bool, optional (default=False) + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - penalty : str, 'l1', 'l2', or 'elasticnet', optional (default='l2') + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. - scoring : string, callable, or None, optional (default=None) + scoring : str or callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. For a list of scoring functions that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is 'accuracy'. 
- solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - optional (default='lbfgs') + solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Algorithm to use in the optimization problem. @@ -1762,13 +1764,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionadded:: 0.19 SAGA solver. - tol : float, optional (default=1e-4) + tol : float, default=1e-4 Tolerance for stopping criteria. - max_iter : int, optional (default=100) + max_iter : int, default=100 Maximum number of iterations of the optimization algorithm. - class_weight : dict or 'balanced', optional (default=None) + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1782,24 +1784,24 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionadded:: 0.17 class_weight == 'balanced' - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of CPU cores used during the cross-validation loop. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : int, optional (default=0) + verbose : int, default=0 For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any positive number for verbosity. - refit : bool, optional (default=True) + refit : bool, default=True If set to True, the scores are averaged across all folds, and the coefs and the C that corresponds to the best score is taken, and a final refit is done using these parameters. Otherwise the coefs, intercepts and C that correspond to the best scores across folds are averaged. - intercept_scaling : float, optional (default=1) + intercept_scaling : float, default=1 Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1812,7 +1814,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + multi_class : {'auto, 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1825,7 +1827,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -1833,7 +1835,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Note that this only applies to the solver and not the cross-validation generator. - l1_ratios : list of float or None, optional (default=None) + l1_ratios : list of float, default=None The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. 
A value of 0 is equivalent to using ``penalty='l2'``, while 1 is equivalent to using @@ -1842,30 +1844,30 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Attributes ---------- - classes_ : array, shape (n_classes, ) + classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. - coef_ : array, shape (1, n_features) or (n_classes, n_features) + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. - intercept_ : array, shape (1,) or (n_classes,) + intercept_ : ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. `intercept_` is of shape(1,) when the problem is binary. - Cs_ : array, shape (n_cs) + Cs_ : ndarray of shape (n_cs) Array of C i.e. inverse of regularization parameter values used for cross-validation. - l1_ratios_ : array, shape (n_l1_ratios) + l1_ratios_ : ndarray of shape (n_l1_ratios) Array of l1_ratios used for cross-validation. If no l1_ratio is used (i.e. penalty is not 'elasticnet'), this is set to ``[None]`` - coefs_paths_ : array, shape (n_folds, n_cs, n_features) or \ + coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or \ (n_folds, n_cs, n_features + 1) dict with classes as the keys, and the path of coefficients obtained during cross-validating across each fold and then across each Cs @@ -1887,19 +1889,19 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, has shape ``(n_folds, n_cs`` or ``(n_folds, n_cs, n_l1_ratios)`` if ``penalty='elasticnet'``. - C_ : array, shape (n_classes,) or (n_classes - 1,) + C_ : ndarray of shape (n_classes,) or (n_classes - 1,) Array of C that maps to the best scores across every class. If refit is set to False, then for each class, the best C is the average of the C's that correspond to the best scores for each fold. `C_` is of shape(n_classes,) when the problem is binary. - l1_ratio_ : array, shape (n_classes,) or (n_classes - 1,) + l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,) Array of l1_ratio that maps to the best scores across every class. If refit is set to False, then for each class, the best l1_ratio is the average of the l1_ratio's that correspond to the best scores for each fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary. - n_iter_ : array, shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) + n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) Actual number of iterations for all classes, folds and Cs. In the binary or multinomial cases, the first dimension is equal to 1. If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, @@ -1952,14 +1954,14 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target vector relative to X. - sample_weight : array-like, shape (n_samples,) optional + sample_weight : array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. 
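For reference, a minimal doctest-style sketch of the solver/penalty
combinations documented in the LogisticRegression docstrings above
(illustrative only, not part of the diff; the tiny 1-D data set is made up for
the example):

    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.array([[-2.], [-1.], [1.], [2.]])
    >>> y = np.array([0, 0, 1, 1])
    >>> clf = LogisticRegression()   # solver='lbfgs', penalty='l2', C=1.0
    >>> clf.fit(X, y).predict([[-1.5], [1.5]])
    array([0, 1])
    >>> # 'elasticnet' requires the 'saga' solver and an explicit l1_ratio
    >>> enet = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)
    >>> enet.fit(X, y).predict([[1.5]])
    array([1])

The second snippet exercises the ``l1_ratio`` parameter described above; the
values chosen are arbitrary.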
From 31a8134a56ba2ef70c11be65b543b385648d2521 Mon Sep 17 00:00:00 2001 From: Pulkit Mehta Date: Thu, 26 Dec 2019 17:19:45 +0530 Subject: [PATCH 64/85] DOC Improve documentation of default values for imputers (#15964) --- sklearn/impute/_base.py | 16 ++++++++-------- sklearn/impute/_iterative.py | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 2ad49833641dc..c952831d85e1f 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -126,7 +126,7 @@ class SimpleImputer(_BaseImputer): The placeholder for the missing values. All occurrences of `missing_values` will be imputed. - strategy : string, optional (default="mean") + strategy : string, default='mean' The imputation strategy. - If "mean", then replace missing values using the mean along @@ -141,16 +141,16 @@ class SimpleImputer(_BaseImputer): .. versionadded:: 0.20 strategy="constant" for fixed value imputation. - fill_value : string or numerical value, optional (default=None) + fill_value : string or numerical value, default=None When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types. - verbose : integer, optional (default=0) + verbose : integer, default=0 Controls the verbosity of the imputer. - copy : boolean, optional (default=True) + copy : boolean, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: @@ -159,7 +159,7 @@ class SimpleImputer(_BaseImputer): - If X is encoded as a CSR matrix; - If add_indicator=True. - add_indicator : boolean, optional (default=False) + add_indicator : boolean, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no @@ -470,7 +470,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): `missing_values` will be indicated (True in the output array), the other values will be marked as False. - features : str, optional + features : str, default=None Whether the imputer mask should represent all or a subset of features. @@ -478,7 +478,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): features containing missing values during fit time. - If "all", the imputer mask will represent all features. - sparse : boolean or "auto", optional + sparse : boolean or "auto", default=None Whether the imputer mask format should be sparse or dense. - If "auto" (default), the imputer mask will be of same type as @@ -486,7 +486,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator): - If True, the imputer mask will be a sparse matrix. - If False, the imputer mask will be a numpy array. - error_on_new : boolean, optional + error_on_new : boolean, default=None If True (default), transform will raise an error when there are features with missing values in transform that have no missing values in fit. This is applicable only when ``features="missing-only"``. 
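For reference, a minimal doctest-style sketch of the SimpleImputer defaults
documented above (illustrative only, not part of the diff; the small array is
made up for the example):

    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> X = np.array([[1., 2.], [np.nan, 3.], [7., 6.]])
    >>> SimpleImputer().fit_transform(X)   # strategy='mean', missing_values=np.nan
    array([[1., 2.],
           [4., 3.],
           [7., 6.]])

The missing entry in the first column is replaced by that column's mean (4.0),
matching the default ``strategy='mean'`` behaviour described above.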
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index fa9d576f04008..7983b8dbe4062 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -52,7 +52,7 @@ class IterativeImputer(_BaseImputer): If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. - missing_values : int, np.nan, optional (default=np.nan) + missing_values : int, np.nan, default=np.nan The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. @@ -62,7 +62,7 @@ class IterativeImputer(_BaseImputer): ``return_std`` in its ``predict`` method if set to ``True``. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. - max_iter : int, optional (default=10) + max_iter : int, default=10 Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion @@ -70,10 +70,10 @@ class IterativeImputer(_BaseImputer): where `X_t` is `X` at iteration `t. Note that early stopping is only applied if ``sample_posterior=False``. - tol : float, optional (default=1e-3) + tol : float, default=1e-3 Tolerance of the stopping condition. - n_nearest_features : int, optional (default=None) + n_nearest_features : int, default=None Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after @@ -83,12 +83,12 @@ class IterativeImputer(_BaseImputer): imputed target feature. Can provide significant speed-up when the number of features is huge. If ``None``, all features will be used. - initial_strategy : str, optional (default="mean") + initial_strategy : str, default='mean' Which strategy to use to initialize the missing values. Same as the ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` Valid values: {"mean", "median", "most_frequent", or "constant"}. - imputation_order : str, optional (default="ascending") + imputation_order : str, default='ascending' The order in which the features will be imputed. Possible values: "ascending" @@ -102,34 +102,34 @@ class IterativeImputer(_BaseImputer): "random" A random order for each round. - skip_complete : boolean, optional (default=False) + skip_complete : boolean, default=False If ``True`` then features with missing values during ``transform`` which did not have any missing values during ``fit`` will be imputed with the initial imputation method only. Set to ``True`` if you have many features with no missing values at both ``fit`` and ``transform`` time to save compute. - min_value : float, optional (default=None) + min_value : float, default=None Minimum possible imputed value. Default of ``None`` will set minimum to negative infinity. - max_value : float, optional (default=None) + max_value : float, default=None Maximum possible imputed value. Default of ``None`` will set maximum to positive infinity. - verbose : int, optional (default=0) + verbose : int, default=0 Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose. Can be 0, 1, or 2. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance or None, default=None The seed of the pseudo random number generator to use. 
Randomizes selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. See :term:`the Glossary `. - add_indicator : boolean, optional (default=False) + add_indicator : boolean, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no @@ -443,7 +443,7 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): X_filled : ndarray, shape (n_samples, n_features) Input data with the most recent imputations. - tolerance : float, optional (default=1e-6) + tolerance : float, default=1e-6 ``abs_corr_mat`` can have nans, which will be replaced with ``tolerance``. From f0c6bacd3a11bad2841fa7422d575bf26a5584b2 Mon Sep 17 00:00:00 2001 From: David Breuer Date: Thu, 26 Dec 2019 13:02:31 +0100 Subject: [PATCH 65/85] EXA/MAINT Simplify code in manifold learning example (#15949) --- examples/manifold/plot_compare_methods.py | 94 +++++++---------------- 1 file changed, 29 insertions(+), 65 deletions(-) diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index 7c5f3b6200635..ed01e8ac19b89 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -23,6 +23,8 @@ print(__doc__) +from collections import OrderedDict +from functools import partial from time import time import matplotlib.pyplot as plt @@ -39,81 +41,43 @@ n_neighbors = 10 n_components = 2 +# Create figure fig = plt.figure(figsize=(15, 8)) -plt.suptitle("Manifold Learning with %i points, %i neighbors" +fig.suptitle("Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14) - +# Add 3d scatter plot ax = fig.add_subplot(251, projection='3d') ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) ax.view_init(4, -72) -methods = ['standard', 'ltsa', 'hessian', 'modified'] -labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE'] - -for i, method in enumerate(methods): +# Set-up manifold methods +LLE = partial(manifold.LocallyLinearEmbedding, + n_neighbors, n_components, eigen_solver='auto') + +methods = OrderedDict() +methods['LLE'] = LLE(method='standard') +methods['LTSA'] = LLE(method='ltsa') +methods['Hessian LLE'] = LLE(method='hessian') +methods['Modified LLE'] = LLE(method='modified') +methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) +methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) +methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, + n_neighbors=n_neighbors) +methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', + random_state=0) + +# Plot results +for i, (label, method) in enumerate(methods.items()): t0 = time() - Y = manifold.LocallyLinearEmbedding(n_neighbors, n_components, - eigen_solver='auto', - method=method).fit_transform(X) + Y = method.fit_transform(X) t1 = time() - print("%s: %.2g sec" % (methods[i], t1 - t0)) - - ax = fig.add_subplot(252 + i) - plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) - plt.title("%s (%.2g sec)" % (labels[i], t1 - t0)) + print("%s: %.2g sec" % (label, t1 - t0)) + ax = fig.add_subplot(2, 5, 2 + i + (i > 3)) + ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) + ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) 
ax.yaxis.set_major_formatter(NullFormatter()) - plt.axis('tight') - -t0 = time() -Y = manifold.Isomap(n_neighbors, n_components).fit_transform(X) -t1 = time() -print("Isomap: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(257) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("Isomap (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - - -t0 = time() -mds = manifold.MDS(n_components, max_iter=100, n_init=1) -Y = mds.fit_transform(X) -t1 = time() -print("MDS: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(258) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("MDS (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - - -t0 = time() -se = manifold.SpectralEmbedding(n_components=n_components, - n_neighbors=n_neighbors) -Y = se.fit_transform(X) -t1 = time() -print("SpectralEmbedding: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(259) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("SpectralEmbedding (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') - -t0 = time() -tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0) -Y = tsne.fit_transform(X) -t1 = time() -print("t-SNE: %.2g sec" % (t1 - t0)) -ax = fig.add_subplot(2, 5, 10) -plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) -plt.title("t-SNE (%.2g sec)" % (t1 - t0)) -ax.xaxis.set_major_formatter(NullFormatter()) -ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') + ax.axis('tight') plt.show() From 5d80a3938cd231d93eb3d5c1bf691d7c7db43911 Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 13:35:52 +0000 Subject: [PATCH 66/85] DOC Improve default values in SGD documentation (#15967) --- sklearn/linear_model/_stochastic_gradient.py | 115 +++++++++---------- 1 file changed, 56 insertions(+), 59 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index eb1e9e7b545e7..a9775a4ae850e 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -244,12 +244,12 @@ def _make_validation_split(self, y): Parameters ---------- - y : array, shape (n_samples, ) + y : ndarray of shape (n_samples, ) Target values. Returns ------- - validation_mask : array, shape (n_samples, ) + validation_mask : ndarray of shape (n_samples, ) Equal to 1 on the validation set, 0 on the training set. """ n_samples = y.shape[0] @@ -362,11 +362,11 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, sample_weight : numpy array of shape [n_samples, ] The weight of each sample - validation_mask : numpy array of shape [n_samples, ] or None + validation_mask : numpy array of shape [n_samples, ], default=None Precomputed validation mask in case _fit_binary is called in the context of a one-vs-rest reduction. 
- random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -641,10 +641,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Subset of the training data. - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Subset of the target values. - classes : array, shape (n_classes,) + classes : ndarray of shape (n_classes,), default=None Classes across all calls to partial_fit. Can be obtained by via `np.unique(y_all)`, where y_all is the target vector of the entire dataset. @@ -652,7 +652,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): and can be omitted in the subsequent calls. Note that y doesn't need to contain all labels in `classes`. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. @@ -685,16 +685,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None, X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Target values. - coef_init : array, shape (n_classes, n_features) + coef_init : ndarray of shape (n_classes, n_features), default=None The initial coefficients to warm-start the optimization. - intercept_init : array, shape (n_classes,) + intercept_init : ndarray of shape (n_classes,), default=None The initial intercept to warm-start the optimization. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. These weights will be multiplied with class_weight (passed through the @@ -738,7 +738,7 @@ class SGDClassifier(BaseSGDClassifier): Parameters ---------- - loss : str, default: 'hinge' + loss : str, default='hinge' The loss function to be used. Defaults to 'hinge', which gives a linear SVM. @@ -754,42 +754,41 @@ class SGDClassifier(BaseSGDClassifier): The other losses are designed for regression but can be useful in classification as well; see SGDRegressor for a description. - penalty : str, 'none', 'l2', 'l1', or 'elasticnet' + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. - alpha : float + alpha : float, default=0.0001 Constant that multiplies the regularization term. Defaults to 0.0001. Also used to compute learning_rate when set to 'optimal'. - l1_ratio : float + l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). 
It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. .. versionadded:: 0.19 - shuffle : bool, optional + shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - Defaults to True. verbose : int, default=0 The verbosity level. @@ -802,21 +801,21 @@ class SGDClassifier(BaseSGDClassifier): For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - learning_rate : str, optional + learning_rate : str, default='optimal' The learning rate schedule: 'constant': @@ -832,12 +831,12 @@ class SGDClassifier(BaseSGDClassifier): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. - eta0 : double + eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by the default schedule 'optimal'. - power_t : double + power_t : double, default=0.5 The exponent for inverse scaling learning rate [default 0.5]. early_stopping : bool, default=False @@ -861,7 +860,7 @@ class SGDClassifier(BaseSGDClassifier): .. versionadded:: 0.20 - class_weight : dict, {class_label: weight} or "balanced" or None, optional + class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes @@ -893,11 +892,11 @@ class SGDClassifier(BaseSGDClassifier): Attributes ---------- - coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\ - n_features) + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) Weights assigned to the features. - intercept_ : array, shape (1,) if n_classes == 2 else (n_classes,) + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) Constants in decision function. n_iter_ : int @@ -979,7 +978,7 @@ def predict_proba(self): Returns ------- - array, shape (n_samples, n_classes) + ndarray of shape (n_samples, n_classes) Returns the probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. 
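How the defaults documented above interact (``loss='hinge'``, ``penalty='l2'``, ``alpha=0.0001``, ``learning_rate='optimal'``) can be seen in a short, illustrative sketch on a toy dataset::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=200, random_state=0)

    # every keyword left at the defaults listed in the docstring above
    clf = SGDClassifier(random_state=0).fit(X, y)

    print(clf.coef_.shape)       # (1, 20) for this binary problem
    print(clf.intercept_.shape)  # (1,)
    print(clf.n_iter_)           # epochs actually run, at most max_iter=1000

    # predict_proba (documented just above) is only available for loss='log' or
    # loss='modified_huber', not for the default hinge loss.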
@@ -1140,7 +1139,7 @@ def partial_fit(self, X, y, sample_weight=None): y : numpy array of shape (n_samples,) Subset of target values - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples. If not provided, uniform weights are assumed. @@ -1198,16 +1197,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None, X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data - y : numpy array, shape (n_samples,) + y : ndarray of shape (n_samples,) Target values - coef_init : array, shape (n_features,) + coef_init : ndarray of shape (n_features,), default=None The initial coefficients to warm-start the optimization. - intercept_init : array, shape (1,) + intercept_init : ndarray of shape (1,), default=None The initial intercept to warm-start the optimization. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like, shape (n_samples,), default=None Weights applied to individual samples (1. for unweighted). Returns @@ -1229,7 +1228,7 @@ def _decision_function(self, X): Returns ------- - array, shape (n_samples,) + ndarray of shape (n_samples,) Predicted target values per element in X. """ check_is_fitted(self) @@ -1249,7 +1248,7 @@ def predict(self, X): Returns ------- - array, shape (n_samples,) + ndarray of shape (n_samples,) Predicted target values per element in X. """ return self._decision_function(X) @@ -1359,7 +1358,7 @@ class SGDRegressor(BaseSGDRegressor): Parameters ---------- - loss : str, default: 'squared_loss' + loss : str, default='squared_loss' The loss function to be used. The possible values are 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' @@ -1371,44 +1370,42 @@ class SGDRegressor(BaseSGDRegressor): 'squared_epsilon_insensitive' is the same but becomes squared loss past a tolerance of epsilon. - penalty : str, 'none', 'l2', 'l1', or 'elasticnet' + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. - alpha : float - Constant that multiplies the regularization term. Defaults to 0.0001 + alpha : float, default=0.0001 + Constant that multiplies the regularization term. Also used to compute learning_rate when set to 'optimal'. - l1_ratio : float + l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. - Defaults to 0.15. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. - max_iter : int, optional (default=1000) + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. .. versionadded:: 0.19 - tol : float or None, optional (default=1e-3) + tol : float, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. .. versionadded:: 0.19 - shuffle : bool, optional + shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - Defaults to True. 
- verbose : integer, default=0 + verbose : int, default=0 The verbosity level. epsilon : float, default=0.1 @@ -1419,14 +1416,14 @@ class SGDRegressor(BaseSGDRegressor): For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - learning_rate : string, optional + learning_rate : string, default='invscaling' The learning rate schedule: 'constant': @@ -1442,12 +1439,12 @@ class SGDRegressor(BaseSGDRegressor): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. - eta0 : double + eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. - power_t : double - The exponent for inverse scaling learning rate [default 0.25]. + power_t : double, default=0.25 + The exponent for inverse scaling learning rate. early_stopping : bool, default=False Whether to use early stopping to terminate training when validation @@ -1492,16 +1489,16 @@ class SGDRegressor(BaseSGDRegressor): Attributes ---------- - coef_ : array, shape (n_features,) + coef_ : ndarray of shape (n_features,) Weights assigned to the features. - intercept_ : array, shape (1,) + intercept_ : ndarray of shape (1,) The intercept term. - average_coef_ : array, shape (n_features,) + average_coef_ : ndarray of shape (n_features,) Averaged weights assigned to the features. - average_intercept_ : array, shape (1,) + average_intercept_ : ndarray of shape (1,) The averaged intercept term. n_iter_ : int From 5d3e79f465e771462fac839fe00064b2a2bd249e Mon Sep 17 00:00:00 2001 From: "@nkish" <19225359+ankishb@users.noreply.github.com> Date: Thu, 26 Dec 2019 15:18:06 +0000 Subject: [PATCH 67/85] DOC Improve defaults in neural network documentation (#15968) --- .../neural_network/_multilayer_perceptron.py | 136 +++++++++--------- sklearn/neural_network/_rbm.py | 46 +++--- .../neural_network/_stochastic_optimizers.py | 18 +-- 3 files changed, 100 insertions(+), 100 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index b6367d32e57a9..51af0e33139dd 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -140,13 +140,13 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, Parameters ---------- - packed_coef_inter : array-like + packed_coef_inter : ndarray A vector comprising the flattened coefficients and intercepts. - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. 
activations : list, length = n_layers - 1 @@ -185,10 +185,10 @@ def _backprop(self, X, y, activations, deltas, coef_grads, Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. activations : list, length = n_layers - 1 @@ -613,10 +613,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -632,10 +632,10 @@ def partial_fit(self): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) + y : ndarray of shape (n_samples,) The target values. Returns @@ -656,12 +656,12 @@ def _predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) @@ -698,11 +698,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -717,7 +717,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. @@ -733,15 +733,15 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter. - batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -759,55 +759,55 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Only used when ``solver='sgd'``. 
- learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : boolean, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -816,29 +816,29 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): except in a multilabel setting. Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. 
Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'. Maximum number of loss function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of loss function calls. @@ -849,7 +849,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Attributes ---------- - classes_ : array or list of array of shape (n_classes,) + classes_ : ndarray or list of ndarray of shape (n_classes,) Class labels for each output. loss_ : float @@ -959,12 +959,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples,) or (n_samples, n_classes) + y : ndarray, shape (n_samples,) or (n_samples, n_classes) The predicted classes. """ check_is_fitted(self) @@ -980,10 +980,10 @@ def fit(self, X, y): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) + X : ndarray or sparse matrix of shape (n_samples, n_features) The input data. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) + y : ndarray, shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). @@ -1041,12 +1041,12 @@ def predict_log_proba(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The input data. Returns ------- - log_y_prob : array-like, shape (n_samples, n_classes) + log_y_prob : ndarray of shape (n_samples, n_classes) The predicted log-probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. Equivalent to log(predict_proba(X)) @@ -1059,12 +1059,12 @@ def predict_proba(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y_prob : array-like, shape (n_samples, n_classes) + y_prob : ndarray of shape (n_samples, n_classes) The predicted probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ @@ -1090,11 +1090,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Parameters ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) The ith element represents the number of neurons in the ith hidden layer. - activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu' + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck, @@ -1109,7 +1109,7 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): - 'relu', the rectified linear unit function, returns f(x) = max(0, x) - solver : {'lbfgs', 'sgd', 'adam'}, default 'adam' + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. 
@@ -1125,15 +1125,15 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): For small datasets, however, 'lbfgs' can converge faster and perform better. - alpha : float, optional, default 0.0001 + alpha : float, default=0.0001 L2 penalty (regularization term) parameter. - batch_size : int, optional, default 'auto' + batch_size : int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)` - learning_rate : {'constant', 'invscaling', 'adaptive'}, default 'constant' + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by @@ -1151,55 +1151,55 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Only used when solver='sgd'. - learning_rate_init : double, optional, default 0.001 + learning_rate_init : double, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, optional, default 0.5 + power_t : double, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. - max_iter : int, optional, default 200 + max_iter : int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. - shuffle : bool, optional, default True + shuffle : bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, optional, default None + random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - tol : float, optional, default 1e-4 + tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to 'adaptive', convergence is considered to be reached and training stops. - verbose : bool, optional, default False + verbose : bool, default=False Whether to print progress messages to stdout. - warm_start : bool, optional, default False + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. - momentum : float, default 0.9 + momentum : float, default=0.9 Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'. - nesterovs_momentum : boolean, default True + nesterovs_momentum : boolean, default=True Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0. - early_stopping : bool, default False + early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. 
If set to true, it will automatically set aside 10% of training data as validation and terminate training when @@ -1207,29 +1207,29 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): ``n_iter_no_change`` consecutive epochs. Only effective when solver='sgd' or 'adam' - validation_fraction : float, optional, default 0.1 + validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). Only used when solver='adam' - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1). Only used when solver='adam' - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability in adam. Only used when solver='adam' - n_iter_no_change : int, optional, default 10 + n_iter_no_change : int, default=10 Maximum number of epochs to not meet ``tol`` improvement. Only effective when solver='sgd' or 'adam' .. versionadded:: 0.20 - max_fun : int, optional, default 15000 + max_fun : int, default=15000 Only used when solver='lbfgs'. Maximum number of function calls. The solver iterates until convergence (determined by 'tol'), number of iterations reaches max_iter, or this number of function calls. @@ -1321,12 +1321,12 @@ def predict(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The input data. Returns ------- - y : array-like, shape (n_samples, n_outputs) + y : ndarray of shape (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index efe3aeda951af..14960a8b2bb22 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -38,25 +38,25 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Parameters ---------- - n_components : int, optional + n_components : int, default=256 Number of binary hidden units. - learning_rate : float, optional + learning_rate : float, default=0.1 The learning rate for weight updates. It is *highly* recommended to tune this hyper-parameter. Reasonable values are in the 10**[0., -3.] range. - batch_size : int, optional + batch_size : int, default=10 Number of examples per minibatch. - n_iter : int, optional + n_iter : int, default=10 Number of iterations/sweeps over the training dataset to perform during training. - verbose : int, optional + verbose : int, default=0 The verbosity level. The default, zero, means silent mode. - random_state : integer or RandomState, optional + random_state : integer or RandomState, default=None A random number generator instance to define the state of the random permutations generator. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. @@ -113,12 +113,12 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The data to be transformed. Returns ------- - h : array, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Latent representations of the data. 
""" check_is_fitted(self) @@ -131,12 +131,12 @@ def _mean_hiddens(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Corresponding mean field values for the hidden layer. """ p = safe_sparse_dot(v, self.components_.T) @@ -148,7 +148,7 @@ def _sample_hiddens(self, v, rng): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to sample from. rng : RandomState @@ -156,7 +156,7 @@ def _sample_hiddens(self, v, rng): Returns ------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer. """ p = self._mean_hiddens(v) @@ -167,7 +167,7 @@ def _sample_visibles(self, h, rng): Parameters ---------- - h : array-like, shape (n_samples, n_components) + h : ndarray of shape (n_samples, n_components) Values of the hidden layer to sample from. rng : RandomState @@ -175,7 +175,7 @@ def _sample_visibles(self, h, rng): Returns ------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. """ p = np.dot(h, self.components_) @@ -188,12 +188,12 @@ def _free_energy(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer. Returns ------- - free_energy : array-like, shape (n_samples,) + free_energy : ndarray of shape (n_samples,) The value of the free energy. """ return (- safe_sparse_dot(v, self.intercept_visible_) @@ -205,12 +205,12 @@ def gibbs(self, v): Parameters ---------- - v : array-like, shape (n_samples, n_features) + v : ndarray of shape (n_samples, n_features) Values of the visible layer to start from. Returns ------- - v_new : array-like, shape (n_samples, n_features) + v_new : ndarray of shape (n_samples, n_features) Values of the visible layer after one Gibbs step. """ check_is_fitted(self) @@ -227,7 +227,7 @@ def partial_fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) Training data. Returns @@ -263,7 +263,7 @@ def _fit(self, v_pos, rng): Parameters ---------- - v_pos : array-like, shape (n_samples, n_features) + v_pos : ndarray of shape (n_samples, n_features) The data to use for training. rng : RandomState @@ -290,12 +290,12 @@ def score_samples(self, X): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Values of the visible layer. Must be all-boolean (not checked). Returns ------- - pseudo_likelihood : array-like, shape (n_samples,) + pseudo_likelihood : ndarray of shape (n_samples,) Value of the pseudo-likelihood (proxy for likelihood). Notes @@ -328,7 +328,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. 
Returns diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 3e49e94de8bd1..02fc53a7aecc2 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -16,7 +16,7 @@ class BaseOptimizer: The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights @@ -80,11 +80,11 @@ class SGDOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - lr_schedule : {'constant', 'adaptive', 'invscaling'}, default 'constant' + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' Learning rate schedule for weight updates. -'constant', is a constant learning rate given by @@ -100,10 +100,10 @@ class SGDOptimizer(BaseOptimizer): tol, or fail to increase validation score by tol if 'early_stopping' is on, the current learning rate is divided by 5. - momentum : float, optional, default 0.9 + momentum : float, default=0.9 Value of momentum used, must be larger than or equal to 0 - nesterov : bool, optional, default True + nesterov : bool, default=True Whether to use nesterov's momentum or not. Use nesterov's if True Attributes @@ -192,19 +192,19 @@ class AdamOptimizer(BaseOptimizer): The concatenated list containing coefs_ and intercepts_ in MLP model. Used for initializing velocities and updating params - learning_rate_init : float, optional, default 0.1 + learning_rate_init : float, default=0.1 The initial learning rate used. It controls the step-size in updating the weights - beta_1 : float, optional, default 0.9 + beta_1 : float, default=0.9 Exponential decay rate for estimates of first moment vector, should be in [0, 1) - beta_2 : float, optional, default 0.999 + beta_2 : float, default=0.999 Exponential decay rate for estimates of second moment vector, should be in [0, 1) - epsilon : float, optional, default 1e-8 + epsilon : float, default=1e-8 Value for numerical stability Attributes From 1b866a48a78edce835a465fbda9360dc5c197090 Mon Sep 17 00:00:00 2001 From: Niklas Date: Fri, 27 Dec 2019 04:31:09 -0500 Subject: [PATCH 68/85] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15868) --- doc/whats_new/v0.22.rst | 8 ++++ sklearn/semi_supervised/_label_propagation.py | 3 +- .../tests/test_label_propagation.py | 39 +++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 370f02fda49f9..487b3254f13a9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -53,6 +53,14 @@ Changelog - |Fix| :func:`metrics.classification_report` does no longer ignore the value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. + +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.LabelPropagation` and + :class:`semi_supervised.LabelSpreading` now allow callable kernel function to + return sparse weight matrix. + :pr:`15868` by :user:`Niklas Smedemark-Margulies `. :mod:`sklearn.utils` .................... 
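The change below swaps ``np.dot`` for ``safe_sparse_dot`` when combining the kernel weights with the stored label distributions, so a callable kernel may return a SciPy sparse weight matrix. A small, illustrative sketch of why that matters::

    import numpy as np
    from scipy import sparse
    from sklearn.utils.extmath import safe_sparse_dot

    W = sparse.random(5, 3, density=0.5, format='csr', random_state=0)  # sparse kernel weights
    label_dist = np.ones((3, 2))                                        # dense label distributions

    # works for dense or sparse W and returns the expected (5, 2) product
    print(safe_sparse_dot(W, label_dist).shape)

    # np.dot(W, label_dist) does not dispatch on sparse matrices and would not
    # produce the intended matrix product, which is what this fix addresses.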
diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 0ec687aae7d20..665b50dcfa507 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -195,7 +195,8 @@ class labels for weight_matrix in weight_matrices]) else: weight_matrices = weight_matrices.T - probabilities = np.dot(weight_matrices, self.label_distributions_) + probabilities = safe_sparse_dot( + weight_matrices, self.label_distributions_) normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T probabilities /= normalizer return probabilities diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 7e20350b20b2f..d983ab854948b 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -3,10 +3,13 @@ import numpy as np import pytest +from scipy.sparse import issparse from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification from sklearn.exceptions import ConvergenceWarning from numpy.testing import assert_array_almost_equal @@ -152,3 +155,39 @@ def test_convergence_warning(): mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric='euclidean', n_jobs=-1) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma + np.exp(W.data, out=W.data) + assert issparse(W) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=n_test, + random_state=0) + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 + + model = label_propagation.LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 From e7b03feab5d22181d666ab4020e51c908c972b68 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 27 Dec 2019 05:46:27 -0500 Subject: [PATCH 69/85] BUG Adds attributes back to check_is_fitted (#15947) Co-authored-by: Olivier Grisel --- doc/whats_new/v0.22.rst | 8 +++ sklearn/feature_extraction/tests/test_text.py | 1 + sklearn/feature_extraction/text.py | 8 ++- sklearn/utils/tests/test_validation.py | 49 ++++++++++++++++++- sklearn/utils/validation.py | 33 ++++++++----- 5 files changed, 82 insertions(+), 17 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 487b3254f13a9..a362f839bcca7 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -68,6 +68,14 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. 
+- |Fix| :func:`utils.check_is_fitted` accepts back an explicit ``attributes`` + argument to check for specific attributes as explicit markers of a fitted + estimator. When no explicit ``attributes`` are provided, only the attributes + ending with a single "_" are used as "fitted" markers. The ``all_or_any`` + argument is also no longer deprecated. This change is made to + restore some backward compatibility with the behavior of this utility in + version 0.21. :pr:`15947` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 25c353aae5276..f8f741a862594 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1099,6 +1099,7 @@ def test_vectorizer_string_object_as_input(Vectorizer): assert_raise_message( ValueError, message, vec.fit_transform, "hello world!") assert_raise_message(ValueError, message, vec.fit, "hello world!") + vec.fit(["some text", "some other text"]) assert_raise_message(ValueError, message, vec.transform, "hello world!") diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index b5ca1302e5a03..8d0cef5625173 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1498,7 +1498,11 @@ def transform(self, X, copy=True): X.data += 1 if self.use_idf: - check_is_fitted(self, msg='idf vector is not fitted') + # idf_ being a property, the automatic attributes detection + # does not work as usual and we need to specify the attribute + # name: + check_is_fitted(self, attributes=["idf_"], + msg='idf vector is not fitted') expected_n_features = self._idf_diag.shape[0] if n_features != expected_n_features: @@ -1883,7 +1887,7 @@ def transform(self, raw_documents, copy="deprecated"): X : sparse matrix, [n_samples, n_features] Tf-idf-weighted document-term matrix. 
""" - check_is_fitted(self, msg='The tfidf vector is not fitted') + check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted') # FIXME Remove copy parameter support in 0.24 if copy != "deprecated": diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index bdd31f9c4859f..60a1fb8cded97 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -5,6 +5,7 @@ from tempfile import NamedTemporaryFile from itertools import product +from operator import itemgetter import pytest from pytest import importorskip @@ -14,7 +15,6 @@ from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_no_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import SkipTest @@ -50,7 +50,6 @@ import sklearn from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning -from sklearn.exceptions import DataConversionWarning from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import TempMemmap @@ -736,6 +735,52 @@ def test_check_is_fitted(): check_is_fitted, ard, all_or_any=any) +def test_check_is_fitted_attributes(): + class MyEstimator(): + def fit(self, X, y): + return self + + msg = "not fitted" + est = MyEstimator() + + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"]) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + est.a_ = "a" + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"]) + with pytest.raises(NotFittedError, match=msg): + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + est.b_ = "b" + check_is_fitted(est, attributes=["a_", "b_"]) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) + + +@pytest.mark.parametrize("wrap", + [itemgetter(0), list, tuple], + ids=["single", "list", "tuple"]) +def test_check_is_fitted_with_attributes(wrap): + ard = ARDRegression() + with pytest.raises(NotFittedError, match="is not fitted yet"): + check_is_fitted(ard, wrap(["coef_"])) + + ard.fit(*make_blobs()) + + # Does not raise + check_is_fitted(ard, wrap(["coef_"])) + + # Raises when using attribute that is not defined + with pytest.raises(NotFittedError, match="is not fitted yet"): + check_is_fitted(ard, wrap(["coef_bad_"])) + + def test_check_consistent_length(): check_consistent_length([1], [2], [3], [4], [5]) check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b']) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index fb34f3b3cccbd..b99c1ee35e1c4 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -890,23 +890,28 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes='deprecated', msg=None, - all_or_any='deprecated'): +def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. 
Checks if the estimator is fitted by verifying the presence of fitted attributes (ending with a trailing underscore) and otherwise raises a NotFittedError with the given message. + This utility is meant to be used internally by estimators themselves, + typically in their own predict / transform methods. + Parameters ---------- estimator : estimator instance. estimator instance for which the check is performed. - attributes : deprecated, ignored - .. deprecated:: 0.22 - `attributes` is deprecated, is currently ignored and will be removed - in 0.23. + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. msg : string The default error message is, "This %(name)s instance is not fitted @@ -918,10 +923,8 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None, Eg. : "Estimator, %(name)s, must be fitted before sparsifying". - all_or_any : deprecated, ignored - .. deprecated:: 0.21 - `all_or_any` is deprecated, is currently ignored and will be removed - in 0.23. + all_or_any : callable, {all, any}, default all + Specify whether all or any of the given attributes must exist. Returns ------- @@ -949,9 +952,13 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None, if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) - attrs = [v for v in vars(estimator) - if (v.endswith("_") or v.startswith("_")) - and not v.startswith("__")] + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + attrs = all_or_any([hasattr(estimator, attr) for attr in attributes]) + else: + attrs = [v for v in vars(estimator) + if v.endswith("_") and not v.startswith("__")] if not attrs: raise NotFittedError(msg % {'name': type(estimator).__name__}) From 8201ad79195a04273c5e74b4094839643c8df492 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Fri, 27 Dec 2019 18:57:16 +0800 Subject: [PATCH 70/85] DOC update check_is_fitted what's new --- doc/whats_new/v0.22.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index a362f839bcca7..7ebe82a39b884 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -71,10 +71,10 @@ Changelog - |Fix| :func:`utils.check_is_fitted` accepts back an explicit ``attributes`` argument to check for specific attributes as explicit markers of a fitted estimator. When no explicit ``attributes`` are provided, only the attributes - ending with a single "_" are used as "fitted" markers. The ``all_or_any`` - argument is also no longer deprecated. This change is made to - restore some backward compatibility with the behavior of this utility in - version 0.21. :pr:`15947` by `Thomas Fan`_. + that end with a underscore and do not start with double underscore are used + as "fitted" markers. The ``all_or_any`` argument is also no longer + deprecated. This change is made to restore some backward compatibility with + the behavior of this utility in version 0.21. :pr:`15947` by `Thomas Fan`_. .. _changes_0_22: From 2df830ddfd2068960cbc61a21a7be3980f2680d8 Mon Sep 17 00:00:00 2001 From: Windber Date: Mon, 30 Dec 2019 02:16:48 +0800 Subject: [PATCH 71/85] DOC change python-devel to python3-devel for yum. 
(#15986) --- doc/developers/advanced_installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index d52dfba48c0e1..8fd0f9ecf0273 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -339,7 +339,7 @@ architecture (e.g. ARM), you can install the system versions:: On Red Hat and clones (e.g. CentOS), install the dependencies using:: - sudo yum -y install gcc gcc-c++ python-devel numpy scipy + sudo yum -y install gcc gcc-c++ python3-devel numpy scipy Linux compilers from conda-forge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 9f8f89560b9a78866de02f3fe7990b13ff588c82 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Sun, 29 Dec 2019 15:11:35 -0600 Subject: [PATCH 72/85] DOC Correct the default value of values_format in plot_confusion_matrix (#15981) --- sklearn/metrics/_plot/confusion_matrix.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index f759c4d5c1c3d..11a456aa635b1 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -61,8 +61,7 @@ def plot(self, include_values=True, cmap='viridis', values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2f' for a normalized matrix, and - 'd' for a unnormalized matrix. + the format specification is '.2g'. ax : matplotlib axes, default=None Axes object to plot on. If `None`, a new figure and axes is @@ -165,8 +164,7 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, values_format : str, default=None Format specification for values in confusion matrix. If `None`, - the format specification is '.2f' for a normalized matrix, and - 'd' for a unnormalized matrix. + the format specification is '.2g'. cmap : str or matplotlib Colormap, default='viridis' Colormap recognized by matplotlib. From 784b91bd00a3a28c486627681db29071c43e5b15 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 29 Dec 2019 18:18:40 -0500 Subject: [PATCH 73/85] [MRG] MNT Updates pypy to use 7.2.0 (#15954) --- .circleci/config.yml | 2 +- build_tools/circle/build_test_pypy.sh | 13 ++++++++----- conftest.py | 2 +- sklearn/exceptions.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3933d5404202f..9fecc150ba297 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -101,7 +101,7 @@ jobs: pypy3: docker: - - image: pypy:3.6-7.1.1 + - image: pypy:3.6-7.2.0 steps: - restore_cache: keys: diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh index 60b81e60709f0..22e4790e7e4ab 100755 --- a/build_tools/circle/build_test_pypy.sh +++ b/build_tools/circle/build_test_pypy.sh @@ -18,11 +18,14 @@ source pypy-env/bin/activate python --version which python -# XXX: numpy version pinning can be reverted once PyPy -# compatibility is resolved for numpy v1.6.x. 
For instance, -# when PyPy3 >6.0 is released (see numpy/numpy#12740) -pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy Cython pytest -pip install scipy sphinx numpydoc docutils joblib pillow +pip install -U pip + +# pins versions to install wheel from https://antocuni.github.io/pypy-wheels/manylinux2010 +pip install --extra-index-url https://antocuni.github.io/pypy-wheels/manylinux2010 numpy==1.18.0 scipy==1.3.2 + +# Install Cython directly +pip install https://antocuni.github.io/pypy-wheels/ubuntu/Cython/Cython-0.29.14-py3-none-any.whl +pip install sphinx numpydoc docutils joblib pillow pytest ccache -M 512M export CCACHE_COMPRESS=1 diff --git a/conftest.py b/conftest.py index f640c0e3d001f..b98bb4b271aca 100644 --- a/conftest.py +++ b/conftest.py @@ -37,7 +37,7 @@ def pytest_collection_modifyitems(config, items): skip_marker = pytest.mark.skip( reason='FeatureHasher is not compatible with PyPy') for item in items: - if item.name.endswith(('hashing.FeatureHasher', + if item.name.endswith(('_hash.FeatureHasher', 'text.HashingVectorizer')): item.add_marker(skip_marker) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index c3da8e8d7986f..51dd92bbf3c56 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -139,7 +139,7 @@ class FitFailedWarning(RuntimeWarning): ... print(repr(w[-1].message)) FitFailedWarning('Estimator fit failed. The score on this train-test partition for these parameters will be set to 0.000000. - Details: \\nValueError: Penalty term must be positive; got (C=-2)\\n'...) + Details:...ValueError: Penalty term must be positive; got (C=-2)... .. versionchanged:: 0.18 Moved from sklearn.cross_validation. From cbe29e0035b0ac8f847e0665401ff9c5b597add8 Mon Sep 17 00:00:00 2001 From: Stephen Blystone <29995339+blynotes@users.noreply.github.com> Date: Mon, 30 Dec 2019 21:40:32 -0600 Subject: [PATCH 74/85] FIX Add missing 'values_format' param to disp.plot() in plot_confusion_matrix (#15937) --- doc/whats_new/v0.22.rst | 4 ++++ sklearn/metrics/_plot/confusion_matrix.py | 3 ++- .../_plot/tests/test_plot_confusion_matrix.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7ebe82a39b884..74b20810de6db 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -54,6 +54,10 @@ Changelog value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. +- |Fix| Fixed a bug in :func:`metrics.plot_confusion_matrix` to correctly + pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` + plot() call. :pr:`15937` by :user:`Stephen Blystone `. + :mod:`sklearn.semi_supervised` .............................. 
diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 11a456aa635b1..537d2b9f0d838 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -195,4 +195,5 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels) return disp.plot(include_values=include_values, - cmap=cmap, ax=ax, xticks_rotation=xticks_rotation) + cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, + values_format=values_format) diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index 91ff411ab4f13..23a7d5a638f25 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -246,3 +246,22 @@ def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes): assert_allclose(disp.confusion_matrix, cm) assert disp.text_.shape == (n_classes, n_classes) + + +@pytest.mark.parametrize("values_format", ['e', 'n']) +def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, + fitted_clf, values_format): + # Make sure plot text is formatted with 'values_format'. + X, y = data + cm = confusion_matrix(y, y_pred) + disp = plot_confusion_matrix(fitted_clf, X, y, + include_values=True, + values_format=values_format) + + assert disp.text_.shape == (n_classes, n_classes) + + expected_text = np.array([format(v, values_format) + for v in cm.ravel()]) + text_text = np.array([ + t.get_text() for t in disp.text_.ravel()]) + assert_array_equal(expected_text, text_text) From 30fcc8d015ba554fc22551d11da5d8f99727d4bd Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 31 Dec 2019 07:02:45 -0500 Subject: [PATCH 75/85] FIX support scalar values in fit_params in SearchCV (#15863) * support a scalar fit param * pep8 * TST add test for desired behavior * FIX introduce _check_fit_params to validate parameters * DOC update whats new * TST tests both grid-search and randomize-search * PEP8 * DOC revert unecessary change * TST add test for _check_fit_params * olivier comments * TST fixes * DOC whats new * DOC whats new * TST revert type of error * add olivier suggestions * address olivier comments * address thomas comments * PEP8 * comments olivier * TST fix test by passing X * avoid to call twice tocsr * add case column/row sparse in check_fit_param * provide optional indices * TST check content when indexing params * PEP8 * TST update tests to check identity * stupid fix * use a distribution in RandomizedSearchCV * MNT add lightgbm to one of the CI build * move to another build * do not install dependencies lightgbm * MNT comments on the CI setup * address some comments * Test fit_params compat without dependency on lightgbm Co-authored-by: Guillaume Lemaitre Co-authored-by: Olivier Grisel --- azure-pipelines.yml | 3 +- build_tools/azure/install.sh | 2 + doc/whats_new/v0.22.rst | 9 ++ sklearn/model_selection/_search.py | 6 +- sklearn/model_selection/_validation.py | 17 +-- sklearn/model_selection/tests/test_search.py | 110 +++++++++++++++---- sklearn/utils/tests/test_validation.py | 30 +++++ sklearn/utils/validation.py | 69 ++++++++++-- 8 files changed, 195 insertions(+), 51 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e2ff71802ce72..b029a2fd18574 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -71,10 +71,9 @@ jobs: JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # 
Linux environment to test the latest available dependencies and MKL. - # It runs tests requiring pandas and PyAMG. + # It runs tests requiring lightgbm, pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # FIXME: pinned until SciPy wheels are available for Python 3.8 PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index aa2707bb03837..1ef981b5dd6e8 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -92,6 +92,8 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist python -m pip install pandas matplotlib pyamg + # do not install dependencies for lightgbm since it requires scikit-learn + python -m pip install lightgbm --no-deps fi if [[ "$COVERAGE" == "true" ]]; then diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 74b20810de6db..40f4ef92143d9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -66,6 +66,15 @@ Changelog return sparse weight matrix. :pr:`15868` by :user:`Niklas Smedemark-Margulies `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` accept scalar values provided in + `fit_params`. Change in 0.22 was breaking backward compatibility. + :pr:`15863` by :user:`Adrin Jalali ` and + :user:`Guillaume Lemaitre `. + :mod:`sklearn.utils` .................... diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e6a8493ef6250..a70bdd7a2f9dc 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -33,7 +33,7 @@ from ..utils import check_random_state from ..utils.fixes import MaskedArray from ..utils.random import sample_without_replacement -from ..utils.validation import indexable, check_is_fitted +from ..utils.validation import indexable, check_is_fitted, _check_fit_params from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -648,9 +648,7 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = 'score' X, y, groups = indexable(X, y, groups) - # make sure fit_params are sliceable - fit_params_values = indexable(*fit_params.values()) - fit_params = dict(zip(fit_params.keys(), fit_params_values)) + fit_params = _check_fit_params(X, fit_params) n_splits = cv.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 88eb5d49c4d0f..2ac3ffa1d5913 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -23,6 +23,7 @@ from ..base import is_classifier, clone from ..utils import (indexable, check_random_state, _safe_indexing, _message_with_time) +from ..utils.validation import _check_fit_params from ..utils.validation import _is_arraylike, _num_samples from ..utils.metaestimators import _safe_split from ..metrics import check_scoring @@ -489,8 +490,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, fit_params, train) train_scores = {} if parameters is not None: @@ -830,8 
+830,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, fit_params, train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -937,16 +936,6 @@ def _check_is_permutation(indices, n_samples): return True -def _index_param_value(X, v, indices): - """Private helper function for parameter value indexing.""" - if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): - # pass through: skip indexing - return v - if sp.issparse(v): - v = v.tocsr() - return _safe_indexing(v, indices) - - def permutation_test_score(estimator, X, y, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index fc6183f3a1f0b..032644dfd90d7 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -27,7 +27,7 @@ from scipy.stats import bernoulli, expon, uniform -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification @@ -36,6 +36,7 @@ from sklearn.model_selection import fit_grid_point from sklearn.model_selection import cross_val_score +from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedShuffleSplit @@ -218,33 +219,25 @@ def test_grid_search_pipeline_steps(): assert not hasattr(param_grid['regressor'][1], 'coef_') -def check_hyperparameter_searcher_with_fit_params(klass, **klass_kwargs): +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): X = np.arange(100).reshape(10, 10) y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(expected_fit_params=['spam', 'eggs']) - searcher = klass(clf, {'foo_param': [1, 2, 3]}, cv=2, **klass_kwargs) + searcher = SearchCV( + clf, {'foo_param': [1, 2, 3]}, cv=2, error_score="raise" + ) # The CheckingClassifier generates an assertion error if # a parameter is missing or has length != len(X). - assert_raise_message(AssertionError, - "Expected fit parameter(s) ['eggs'] not seen.", - searcher.fit, X, y, spam=np.ones(10)) - assert_raise_message( - ValueError, - "Found input variables with inconsistent numbers of samples: [", - searcher.fit, X, y, spam=np.ones(1), - eggs=np.zeros(10)) - searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) - -def test_grid_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(GridSearchCV, - error_score='raise') - - -def test_random_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(RandomizedSearchCV, n_iter=1, - error_score='raise') + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) @ignore_warnings @@ -1847,3 +1840,78 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): attr_message = "GridSearchCV not identical with precomputed metric" assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'a': [0.1, 0.01]}), + (RandomizedSearchCV, {'a': uniform(1, 3)})] +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(BaseEstimator, ClassifierMixin): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'alpha': [0.1, 0.01]}), + (RandomizedSearchCV, {'alpha': uniform(0.01, 0.1)})] +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + + def fit(self, X, y, sample_weight=None, tuple_of_arrays=None, + scalar_param=None, callable_param=None): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV( + _FitParamClassifier(), param_search + ) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. 
+ fit_params = { + 'tuple_of_arrays': (X_valid, y_valid), + 'callable_param': _fit_param_callable, + 'scalar_param': 42, + } + model.fit(X_train, y_train, **fit_params) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 60a1fb8cded97..3bb65f2b94de1 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -32,6 +32,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.datasets import make_blobs +from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, check_is_fitted, @@ -46,6 +47,7 @@ _check_sample_weight, _allclose_dense_sparse, FLOAT_DTYPES) +from sklearn.utils.validation import _check_fit_params import sklearn @@ -1194,3 +1196,31 @@ def __init__(self, a=1, b=1, *, c=1, d=1): with pytest.warns(FutureWarning, match=r"Pass c=3, d=4 as keyword args"): A2(1, 2, 3, 4) + + +@pytest.mark.parametrize("indices", [None, [1, 3]]) +def test_check_fit_params(indices): + X = np.random.randn(4, 2) + fit_params = { + 'list': [1, 2, 3, 4], + 'array': np.array([1, 2, 3, 4]), + 'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T, + 'sparse-row': sp.csc_matrix([1, 2, 3, 4]), + 'scalar-int': 1, + 'scalar-str': 'xxx', + 'None': None, + } + result = _check_fit_params(X, fit_params, indices) + indices_ = indices if indices is not None else list(range(X.shape[0])) + + for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']: + assert result[key] is fit_params[key] + + assert result['list'] == _safe_indexing(fit_params['list'], indices_) + assert_array_equal( + result['array'], _safe_indexing(fit_params['array'], indices_) + ) + assert_allclose_dense_sparse( + result['sparse-col'], + _safe_indexing(fit_params['sparse-col'], indices_) + ) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b99c1ee35e1c4..4506128a39ae1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -212,6 +212,26 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) +def _make_indexable(iterable): + """Ensure iterable supports indexing or convert to an indexable variant. + + Convert sparse matrices to csr and other non-indexable iterable to arrays. + Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged. + + Parameters + ---------- + iterable : {list, dataframe, array, sparse} or None + Object to be converted to an indexable iterable. + """ + if sp.issparse(iterable): + return iterable.tocsr() + elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"): + return iterable + elif iterable is None: + return iterable + return np.array(iterable) + + def indexable(*iterables): """Make arrays indexable for cross-validation. @@ -224,16 +244,7 @@ def indexable(*iterables): *iterables : lists, dataframes, arrays, sparse matrices List of objects to ensure sliceability. """ - result = [] - for X in iterables: - if sp.issparse(X): - result.append(X.tocsr()) - elif hasattr(X, "__getitem__") or hasattr(X, "iloc"): - result.append(X) - elif X is None: - result.append(X) - else: - result.append(np.array(X)) + result = [_make_indexable(X) for X in iterables] check_consistent_length(*result) return result @@ -1323,3 +1334,41 @@ def inner_f(*args, **kwargs): kwargs.update({k: arg for k, arg in zip(all_args, args)}) return f(**kwargs) return inner_f + + +def _check_fit_params(X, fit_params, indices=None): + """Check and validate the parameters passed during `fit`. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data array. + + fit_params : dict + Dictionary containing the parameters passed at fit. + + indices : array-like of shape (n_samples,), default=None + Indices to be selected if the parameter has the same size as `X`. + + Returns + ------- + fit_params_validated : dict + Validated parameters. We ensure that the values support indexing. + """ + from . import _safe_indexing + fit_params_validated = {} + for param_key, param_value in fit_params.items(): + if (not _is_arraylike(param_value) or + _num_samples(param_value) != _num_samples(X)): + # Non-indexable pass-through (for now for backward-compatibility). + # https://github.com/scikit-learn/scikit-learn/issues/15805 + fit_params_validated[param_key] = param_value + else: + # Any other fit_params should support indexing + # (e.g. for cross-validation). + fit_params_validated[param_key] = _make_indexable(param_value) + fit_params_validated[param_key] = _safe_indexing( + fit_params_validated[param_key], indices + ) + + return fit_params_validated From b6d43a70bae78e2c6e274f6635e8465a34d58273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Tue, 31 Dec 2019 06:19:15 -0800 Subject: [PATCH 76/85] Remove abstractmethod that silently brake downstream packages (#15996) --- doc/whats_new/v0.22.rst | 7 +++++++ sklearn/naive_bayes.py | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 40f4ef92143d9..e36d7e925529d 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -58,6 +58,13 @@ Changelog pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` plot() call. :pr:`15937` by :user:`Stephen Blystone `. +:mod:`sklearn.naive_bayes` +.......................... + +- |Fix| removed abstract method `_check_X` from :class:`naive_bayes.BaseNB` + that could break downstream projects inheriting from this deprecated + public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index cebc428e17b12..d958645b178f6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -51,12 +51,6 @@ def _joint_log_likelihood(self, X): predict_proba and predict_log_proba. """ - @abstractmethod - def _check_X(self, X): - """Validate input X - """ - pass - def predict(self, X): """ Perform classification on an array of test vectors X. From 0cb7c09800f726ffc622b2e1c065e089d3eb3d32 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 1 Jan 2020 04:11:17 +0100 Subject: [PATCH 77/85] FIX restore BaseNB._check_X without abstractmethod decoration (#15997) --- doc/whats_new/v0.22.rst | 7 ++++--- sklearn/naive_bayes.py | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e36d7e925529d..394fd6ee8203c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -61,9 +61,10 @@ Changelog :mod:`sklearn.naive_bayes` .......................... -- |Fix| removed abstract method `_check_X` from :class:`naive_bayes.BaseNB` - that could break downstream projects inheriting from this deprecated - public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. +- |Fix| Removed `abstractmethod` decorator for the method `_check_X` in + :class:`naive_bayes.BaseNB` that could break downstream projects inheriting + from this deprecated public base class. 
:pr:`15996` by + :user:`Brigitta Sipőcz `. :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d958645b178f6..22bd339cbd6b0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -51,6 +51,14 @@ def _joint_log_likelihood(self, X): predict_proba and predict_log_proba. """ + def _check_X(self, X): + """To be overridden in subclasses with the actual checks.""" + # Note that this is not marked @abstractmethod as long as the + # deprecated public alias sklearn.naive_bayes.BayesNB exists + # (until 0.24) to preserve backward compat for 3rd party projects + # with existing derived classes. + return X + def predict(self, X): """ Perform classification on an array of test vectors X. From f9c9ccc311018dfc1d8b48e598971886fb1fc55b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 10:25:39 +0100 Subject: [PATCH 78/85] Update v0.22 changelog for 0.22.1 (#16002) - set the date - move entry for quantile transformer to the 0.22.1 section - fix alphabetical ordering of modules --- doc/whats_new/v0.22.rst | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 394fd6ee8203c..d5d7fddf7417d 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -7,7 +7,7 @@ Version 0.22.1 ============== -**In Development** +**January 2 2020** This is a bug-fix release to primarily resolve some packaging issues in version 0.22.0. It also includes minor documentation improvements and some bug fixes. @@ -53,11 +53,20 @@ Changelog - |Fix| :func:`metrics.classification_report` does no longer ignore the value of the ``zero_division`` keyword argument. :pr:`15879` by :user:`Bibhash Chandra Mitra `. - + - |Fix| Fixed a bug in :func:`metrics.plot_confusion_matrix` to correctly pass the `values_format` parameter to the :class:`ConfusionMatrixDisplay` plot() call. :pr:`15937` by :user:`Stephen Blystone `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` accept scalar values provided in + `fit_params`. Change in 0.22 was breaking backward compatibility. + :pr:`15863` by :user:`Adrin Jalali ` and + :user:`Guillaume Lemaitre `. + :mod:`sklearn.naive_bayes` .......................... @@ -66,6 +75,13 @@ Changelog from this deprecated public base class. :pr:`15996` by :user:`Brigitta Sipőcz `. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.semi_supervised` .............................. @@ -74,15 +90,6 @@ Changelog return sparse weight matrix. :pr:`15868` by :user:`Niklas Smedemark-Margulies `. -:mod:`sklearn.model_selection` -.............................. - -- |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` accept scalar values provided in - `fit_params`. Change in 0.22 was breaking backward compatibility. - :pr:`15863` by :user:`Adrin Jalali ` and - :user:`Guillaume Lemaitre `. - :mod:`sklearn.utils` .................... @@ -859,10 +866,6 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. 
-- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the - `quantiles_` attribute to be completely sorted in non-decreasing manner. - :pr:`15751` by :user:`Tirth Patel `. - :mod:`sklearn.model_selection` .............................. From faaed7c2b9d5b62ecf7f4b26024a65035877a120 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 2 Jan 2020 04:39:41 -0500 Subject: [PATCH 79/85] STY Removes hidden scroll bar (#15999) --- doc/themes/scikit-learn-modern/static/css/theme.css | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 782800eb31915..a77fb03e36f65 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -518,6 +518,16 @@ div.sk-sidebar-toc-wrapper { overflow-y: scroll; height: 100vh; padding-right: 1.75rem; + + /* Hide scrollbar for IE and Edge */ + -ms-overflow-style: none; + + /* Hide scrollbar for Firefox */ + scrollbar-width: none; +} + +div.sk-sidebar-toc-wrapper::-webkit-scrollbar { + display: none; } div.sk-sidebar-toc-wrapper::after { From 55b77e97ba4dbd5807ff57339ec4789ba7c11f4a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 11:01:42 +0100 Subject: [PATCH 80/85] Flake8 fixes --- sklearn/cluster/_agglomerative.py | 1 + sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index e4fca1c0a572c..2c977c845e832 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -21,6 +21,7 @@ from . import _hierarchical_fast as _hierarchical from ._feature_agglomeration import AgglomerationTransform +from ..utils import deprecated from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index 23a7d5a638f25..0adff08e98b56 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -227,8 +227,6 @@ def test_confusion_matrix_contrast(pyplot): assert_allclose(disp.text_[1, 1].get_color(), min_color) - - @pytest.mark.parametrize( "clf", [LogisticRegression(), make_pipeline(StandardScaler(), LogisticRegression()), From 7e22b9ee9f3b520e9540bf767fddcaf681f27b63 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 11:17:01 +0100 Subject: [PATCH 81/85] Fix: remove left-over lines that should have been deleted during conflict resolution when rebasing --- sklearn/utils/validation.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4506128a39ae1..929ab7e0f5f78 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -946,14 +946,6 @@ def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): NotFittedError If the attributes are not found. """ - if attributes != 'deprecated': - warnings.warn("Passing attributes to check_is_fitted is deprecated" - " and will be removed in 0.23. The attributes " - "argument is ignored.", FutureWarning) - if all_or_any != 'deprecated': - warnings.warn("Passing all_or_any to check_is_fitted is deprecated" - " and will be removed in 0.23. 
The any_or_all " - "argument is ignored.", FutureWarning) if isclass(estimator): raise TypeError("{} is a class, not an instance.".format(estimator)) if msg is None: From 3f0a9654cb9bb5c91d7ff1cbcfbb586be255cd44 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 11:44:24 +0100 Subject: [PATCH 82/85] Fix missing imports --- sklearn/metrics/tests/test_classification.py | 1 + sklearn/utils/tests/test_validation.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 46da9db6e68ed..e0204649b00a2 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -20,6 +20,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 3bb65f2b94de1..a6fc29c14a3bb 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -16,6 +16,7 @@ from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns +from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import SkipTest from sklearn.utils._testing import assert_array_equal @@ -30,6 +31,7 @@ from sklearn.linear_model import ARDRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestRegressor +from sklearn.exceptions import DataConversionWarning from sklearn.svm import SVR from sklearn.datasets import make_blobs from sklearn.utils import _safe_indexing From 20cd7d886a6ef7a54277e87e296a97ad73fab127 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 11:45:16 +0100 Subject: [PATCH 83/85] Update version --- sklearn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index cbea669c69269..7a80019bad2f0 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -40,7 +40,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.22' +__version__ = '0.22.1' # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 0d98c7745efc5d69f4a1f0f745ed625d91e1779c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 12:03:24 +0100 Subject: [PATCH 84/85] Fix test_check_is_fitted --- sklearn/utils/tests/test_validation.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index a6fc29c14a3bb..59c87f9bc451f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -728,16 +728,6 @@ def test_check_is_fitted(): assert check_is_fitted(ard) is None assert check_is_fitted(svr) is None - # to be removed in 0.23 - assert_warns_message( - FutureWarning, - "Passing attributes to check_is_fitted is deprecated", - check_is_fitted, ard, ['coef_']) - assert_warns_message( - FutureWarning, - "Passing all_or_any to check_is_fitted is deprecated", - check_is_fitted, ard, all_or_any=any) - def test_check_is_fitted_attributes(): class MyEstimator(): From 26f6459f7aefa46e41784934eb3293f8fa3f57c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 2 Jan 2020 14:13:27 +0100 Subject: [PATCH 85/85] Make test_sag_regressor_computed_correctly deterministic (#16003) Fix #15818. --- sklearn/linear_model/tests/test_sag.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 6a591288b55d8..6bb156c64715b 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -120,7 +120,7 @@ def sag(X, y, step_size, alpha, n_iter=1, dloss=None, sparse=False, def sag_sparse(X, y, step_size, alpha, n_iter=1, dloss=None, sample_weight=None, sparse=False, - fit_intercept=True, saga=False): + fit_intercept=True, saga=False, random_state=0): if step_size * alpha == 1.: raise ZeroDivisionError("Sparse sag does not handle the case " "step_size * alpha == 1") @@ -130,7 +130,7 @@ def sag_sparse(X, y, step_size, alpha, n_iter=1, sum_gradient = np.zeros(n_features) last_updated = np.zeros(n_features, dtype=np.int) gradient_memory = np.zeros(n_samples) - rng = np.random.RandomState(77) + rng = check_random_state(random_state) intercept = 0.0 intercept_sum_gradient = 0.0 wscale = 1.0 @@ -368,7 +368,7 @@ def test_sag_regressor_computed_correctly(): alpha = .1 n_features = 10 n_samples = 40 - max_iter = 50 + max_iter = 100 tol = .000001 fit_intercept = True rng = np.random.RandomState(0) @@ -378,7 +378,8 @@ def test_sag_regressor_computed_correctly(): step_size = get_step_size(X, alpha, fit_intercept, classification=False) clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag', - alpha=alpha * n_samples, max_iter=max_iter) + alpha=alpha * n_samples, max_iter=max_iter, + random_state=rng) clf2 = clone(clf1) clf1.fit(X, y) @@ -387,12 +388,14 @@ def test_sag_regressor_computed_correctly(): spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha, n_iter=max_iter, dloss=squared_dloss, - fit_intercept=fit_intercept) + fit_intercept=fit_intercept, + random_state=rng) spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha, n_iter=max_iter, dloss=squared_dloss, sparse=True, - fit_intercept=fit_intercept) + fit_intercept=fit_intercept, + random_state=rng) assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(),