From c56c2af28efbf31de3bb5ffe694c58f09ee86e29 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 17:43:44 -0400 Subject: [PATCH 01/11] simple deprecations and removals --- doc/modules/classes.rst | 21 +- sklearn/cluster/hierarchical.py | 12 +- sklearn/covariance/graph_lasso_.py | 6 - sklearn/covariance/tests/test_graph_lasso.py | 24 +- .../covariance/tests/test_graphical_lasso.py | 24 +- sklearn/datasets/mlcomp.py | 114 --- sklearn/decomposition/fastica_.py | 11 +- sklearn/decomposition/online_lda.py | 38 +- sklearn/decomposition/sparse_pca.py | 21 +- sklearn/discriminant_analysis.py | 15 +- sklearn/feature_extraction/hashing.py | 22 +- sklearn/feature_extraction/text.py | 13 +- sklearn/gaussian_process/gpr.py | 12 - sklearn/kernel_approximation.py | 7 +- sklearn/linear_model/least_angle.py | 7 - sklearn/linear_model/randomized_l1.py | 670 ------------------ .../linear_model/tests/test_randomized_l1.py | 219 ------ sklearn/manifold/t_sne.py | 5 - sklearn/metrics/pairwise.py | 10 +- sklearn/neighbors/approximate.py | 589 --------------- sklearn/neighbors/tests/test_approximate.py | 498 ------------- sklearn/preprocessing/__init__.py | 2 - .../preprocessing/_function_transformer.py | 37 +- sklearn/preprocessing/data.py | 58 +- sklearn/semi_supervised/label_propagation.py | 19 +- sklearn/tests/test_discriminant_analysis.py | 14 - sklearn/utils/arpack.py | 23 - sklearn/utils/extmath.py | 42 -- sklearn/utils/graph.py | 14 - sklearn/utils/random.py | 100 --- sklearn/utils/sparsetools/__init__.py | 13 - sklearn/utils/sparsetools/setup.py | 15 - sklearn/utils/sparsetools/tests/__init__.py | 0 sklearn/utils/stats.py | 7 - sklearn/utils/tests/test_extmath.py | 28 - sklearn/utils/tests/test_stats.py | 9 - sklearn/utils/tests/test_utils.py | 50 -- 37 files changed, 43 insertions(+), 2726 deletions(-) delete mode 100644 sklearn/datasets/mlcomp.py delete mode 100644 sklearn/linear_model/randomized_l1.py delete mode 100644 sklearn/linear_model/tests/test_randomized_l1.py delete mode 100644 sklearn/neighbors/approximate.py delete mode 100644 sklearn/neighbors/tests/test_approximate.py delete mode 100644 sklearn/utils/arpack.py delete mode 100644 sklearn/utils/sparsetools/__init__.py delete mode 100644 sklearn/utils/sparsetools/setup.py delete mode 100644 sklearn/utils/sparsetools/tests/__init__.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 57ccfb5cff704..04d60cecea3d7 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1512,23 +1512,4 @@ To be removed in 0.22 :template: deprecated_function.rst covariance.graph_lasso - datasets.fetch_mldata - - -To be removed in 0.21 ---------------------- - -.. autosummary:: - :toctree: generated/ - :template: deprecated_class.rst - - linear_model.RandomizedLasso - linear_model.RandomizedLogisticRegression - neighbors.LSHForest - -.. 
autosummary:: - :toctree: generated/ - :template: deprecated_function.rst - - datasets.load_mlcomp - linear_model.lasso_stability_path + datasets.fetch_mldata \ No newline at end of file diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 1d6755fd72060..a23542ff5a97f 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -339,9 +339,8 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): # single average and complete linkage -def linkage_tree(X, connectivity=None, n_components='deprecated', - n_clusters=None, linkage='complete', affinity="euclidean", - return_distance=False): +def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', + affinity="euclidean", return_distance=False): """Linkage agglomerative clustering based on a Feature matrix. The inertia matrix uses a Heapq-based representation. @@ -362,9 +361,6 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', be symmetric and only the upper triangular half is used. Default is None, i.e, the Ward algorithm is unstructured. - n_components : int (optional) - The number of connected components in the graph. - n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is @@ -420,10 +416,6 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', -------- ward_tree : hierarchical clustering with ward linkage """ - if n_components != 'deprecated': - warnings.warn("n_components was deprecated in 0.19" - "will be removed in 0.21", DeprecationWarning) - X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index b10e3c7f3f828..3280aacbf6c8a 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -584,12 +584,6 @@ def __init__(self, alphas=4, n_refinements=4, cv='warn', tol=1e-4, self.cv = cv self.n_jobs = n_jobs - @property - @deprecated("Attribute grid_scores was deprecated in version 0.19 and " - "will be removed in 0.21. Use ``grid_scores_`` instead") - def grid_scores(self): - return self.grid_scores_ - def fit(self, X, y=None): """Fits the GraphicalLasso covariance model to X. diff --git a/sklearn/covariance/tests/test_graph_lasso.py b/sklearn/covariance/tests/test_graph_lasso.py index 8c07536363614..33c724df781d4 100644 --- a/sklearn/covariance/tests/test_graph_lasso.py +++ b/sklearn/covariance/tests/test_graph_lasso.py @@ -140,26 +140,4 @@ def test_graph_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) - - -@ignore_warnings(category=DeprecationWarning) -@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 -def test_deprecated_grid_scores(random_state=1): - dim = 5 - n_samples = 6 - random_state = check_random_state(random_state) - prec = make_sparse_spd_matrix(dim, alpha=.96, - random_state=random_state) - cov = linalg.inv(prec) - X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) - graph_lasso = GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1) - graph_lasso.fit(X) - - depr_message = ("Attribute grid_scores was deprecated in version " - "0.19 and will be removed in 0.21. 
Use " - "``grid_scores_`` instead") - - assert_warns_message(DeprecationWarning, depr_message, - lambda: graph_lasso.grid_scores) - assert_equal(graph_lasso.grid_scores, graph_lasso.grid_scores_) + GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index f1d6aab6a9b26..25e2f191d3ec8 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -136,26 +136,4 @@ def test_graphical_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) - - -@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 -@pytest.mark.skipif(not PY3_OR_LATER, - reason='On Python 2 DeprecationWarning is not issued for some unkown reason.') -def test_deprecated_grid_scores(random_state=1): - dim = 5 - n_samples = 6 - random_state = check_random_state(random_state) - prec = make_sparse_spd_matrix(dim, alpha=.96, - random_state=random_state) - cov = linalg.inv(prec) - X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) - graphical_lasso = GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1) - graphical_lasso.fit(X) - - depr_message = ("Attribute grid_scores was deprecated in version " - "0.19 and will be removed in 0.21. Use " - "``grid_scores_`` instead") - - with pytest.warns(DeprecationWarning, match=depr_message): - assert_equal(graphical_lasso.grid_scores, graphical_lasso.grid_scores_) + GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file diff --git a/sklearn/datasets/mlcomp.py b/sklearn/datasets/mlcomp.py deleted file mode 100644 index 9adb7bbc1c06e..0000000000000 --- a/sklearn/datasets/mlcomp.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2010 Olivier Grisel -# License: BSD 3 clause -"""Glue code to load http://mlcomp.org data as a scikit.learn dataset""" - -import os -import numbers -from sklearn.datasets.base import load_files -from sklearn.utils import deprecated - - -def _load_document_classification(dataset_path, metadata, set_=None, **kwargs): - if set_ is not None: - dataset_path = os.path.join(dataset_path, set_) - return load_files(dataset_path, metadata.get('description'), **kwargs) - - -LOADERS = { - 'DocumentClassification': _load_document_classification, - # TODO: implement the remaining domain formats -} - - -@deprecated("since the http://mlcomp.org/ website will shut down " - "in March 2017, the load_mlcomp function was deprecated " - "in version 0.19 and will be removed in 0.21.") -def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs): - r"""Load a datasets as downloaded from http://mlcomp.org - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - - name_or_id : int or str - The integer id or the string name metadata of the MLComp - dataset to load - - set\_ : str, default='raw' - Select the portion to load: 'train', 'test' or 'raw' - - mlcomp_root : str, optional - The filesystem path to the root folder where MLComp datasets - are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME - environment variable is looked up instead. - - **kwargs : domain specific kwargs to be passed to the dataset loader. 
- - Returns - ------- - - data : Bunch - Dictionary-like object, the interesting attributes are: - 'filenames', the files holding the raw to learn, 'target', the - classification labels (integer index), 'target_names', - the meaning of the labels, and 'DESCR', the full description of the - dataset. - - Note on the lookup process: depending on the type of name_or_id, - will choose between integer id lookup or metadata name lookup by - looking at the unzipped archives and metadata file. - - TODO: implement zip dataset loading too - """ - - if mlcomp_root is None: - try: - mlcomp_root = os.environ['MLCOMP_DATASETS_HOME'] - except KeyError: - raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined") - - mlcomp_root = os.path.expanduser(mlcomp_root) - mlcomp_root = os.path.abspath(mlcomp_root) - mlcomp_root = os.path.normpath(mlcomp_root) - - if not os.path.exists(mlcomp_root): - raise ValueError("Could not find folder: " + mlcomp_root) - - # dataset lookup - if isinstance(name_or_id, numbers.Integral): - # id lookup - dataset_path = os.path.join(mlcomp_root, str(name_or_id)) - else: - # assume name based lookup - dataset_path = None - expected_name_line = "name: " + name_or_id - for dataset in os.listdir(mlcomp_root): - metadata_file = os.path.join(mlcomp_root, dataset, 'metadata') - if not os.path.exists(metadata_file): - continue - with open(metadata_file) as f: - for line in f: - if line.strip() == expected_name_line: - dataset_path = os.path.join(mlcomp_root, dataset) - break - if dataset_path is None: - raise ValueError("Could not find dataset with metadata line: " + - expected_name_line) - - # loading the dataset metadata - metadata = dict() - metadata_file = os.path.join(dataset_path, 'metadata') - if not os.path.exists(metadata_file): - raise ValueError(dataset_path + ' is not a valid MLComp dataset') - with open(metadata_file) as f: - for line in f: - if ":" in line: - key, value = line.split(":", 1) - metadata[key.strip()] = value.strip() - - format = metadata.get('format', 'unknow') - loader = LOADERS.get(format) - if loader is None: - raise ValueError("No loader implemented for format: " + format) - return loader(dataset_path, metadata, set_=set_, **kwargs) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index f64d4787b3f71..2eead18b2678d 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -553,7 +553,7 @@ def fit(self, X, y=None): self._fit(X, compute_sources=False) return self - def transform(self, X, y='deprecated', copy=True): + def transform(self, X, copy=True): """Recover the sources from X (apply the unmixing matrix). Parameters @@ -561,9 +561,7 @@ def transform(self, X, y='deprecated', copy=True): X : array-like, shape (n_samples, n_features) Data to transform, where n_samples is the number of samples and n_features is the number of features. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : bool (optional) If False, data passed to fit are overwritten. Defaults to True. 
@@ -571,11 +569,6 @@ def transform(self, X, y='deprecated', copy=True): ------- X_new : array-like, shape (n_samples, n_components) """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'mixing_') X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 5b48ea1a26b30..51c199e50c2d7 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -230,11 +230,6 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): If None, the random number generator is the RandomState instance used by `np.random`. - n_topics : int, optional (default=None) - This parameter has been renamed to n_components and will - be removed in version 0.21. - .. deprecated:: 0.19 - Attributes ---------- components_ : array, [n_components, n_features] @@ -286,7 +281,7 @@ def __init__(self, n_components=10, doc_topic_prior=None, learning_decay=.7, learning_offset=10., max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1e6, perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100, - n_jobs=None, verbose=0, random_state=None, n_topics=None): + n_jobs=None, verbose=0, random_state=None): self.n_components = n_components self.doc_topic_prior = doc_topic_prior self.topic_word_prior = topic_word_prior @@ -303,21 +298,12 @@ def __init__(self, n_components=10, doc_topic_prior=None, self.n_jobs = n_jobs self.verbose = verbose self.random_state = random_state - self.n_topics = n_topics def _check_params(self): """Check model parameters.""" - if self.n_topics is not None: - self._n_components = self.n_topics - warnings.warn("n_topics has been renamed to n_components in " - "version 0.19 and will be removed in 0.21", - DeprecationWarning) - else: - self._n_components = self.n_components - - if self._n_components <= 0: + if self.n_components <= 0: raise ValueError("Invalid 'n_components' parameter: %r" - % self._n_components) + % self.n_components) if self.total_samples <= 0: raise ValueError("Invalid 'total_samples' parameter: %r" @@ -339,12 +325,12 @@ def _init_latent_vars(self, n_features): self.n_iter_ = 0 if self.doc_topic_prior is None: - self.doc_topic_prior_ = 1. / self._n_components + self.doc_topic_prior_ = 1. / self.n_components else: self.doc_topic_prior_ = self.doc_topic_prior if self.topic_word_prior is None: - self.topic_word_prior_ = 1. / self._n_components + self.topic_word_prior_ = 1. / self.n_components else: self.topic_word_prior_ = self.topic_word_prior @@ -352,7 +338,7 @@ def _init_latent_vars(self, n_features): init_var = 1. 
/ init_gamma # In the literature, this is called `lambda` self.components_ = self.random_state_.gamma( - init_gamma, init_var, (self._n_components, n_features)) + init_gamma, init_var, (self.n_components, n_features)) # In the literature, this is `exp(E[log(beta)])` self.exp_dirichlet_component_ = np.exp( @@ -711,7 +697,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size): # compute E[log p(theta | alpha) - log q(theta | gamma)] score += _loglikelihood(doc_topic_prior, doc_topic_distr, - dirichlet_doc_topic, self._n_components) + dirichlet_doc_topic, self.n_components) # Compensate for the subsampling of the population of documents if sub_sampling: @@ -781,7 +767,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, raise ValueError("Number of samples in X and doc_topic_distr" " do not match.") - if n_components != self._n_components: + if n_components != self.n_components: raise ValueError("Number of topics does not match.") current_samples = X.shape[0] @@ -795,7 +781,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, return np.exp(-1.0 * perword_bound) - def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False): + def perplexity(self, X, sub_sampling=False): """Calculate approximate perplexity for data X. Perplexity is defined as exp(-1. * log-likelihood per word) @@ -823,10 +809,4 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False): score : float Perplexity score. """ - if doc_topic_distr != 'deprecated': - warnings.warn("Argument 'doc_topic_distr' is deprecated and is " - "being ignored as of 0.19. Support for this " - "argument will be removed in 0.21.", - DeprecationWarning) - return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 95c9ab8960e64..5bc2107f7f31c 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -197,7 +197,7 @@ def fit(self, X, y=None): self.error_ = E return self - def transform(self, X, ridge_alpha='deprecated'): + def transform(self, X): """Least Squares projection of the data onto the sparse components. To avoid instability issues in case the system is under-determined, @@ -213,14 +213,6 @@ def transform(self, X, ridge_alpha='deprecated'): Test data to be transformed, must have the same number of features as the data used to train the model. - ridge_alpha : float, default: 0.01 - Amount of ridge shrinkage to apply in order to improve - conditioning. - - .. deprecated:: 0.19 - This parameter will be removed in 0.21. - Specify ``ridge_alpha`` in the ``SparsePCA`` constructor. - Returns ------- X_new array, shape (n_samples, n_components) @@ -229,20 +221,11 @@ def transform(self, X, ridge_alpha='deprecated'): check_is_fitted(self, 'components_') X = check_array(X) - if ridge_alpha != 'deprecated': - warnings.warn("The ridge_alpha parameter on transform() is " - "deprecated since 0.19 and will be removed in 0.21. 
" - "Specify ridge_alpha in the SparsePCA constructor.", - DeprecationWarning) - if ridge_alpha is None: - ridge_alpha = self.ridge_alpha - else: - ridge_alpha = self.ridge_alpha if self.normalize_components: X = X - self.mean_ - U = ridge_regression(self.components_.T, X.T, ridge_alpha, + U = ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky') if not self.normalize_components: diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index a635792c6f6ca..bf6b3a4f44631 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -614,20 +614,12 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): """ def __init__(self, priors=None, reg_param=0., store_covariance=False, - tol=1.0e-4, store_covariances=None): + tol=1.0e-4): self.priors = np.asarray(priors) if priors is not None else None self.reg_param = reg_param - self.store_covariances = store_covariances self.store_covariance = store_covariance self.tol = tol - @property - @deprecated("Attribute ``covariances_`` was deprecated in version" - " 0.19 and will be removed in 0.21. Use " - "``covariance_`` instead") - def covariances_(self): - return self.covariance_ - def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -662,10 +654,7 @@ def fit(self, X, y): cov = None store_covariance = self.store_covariance or self.store_covariances - if self.store_covariances: - warnings.warn("'store_covariances' was renamed to store_covariance" - " in version 0.19 and will be removed in 0.21.", - DeprecationWarning) + if store_covariance: cov = [] means = [] diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index 744a073090bad..f670e9cbec89e 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -57,8 +57,7 @@ class FeatureHasher(BaseEstimator, TransformerMixin): feature_name should be a string, while value should be a number. In the case of "string", a value of 1 is implied. The feature_name is hashed to find the appropriate column for the - feature. The value's sign might be flipped in the output (but see - non_negative, below). + feature. The value's sign might be flipped in the output. dtype : numpy type, optional, default np.float64 The type of feature values. Passed to scipy.sparse matrix constructors as the dtype argument. Do not set this to bool, np.boolean or any @@ -68,15 +67,6 @@ class FeatureHasher(BaseEstimator, TransformerMixin): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. - non_negative : boolean, optional, default False - When True, an absolute value is applied to the features matrix prior to - returning it. When used in conjunction with alternate_sign=True, this - significantly reduces the inner product preservation property. - - .. deprecated:: 0.19 - This option will be removed in 0.21. 
- - Examples -------- >>> from sklearn.feature_extraction import FeatureHasher @@ -94,18 +84,12 @@ class FeatureHasher(BaseEstimator, TransformerMixin): """ def __init__(self, n_features=(2 ** 20), input_type="dict", - dtype=np.float64, alternate_sign=True, non_negative=False): + dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) - if non_negative: - warnings.warn("the option non_negative=True has been deprecated" - " in 0.19 and will be removed" - " in version 0.21.", DeprecationWarning) - self.dtype = dtype self.input_type = input_type self.n_features = n_features self.alternate_sign = alternate_sign - self.non_negative = non_negative @staticmethod def _validate_params(n_features, input_type): @@ -175,6 +159,4 @@ def transform(self, raw_X): shape=(n_samples, self.n_features)) X.sum_duplicates() # also sorts the indices - if self.non_negative: - np.abs(X.data, X.data) return X diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 05f60d2805c7c..a4de38d959db1 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -491,13 +491,6 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): .. versionadded:: 0.19 - non_negative : boolean, optional, default False - When True, an absolute value is applied to the features matrix prior to - returning it. When used in conjunction with alternate_sign=True, this - significantly reduces the inner product preservation property. - - .. deprecated:: 0.19 - This option will be removed in 0.21. dtype : type, optional Type of the matrix returned by fit_transform() or transform(). @@ -526,7 +519,7 @@ def __init__(self, input='content', encoding='utf-8', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), binary=False, norm='l2', alternate_sign=True, - non_negative=False, dtype=np.float64): + dtype=np.float64): self.input = input self.encoding = encoding self.decode_error = decode_error @@ -542,7 +535,6 @@ def __init__(self, input='content', encoding='utf-8', self.binary = binary self.norm = norm self.alternate_sign = alternate_sign - self.non_negative = non_negative self.dtype = dtype def partial_fit(self, X, y=None): @@ -630,8 +622,7 @@ def fit_transform(self, X, y=None): def _get_hasher(self): return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, - alternate_sign=self.alternate_sign, - non_negative=self.non_negative) + alternate_sign=self.alternate_sign) def _document_frequency(X): diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index ac2c0a46b6866..ebe8dd3b65ade 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -158,18 +158,6 @@ def __init__(self, kernel=None, alpha=1e-10, self.copy_X_train = copy_X_train self.random_state = random_state - @property - @deprecated("Attribute rng was deprecated in version 0.19 and " - "will be removed in 0.21.") - def rng(self): - return self._rng - - @property - @deprecated("Attribute y_train_mean was deprecated in version 0.19 and " - "will be removed in 0.21.") - def y_train_mean(self): - return self._y_train_mean - def fit(self, X, y): """Fit Gaussian process regression model. 
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 79d915fa1e2df..585f453e389b2 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -619,10 +619,7 @@ def _get_kernel_params(self): if (self.gamma is not None or self.coef0 is not None or self.degree is not None): - warnings.warn( - "Passing gamma, coef0 or degree to Nystroem when using a" - " callable kernel is deprecated in version 0.19 and will" - " raise an error in 0.21, as they are ignored. Use " - "kernel_params instead.", DeprecationWarning) + raise ValueError("Don't pass gamma, coef0 or degree to " + "Nystroem if using a callable kernel.") return params
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index ce13b99b6aae5..0e923a424c221 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -1185,13 +1185,6 @@ def fit(self, X, y): Xy=None, fit_path=True) return self - @property - @deprecated("Attribute alpha is deprecated in 0.19 and " - "will be removed in 0.21. See ``alpha_`` instead") - def alpha(self): - # impedance matching for the above Lars.fit (should not be documented) - return self.alpha_ - class LassoLarsCV(LarsCV): """Cross-validated Lasso, using the LARS algorithm
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py deleted file mode 100644 index 40ebe3c57826b..0000000000000 --- a/sklearn/linear_model/randomized_l1.py +++ /dev/null @@ -1,670 +0,0 @@ -""" -Randomized Lasso/Logistic: feature selection based on Lasso and -sparse Logistic Regression -""" - -# Author: Gael Varoquaux, Alexandre Gramfort -# -# License: BSD 3 clause - -import warnings -import itertools -from abc import ABCMeta, abstractmethod - -import numpy as np -from scipy.sparse import issparse -from scipy import sparse -from scipy.interpolate import interp1d - -from .base import _preprocess_data -from ..base import BaseEstimator -from ..externals import six -from ..utils import Memory, Parallel, delayed -from ..feature_selection.base import SelectorMixin -from ..utils import (as_float_array, check_random_state, check_X_y, safe_mask, - deprecated) -from ..utils.validation import check_is_fitted -from .least_angle import lars_path, LassoLarsIC -from .logistic import LogisticRegression -from ..exceptions import ConvergenceWarning - - -############################################################################### -# Randomized linear model: feature selection - -def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200, - n_jobs=None, verbose=False, pre_dispatch='3*n_jobs', - random_state=None, sample_fraction=.75, **params): - random_state = check_random_state(random_state) - # We are generating 1 - weights, and not weights - n_samples, n_features = X.shape - - if not (0 < scaling < 1): - raise ValueError( - "'scaling' should be between 0 and 1. Got %r instead." % scaling) - - scaling = 1. 
- scaling - scores_ = 0.0 - for active_set in Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch)( - delayed(estimator_func)( - X, y, weights=scaling * random_state.randint( - 0, 2, size=(n_features,)), - mask=(random_state.rand(n_samples) < sample_fraction), - verbose=max(0, verbose - 1), - **params) - for _ in range(n_resampling)): - scores_ += active_set - - scores_ /= n_resampling - return scores_ - - -@deprecated("The class BaseRandomizedLinearModel is deprecated in 0.19" - " and will be removed in 0.21.") -class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator, - SelectorMixin)): - """Base class to implement randomized linear models for feature selection - - This implements the strategy by Meinshausen and Buhlman: - stability selection with randomized sampling, and random re-weighting of - the penalty. - """ - - @abstractmethod - def __init__(self): - pass - - _preprocess_data = staticmethod(_preprocess_data) - - def fit(self, X, y): - """Fit the model using X, y as training data. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training data. - - y : array-like, shape = [n_samples] - Target values. Will be cast to X's dtype if necessary - - Returns - ------- - self : object - Returns an instance of self. - """ - X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True, - ensure_min_samples=2, estimator=self) - X = as_float_array(X, copy=False) - n_samples, n_features = X.shape - - X, y, X_offset, y_offset, X_scale = \ - self._preprocess_data(X, y, self.fit_intercept, self.normalize) - - estimator_func, params = self._make_estimator_and_params(X, y) - memory = self.memory - if memory is None: - memory = Memory(cachedir=None, verbose=0) - elif isinstance(memory, six.string_types): - memory = Memory(cachedir=memory, verbose=0) - elif not isinstance(memory, Memory): - raise ValueError("'memory' should either be a string or" - " a sklearn.utils.Memory" - " instance, got 'memory={!r}' instead.".format( - type(memory))) - - scores_ = memory.cache( - _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch'] - )( - estimator_func, X, y, - scaling=self.scaling, n_resampling=self.n_resampling, - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=self.pre_dispatch, random_state=self.random_state, - sample_fraction=self.sample_fraction, **params) - - if scores_.ndim == 1: - scores_ = scores_[:, np.newaxis] - self.all_scores_ = scores_ - self.scores_ = np.max(self.all_scores_, axis=1) - return self - - def _make_estimator_and_params(self, X, y): - """Return the parameters passed to the estimator""" - raise NotImplementedError - - def _get_support_mask(self): - """Get the boolean mask indicating which features are selected. - - Returns - ------- - support : boolean array of shape [# input features] - An element is True iff its corresponding feature is selected - for retention. 
- """ - check_is_fitted(self, 'scores_') - return self.scores_ > self.selection_threshold - - -############################################################################### -# Randomized lasso: regression settings - -def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False, - precompute=False, eps=np.finfo(np.float).eps, - max_iter=500): - X = X[safe_mask(X, mask)] - y = y[mask] - - # Center X and y to avoid fit the intercept - X -= X.mean(axis=0) - y -= y.mean() - - alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64)) - - X = (1 - weights) * X - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', ConvergenceWarning) - alphas_, _, coef_ = lars_path(X, y, - Gram=precompute, copy_X=False, - copy_Gram=False, alpha_min=np.min(alpha), - method='lasso', verbose=verbose, - max_iter=max_iter, eps=eps) - - if len(alpha) > 1: - if len(alphas_) > 1: # np.min(alpha) < alpha_min - interpolator = interp1d(alphas_[::-1], coef_[:, ::-1], - bounds_error=False, fill_value=0.) - scores = (interpolator(alpha) != 0.0) - else: - scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool) - else: - scores = coef_[:, -1] != 0.0 - return scores - - -@deprecated("The class RandomizedLasso is deprecated in 0.19" - " and will be removed in 0.21.") -class RandomizedLasso(BaseRandomizedLinearModel): - """Randomized Lasso. - - Randomized Lasso works by subsampling the training data and - computing a Lasso estimate where the penalty of a random subset of - coefficients has been scaled. By performing this double - randomization several times, the method assigns high scores to - features that are repeatedly selected across randomizations. This - is known as stability selection. In short, features selected more - often are considered good features. - - Parameters - ---------- - alpha : float, 'aic', or 'bic', optional - The regularization parameter alpha parameter in the Lasso. - Warning: this is not the alpha parameter in the stability selection - article which is scaling. - - scaling : float, optional - The s parameter used to randomly scale the penalty of different - features. - Should be between 0 and 1. - - sample_fraction : float, optional - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - n_resampling : int, optional - Number of randomized models. - - selection_threshold : float, optional - The score above which features should be selected. - - fit_intercept : boolean, optional - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - verbose : boolean or integer, optional - Sets the verbosity amount - - normalize : boolean, optional, default True - If True, the regressors X will be normalized before regression. - This parameter is ignored when `fit_intercept` is set to False. - When the regressors are normalized, note that this makes the - hyperparameters learned more robust and almost independent of - the number of samples. The same property is not valid for - standardized data. However, if you wish to standardize, please - use `preprocessing.StandardScaler` before calling `fit` on an - estimator with `normalize=False`. - - precompute : True | False | 'auto' | array-like - Whether to use a precomputed Gram matrix to speed up calculations. - If set to 'auto' let us decide. 
- The Gram matrix can also be passed as argument, but it will be used - only for the selection of parameter alpha, if alpha is 'aic' or 'bic'. - - max_iter : integer, optional - Maximum number of iterations to perform in the Lars algorithm. - - eps : float, optional - The machine-precision regularization in the computation of the - Cholesky diagonal factors. Increase this for very ill-conditioned - systems. Unlike the 'tol' parameter in some iterative - optimization-based algorithms, this parameter does not control - the tolerance of the optimization. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - memory : None, str or object with the joblib.Memory interface, optional \ - (default=None) - Used for internal caching. By default, no caching is done. - If a string is given, it is the path to the caching directory. - - Attributes - ---------- - scores_ : array, shape = [n_features] - Feature scores between 0 and 1. - - all_scores_ : array, shape = [n_features, n_reg_parameter] - Feature scores between 0 and 1 for all values of the regularization \ - parameter. The reference article suggests ``scores_`` is the max of \ - ``all_scores_``. 
- - Examples - -------- - >>> from sklearn.linear_model import RandomizedLasso - >>> randomized_lasso = RandomizedLasso() # doctest: +SKIP - - References - ---------- - Stability selection - Nicolai Meinshausen, Peter Buhlmann - Journal of the Royal Statistical Society: Series B - Volume 72, Issue 4, pages 417-473, September 2010 - DOI: 10.1111/j.1467-9868.2010.00740.x - - See also - -------- - RandomizedLogisticRegression, Lasso, ElasticNet - """ - def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75, - n_resampling=200, selection_threshold=.25, - fit_intercept=True, verbose=False, - normalize=True, precompute='auto', - max_iter=500, - eps=np.finfo(np.float).eps, random_state=None, - n_jobs=None, pre_dispatch='3*n_jobs', - memory=None): - self.alpha = alpha - self.scaling = scaling - self.sample_fraction = sample_fraction - self.n_resampling = n_resampling - self.fit_intercept = fit_intercept - self.max_iter = max_iter - self.verbose = verbose - self.normalize = normalize - self.precompute = precompute - self.eps = eps - self.random_state = random_state - self.n_jobs = n_jobs - self.selection_threshold = selection_threshold - self.pre_dispatch = pre_dispatch - self.memory = memory - - def _make_estimator_and_params(self, X, y): - alpha = self.alpha - if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'): - model = LassoLarsIC(precompute=self.precompute, - criterion=self.alpha, - max_iter=self.max_iter, - eps=self.eps) - model.fit(X, y) - self.alpha_ = alpha = model.alpha_ - - precompute = self.precompute - # A precomputed Gram array is useless, since _randomized_lasso - # change X a each iteration - if hasattr(precompute, '__array__'): - precompute = 'auto' - assert precompute in (True, False, None, 'auto') - return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter, - eps=self.eps, - precompute=precompute) - - -############################################################################### -# Randomized logistic: classification settings - -def _randomized_logistic(X, y, weights, mask, C=1., verbose=False, - fit_intercept=True, tol=1e-3): - X = X[safe_mask(X, mask)] - y = y[mask] - if issparse(X): - size = len(weights) - weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size)) - X = X * weight_dia - else: - X *= (1 - weights) - - C = np.atleast_1d(np.asarray(C, dtype=np.float64)) - if C.ndim > 1: - raise ValueError("C should be 1-dimensional array-like, " - "but got a {}-dimensional array-like instead: {}." - .format(C.ndim, C)) - - scores = np.zeros((X.shape[1], len(C)), dtype=np.bool) - - for this_C, this_scores in zip(C, scores.T): - # XXX : would be great to do it with a warm_start ... - clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False, - fit_intercept=fit_intercept, - solver='liblinear', multi_class='ovr') - clf.fit(X, y) - this_scores[:] = np.any( - np.abs(clf.coef_) > 10 * np.finfo(np.float).eps, axis=0) - return scores - - -@deprecated("The class RandomizedLogisticRegression is deprecated in 0.19" - " and will be removed in 0.21.") -class RandomizedLogisticRegression(BaseRandomizedLinearModel): - """Randomized Logistic Regression - - Randomized Logistic Regression works by subsampling the training - data and fitting a L1-penalized LogisticRegression model where the - penalty of a random subset of coefficients has been scaled. By - performing this double randomization several times, the method - assigns high scores to features that are repeatedly selected across - randomizations. This is known as stability selection. 
In short, - features selected more often are considered good features. - - Parameters - ---------- - C : float or array-like of shape [n_reg_parameter], optional, default=1 - The regularization parameter C in the LogisticRegression. - When C is an array, fit will take each regularization parameter in C - one by one for LogisticRegression and store results for each one - in ``all_scores_``, where columns and rows represent corresponding - reg_parameters and features. - - scaling : float, optional, default=0.5 - The s parameter used to randomly scale the penalty of different - features. - Should be between 0 and 1. - - sample_fraction : float, optional, default=0.75 - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - n_resampling : int, optional, default=200 - Number of randomized models. - - selection_threshold : float, optional, default=0.25 - The score above which features should be selected. - - tol : float, optional, default=1e-3 - tolerance for stopping criteria of LogisticRegression - - fit_intercept : boolean, optional, default=True - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - verbose : boolean or integer, optional - Sets the verbosity amount - - normalize : boolean, optional, default True - If True, the regressors X will be normalized before regression. - This parameter is ignored when `fit_intercept` is set to False. - When the regressors are normalized, note that this makes the - hyperparameters learnt more robust and almost independent of the number - of samples. The same property is not valid for standardized data. - However, if you wish to standardize, please use - `preprocessing.StandardScaler` before calling `fit` on an estimator - with `normalize=False`. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - memory : None, str or object with the joblib.Memory interface, optional \ - (default=None) - Used for internal caching. By default, no caching is done. - If a string is given, it is the path to the caching directory. - - Attributes - ---------- - scores_ : array, shape = [n_features] - Feature scores between 0 and 1. - - all_scores_ : array, shape = [n_features, n_reg_parameter] - Feature scores between 0 and 1 for all values of the regularization \ - parameter. 
The reference article suggests ``scores_`` is the max \ - of ``all_scores_``. - - Examples - -------- - >>> from sklearn.linear_model import RandomizedLogisticRegression - >>> randomized_logistic = RandomizedLogisticRegression() # doctest: +SKIP - - References - ---------- - Stability selection - Nicolai Meinshausen, Peter Buhlmann - Journal of the Royal Statistical Society: Series B - Volume 72, Issue 4, pages 417-473, September 2010 - DOI: 10.1111/j.1467-9868.2010.00740.x - - See also - -------- - RandomizedLasso, LogisticRegression - """ - def __init__(self, C=1, scaling=.5, sample_fraction=.75, - n_resampling=200, - selection_threshold=.25, tol=1e-3, - fit_intercept=True, verbose=False, - normalize=True, - random_state=None, - n_jobs=None, pre_dispatch='3*n_jobs', - memory=None): - self.C = C - self.scaling = scaling - self.sample_fraction = sample_fraction - self.n_resampling = n_resampling - self.fit_intercept = fit_intercept - self.verbose = verbose - self.normalize = normalize - self.tol = tol - self.random_state = random_state - self.n_jobs = n_jobs - self.selection_threshold = selection_threshold - self.pre_dispatch = pre_dispatch - self.memory = memory - - def _make_estimator_and_params(self, X, y): - params = dict(C=self.C, tol=self.tol, - fit_intercept=self.fit_intercept) - return _randomized_logistic, params - - def _preprocess_data(self, X, y, fit_intercept, normalize=False): - """Center the data in X but not in y""" - X, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept, - normalize=normalize) - return X, y, X_offset, y, X_scale - - -############################################################################### -# Stability paths -def _lasso_stability_path(X, y, mask, weights, eps): - "Inner loop of lasso_stability_path" - X = X * weights[np.newaxis, :] - X = X[safe_mask(X, mask), :] - y = y[mask] - - alpha_max = np.max(np.abs(np.dot(X.T, y))) / X.shape[0] - alpha_min = eps * alpha_max # set for early stopping in path - with warnings.catch_warnings(): - warnings.simplefilter('ignore', ConvergenceWarning) - alphas, _, coefs = lars_path(X, y, method='lasso', verbose=False, - alpha_min=alpha_min) - # Scale alpha by alpha_max - alphas /= alphas[0] - # Sort alphas in ascending order - alphas = alphas[::-1] - coefs = coefs[:, ::-1] - # Get rid of the alphas that are too small - mask = alphas >= eps - # We also want to keep the first one: it should be close to the OLS - # solution - mask[0] = True - alphas = alphas[mask] - coefs = coefs[:, mask] - return alphas, coefs - - -@deprecated("The function lasso_stability_path is deprecated in 0.19" - " and will be removed in 0.21.") -def lasso_stability_path(X, y, scaling=0.5, random_state=None, - n_resampling=200, n_grid=100, - sample_fraction=0.75, - eps=4 * np.finfo(np.float).eps, n_jobs=None, - verbose=False): - """Stability path based on randomized Lasso estimates - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - training data. - - y : array-like, shape = [n_samples] - target values. - - scaling : float, optional, default=0.5 - The alpha parameter in the stability selection article used to - randomly scale the features. Should be between 0 and 1. - - random_state : int, RandomState instance or None, optional, default=None - The generator used to randomize the design. 
If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. - - n_resampling : int, optional, default=200 - Number of randomized models. - - n_grid : int, optional, default=100 - Number of grid points. The path is linearly reinterpolated - on a grid between 0 and 1 before computing the scores. - - sample_fraction : float, optional, default=0.75 - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - eps : float, optional - Smallest value of alpha / alpha_max considered - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - verbose : boolean or integer, optional - Sets the verbosity amount - - Returns - ------- - alphas_grid : array, shape ~ [n_grid] - The grid points between 0 and 1: alpha/alpha_max - - scores_path : array, shape = [n_features, n_grid] - The scores for each feature along the path. - """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo']) - rng = check_random_state(random_state) - - if not (0 < scaling < 1): - raise ValueError("Parameter 'scaling' should be between 0 and 1." - " Got %r instead." % scaling) - - n_samples, n_features = X.shape - - paths = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_lasso_stability_path)( - X, y, mask=rng.rand(n_samples) < sample_fraction, - weights=1. - scaling * rng.randint(0, 2, size=(n_features,)), - eps=eps) - for k in range(n_resampling)) - - all_alphas = sorted(list(set(itertools.chain(*[p[0] for p in paths])))) - # Take approximately n_grid values - stride = int(max(1, int(len(all_alphas) / float(n_grid)))) - all_alphas = all_alphas[::stride] - if not all_alphas[-1] == 1: - all_alphas.append(1.) 
- all_alphas = np.array(all_alphas) - scores_path = np.zeros((n_features, len(all_alphas))) - - for alphas, coefs in paths: - if alphas[0] != 0: - alphas = np.r_[0, alphas] - coefs = np.c_[np.ones((n_features, 1)), coefs] - if alphas[-1] != all_alphas[-1]: - alphas = np.r_[alphas, all_alphas[-1]] - coefs = np.c_[coefs, np.zeros((n_features, 1))] - scores_path += (interp1d(alphas, coefs, - kind='nearest', bounds_error=False, - fill_value=0, axis=-1)(all_alphas) != 0) - - scores_path /= n_resampling - return all_alphas, scores_path diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py deleted file mode 100644 index 564fbd4e7827d..0000000000000 --- a/sklearn/linear_model/tests/test_randomized_l1.py +++ /dev/null @@ -1,219 +0,0 @@ -# Authors: Alexandre Gramfort -# License: BSD 3 clause - -from tempfile import mkdtemp -import shutil - -import numpy as np -from scipy import sparse - -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns_message - -from sklearn.linear_model.randomized_l1 import(lasso_stability_path, - RandomizedLasso, - RandomizedLogisticRegression) - -from sklearn.datasets import load_diabetes, load_iris -from sklearn.feature_selection import f_regression, f_classif -from sklearn.preprocessing import StandardScaler -from sklearn.linear_model.base import _preprocess_data - -diabetes = load_diabetes() -X = diabetes.data -y = diabetes.target -X = StandardScaler().fit_transform(X) -X = X[:, [2, 3, 6, 7, 8]] - -# test that the feature score of the best features -F, _ = f_regression(X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_lasso_stability_path(): - # Check lasso stability path - # Load diabetes data and add noisy features - scaling = 0.3 - coef_grid, scores_path = lasso_stability_path(X, y, scaling=scaling, - random_state=42, - n_resampling=30) - - assert_array_equal(np.argsort(F)[-3:], - np.argsort(np.sum(scores_path, axis=1))[-3:]) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso_error_memory(): - scaling = 0.3 - selection_threshold = 0.5 - tempdir = 5 - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - assert_raises_regex(ValueError, "'memory' should either be a string or" - " a sklearn.utils.Memory instance", - clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso(): - # Check randomized lasso - scaling = 0.3 - selection_threshold = 0.5 - n_resampling = 20 - - # or with 1 alpha - clf = RandomizedLasso(verbose=False, alpha=1, random_state=42, - scaling=scaling, n_resampling=n_resampling, - selection_threshold=selection_threshold) - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - - # or with many alphas - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, n_resampling=n_resampling, - selection_threshold=selection_threshold) - feature_scores = clf.fit(X, y).scores_ - assert_equal(clf.all_scores_.shape, (X.shape[1], 2)) - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - # test caching - try: - tempdir = 
mkdtemp() - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - feature_scores = clf.fit(X, y).scores_ - assert_equal(clf.all_scores_.shape, (X.shape[1], 2)) - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - finally: - shutil.rmtree(tempdir) - - X_r = clf.transform(X) - X_full = clf.inverse_transform(X_r) - assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold)) - assert_equal(X_full.shape, X.shape) - - clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42, - scaling=scaling, n_resampling=100) - feature_scores = clf.fit(X, y).scores_ - assert_allclose(feature_scores, [1., 1., 1., 0.225, 1.], rtol=0.2) - - clf = RandomizedLasso(verbose=False, scaling=-0.1) - assert_raises(ValueError, clf.fit, X, y) - - clf = RandomizedLasso(verbose=False, scaling=1.1) - assert_raises(ValueError, clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso_precompute(): - # Check randomized lasso for different values of precompute - n_resampling = 20 - alpha = 1 - random_state = 42 - - G = np.dot(X.T, X) - - clf = RandomizedLasso(alpha=alpha, random_state=random_state, - precompute=G, n_resampling=n_resampling) - feature_scores_1 = clf.fit(X, y).scores_ - - for precompute in [True, False, None, 'auto']: - clf = RandomizedLasso(alpha=alpha, random_state=random_state, - precompute=precompute, n_resampling=n_resampling) - feature_scores_2 = clf.fit(X, y).scores_ - assert_array_equal(feature_scores_1, feature_scores_2) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_logistic(): - # Check randomized sparse logistic regression - iris = load_iris() - X = iris.data[:, [0, 2]] - y = iris.target - X = X[y != 2] - y = y[y != 2] - - F, _ = f_classif(X, y) - - scaling = 0.3 - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - X_orig = X.copy() - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(X, X_orig) # fit does not modify X - assert_array_equal(np.argsort(F), np.argsort(feature_scores)) - - clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5], - random_state=42, scaling=scaling, - n_resampling=50, tol=1e-3) - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(np.argsort(F), np.argsort(feature_scores)) - - clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]]) - assert_raises(ValueError, clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_logistic_sparse(): - # Check randomized sparse logistic regression on sparse data - iris = load_iris() - X = iris.data[:, [0, 2]] - y = iris.target - X = X[y != 2] - y = y[y != 2] - - # center here because sparse matrices are usually not centered - # labels should not be centered - X, _, _, _, _ = _preprocess_data(X, y, True, True) - - X_sp = sparse.csr_matrix(X) - - F, _ = f_classif(X, y) - - scaling = 0.3 - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - feature_scores = clf.fit(X, y).scores_ - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - feature_scores_sp = clf.fit(X_sp, y).scores_ - assert_array_equal(feature_scores, feature_scores_sp) - - -def test_warning_raised(): - - scaling = 0.3 - selection_threshold = 0.5 - tempdir = 5 - assert_warns_message(DeprecationWarning, "The 
function" - " lasso_stability_path is " - "deprecated in 0.19 and will be removed in 0.21.", - lasso_stability_path, X, y, scaling=scaling, - random_state=42, n_resampling=30) - - assert_warns_message(DeprecationWarning, "Class RandomizedLasso is" - " deprecated; The class RandomizedLasso is " - "deprecated in 0.19 and will be removed in 0.21.", - RandomizedLasso, verbose=False, alpha=[1, 0.8], - random_state=42, scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - - assert_warns_message(DeprecationWarning, "The class" - " RandomizedLogisticRegression is " - "deprecated in 0.19 and will be removed in 0.21.", - RandomizedLogisticRegression, - verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 1c69036d0d27a..5ddda56491564 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -805,11 +805,6 @@ def _fit(self, X, skip_num_points=0): neighbors=neighbors_nn, skip_num_points=skip_num_points) - @property - @deprecated("Attribute n_iter_final was deprecated in version 0.19 and " - "will be removed in 0.21. Use ``n_iter_`` instead") - def n_iter_final(self): - return self.n_iter_ def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors=None, skip_num_points=0): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 526d4d9f3d512..5aba68f861253 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -443,8 +443,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", batch_size=batch_size)[0] -def manhattan_distances(X, Y=None, sum_over_features=True, - size_threshold=None): +def manhattan_distances(X, Y=None, sum_over_features=True): """ Compute the L1 distances between the vectors in X and Y. With sum_over_features equal to False it returns the componentwise @@ -465,9 +464,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True, else it returns the componentwise L1 pairwise-distances. Not supported for sparse matrix inputs. - size_threshold : int, default=5e8 - Unused parameter. - Returns ------- D : array @@ -497,10 +493,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True, array([[1., 1.], [1., 1.]]) """ - if size_threshold is not None: - warnings.warn('Use of the "size_threshold" is deprecated ' - 'in 0.19 and it will be removed version ' - '0.21 of scikit-learn', DeprecationWarning) X, Y = check_pairwise_arrays(X, Y) if issparse(X) or issparse(Y): diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py deleted file mode 100644 index 650af47e0d81b..0000000000000 --- a/sklearn/neighbors/approximate.py +++ /dev/null @@ -1,589 +0,0 @@ -"""Approximate nearest neighbor search""" -# Author: Maheshakya Wijewardena -# Joel Nothman - -import numpy as np -import warnings - -from scipy import sparse - -from .base import KNeighborsMixin, RadiusNeighborsMixin -from ..base import BaseEstimator -from ..utils.validation import check_array -from ..utils import check_random_state -from ..metrics.pairwise import pairwise_distances - -from ..random_projection import GaussianRandomProjection - -__all__ = ["LSHForest"] - -HASH_DTYPE = '>u4' -MAX_HASH_SIZE = np.dtype(HASH_DTYPE).itemsize * 8 - - -def _find_matching_indices(tree, bin_X, left_mask, right_mask): - """Finds indices in sorted array of integers. - - Most significant h bits in the binary representations of the - integers are matched with the items' most significant h bits. 
- """ - left_index = np.searchsorted(tree, bin_X & left_mask) - right_index = np.searchsorted(tree, bin_X | right_mask, - side='right') - return left_index, right_index - - -def _find_longest_prefix_match(tree, bin_X, hash_size, - left_masks, right_masks): - """Find the longest prefix match in tree for each query in bin_X - - Most significant bits are considered as the prefix. - """ - hi = np.empty_like(bin_X, dtype=np.intp) - hi.fill(hash_size) - lo = np.zeros_like(bin_X, dtype=np.intp) - res = np.empty_like(bin_X, dtype=np.intp) - - left_idx, right_idx = _find_matching_indices(tree, bin_X, - left_masks[hi], - right_masks[hi]) - found = right_idx > left_idx - res[found] = lo[found] = hash_size - - r = np.arange(bin_X.shape[0]) - kept = r[lo < hi] # indices remaining in bin_X mask - while kept.shape[0]: - mid = (lo.take(kept) + hi.take(kept)) // 2 - - left_idx, right_idx = _find_matching_indices(tree, - bin_X.take(kept), - left_masks[mid], - right_masks[mid]) - found = right_idx > left_idx - mid_found = mid[found] - lo[kept[found]] = mid_found + 1 - res[kept[found]] = mid_found - hi[kept[~found]] = mid[~found] - - kept = r[lo < hi] - - return res - - -class ProjectionToHashMixin(object): - """Turn a transformed real-valued array into a hash""" - @staticmethod - def _to_hash(projected): - if projected.shape[1] % 8 != 0: - raise ValueError('Require reduced dimensionality to be a multiple ' - 'of 8 for hashing') - # XXX: perhaps non-copying operation better - out = np.packbits((projected > 0).astype(int)).view(dtype=HASH_DTYPE) - return out.reshape(projected.shape[0], -1) - - def fit_transform(self, X, y=None): - """ - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of predictors. - """ - - self.fit(X) - return self.transform(X) - - def transform(self, X): - """ - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of predictors. - """ - return self._to_hash(super(ProjectionToHashMixin, self).transform(X)) - - -class GaussianRandomProjectionHash(ProjectionToHashMixin, - GaussianRandomProjection): - """Use GaussianRandomProjection to produce a cosine LSH fingerprint - - Parameters - ---------- - - n_components : int or 'auto', optional (default = 32) - Dimensionality of the target projection space. - - n_components can be automatically adjusted according to the - number of samples in the dataset and the bound given by the - Johnson-Lindenstrauss lemma. In that case the quality of the - embedding is controlled by the ``eps`` parameter. - - It should be noted that Johnson-Lindenstrauss lemma can yield - very conservative estimated of the required number of components - as it makes no assumption on the structure of the dataset. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
- """ - def __init__(self, - n_components=32, - random_state=None): - super(GaussianRandomProjectionHash, self).__init__( - n_components=n_components, - random_state=random_state) - - -def _array_of_arrays(list_of_arrays): - """Creates an array of array from list of arrays.""" - out = np.empty(len(list_of_arrays), dtype=object) - out[:] = list_of_arrays - return out - - -class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin): - """Performs approximate nearest neighbor search using LSH forest. - - LSH Forest: Locality Sensitive Hashing forest [1] is an alternative - method for vanilla approximate nearest neighbor search methods. - LSH forest data structure has been implemented using sorted - arrays and binary search and 32 bit fixed-length hashes. - Random projection is used as the hash family which approximates - cosine distance. - - The cosine distance is defined as ``1 - cosine_similarity``: the lowest - value is 0 (identical point) but it is bounded above by 2 for the farthest - points. Its value does not depend on the norm of the vector points but - only on their relative angles. - - Parameters - ---------- - - n_estimators : int (default = 10) - Number of trees in the LSH Forest. - - radius : float, optinal (default = 1.0) - Radius from the data point to its neighbors. This is the parameter - space to use by default for the :meth:`radius_neighbors` queries. - - n_candidates : int (default = 50) - Minimum number of candidates evaluated per estimator, assuming enough - items meet the `min_hash_match` constraint. - - n_neighbors : int (default = 5) - Number of neighbors to be returned from query function when - it is not provided to the :meth:`kneighbors` method. - - min_hash_match : int (default = 4) - lowest hash length to be searched when candidate selection is - performed for nearest neighbors. - - radius_cutoff_ratio : float, optional (default = 0.9) - A value ranges from 0 to 1. Radius neighbors will be searched until - the ratio between total neighbors within the radius and the total - candidates becomes less than this value unless it is terminated by - hash length reaching `min_hash_match`. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - - hash_functions_ : list of GaussianRandomProjectionHash objects - Hash function g(p,x) for a tree is an array of 32 randomly generated - float arrays with the same dimension as the data set. This array is - stored in GaussianRandomProjectionHash object and can be obtained - from ``components_`` attribute. - - trees_ : array, shape (n_estimators, n_samples) - Each tree (corresponding to a hash function) contains an array of - sorted hashed values. The array representation may change in future - versions. - - original_indices_ : array, shape (n_estimators, n_samples) - Original indices of sorted hashed values in the fitted index. - - References - ---------- - - .. [1] M. Bawa, T. Condie and P. Ganesan, "LSH Forest: Self-Tuning - Indexes for Similarity Search", WWW '05 Proceedings of the - 14th international conference on World Wide Web, 651-660, - 2005. 
- - Examples - -------- - >>> from sklearn.neighbors import LSHForest - - >>> X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]] - >>> X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]] - >>> lshf = LSHForest(random_state=42) # doctest: +SKIP - >>> lshf.fit(X_train) # doctest: +SKIP - LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, - n_neighbors=5, radius=1.0, radius_cutoff_ratio=0.9, - random_state=42) - >>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2) - ... # doctest: +SKIP - >>> distances # doctest: +SKIP - array([[0.069..., 0.149...], - [0.229..., 0.481...], - [0.004..., 0.014...]]) - >>> indices # doctest: +SKIP - array([[1, 2], - [2, 0], - [4, 0]]) - - """ - - def __init__(self, n_estimators=10, radius=1.0, n_candidates=50, - n_neighbors=5, min_hash_match=4, radius_cutoff_ratio=.9, - random_state=None): - self.n_estimators = n_estimators - self.radius = radius - self.random_state = random_state - self.n_candidates = n_candidates - self.n_neighbors = n_neighbors - self.min_hash_match = min_hash_match - self.radius_cutoff_ratio = radius_cutoff_ratio - - warnings.warn("LSHForest has poor performance and has been deprecated " - "in 0.19. It will be removed in version 0.21.", - DeprecationWarning) - - def _compute_distances(self, query, candidates): - """Computes the cosine distance. - - Distance is from the query to points in the candidates array. - Returns argsort of distances in the candidates - array and sorted distances. - """ - if candidates.shape == (0,): - # needed since _fit_X[np.array([])] doesn't work if _fit_X sparse - return np.empty(0, dtype=np.int), np.empty(0, dtype=float) - - if sparse.issparse(self._fit_X): - candidate_X = self._fit_X[candidates] - else: - candidate_X = self._fit_X.take(candidates, axis=0, mode='clip') - distances = pairwise_distances(query, candidate_X, - metric='cosine')[0] - distance_positions = np.argsort(distances) - distances = distances.take(distance_positions, mode='clip', axis=0) - return distance_positions, distances - - def _generate_masks(self): - """Creates left and right masks for all hash lengths.""" - tri_size = MAX_HASH_SIZE + 1 - # Called once on fitting, output is independent of hashes - left_mask = np.tril(np.ones((tri_size, tri_size), dtype=int))[:, 1:] - right_mask = left_mask[::-1, ::-1] - - self._left_mask = np.packbits(left_mask).view(dtype=HASH_DTYPE) - self._right_mask = np.packbits(right_mask).view(dtype=HASH_DTYPE) - - def _get_candidates(self, query, max_depth, bin_queries, n_neighbors): - """Performs the Synchronous ascending phase. - - Returns an array of candidates, their distance ranks and - distances. 
- """ - index_size = self._fit_X.shape[0] - # Number of candidates considered including duplicates - # XXX: not sure whether this is being calculated correctly wrt - # duplicates from different iterations through a single tree - n_candidates = 0 - candidate_set = set() - min_candidates = self.n_candidates * self.n_estimators - while (max_depth > self.min_hash_match and - (n_candidates < min_candidates or - len(candidate_set) < n_neighbors)): - - left_mask = self._left_mask[max_depth] - right_mask = self._right_mask[max_depth] - for i in range(self.n_estimators): - start, stop = _find_matching_indices(self.trees_[i], - bin_queries[i], - left_mask, right_mask) - n_candidates += stop - start - candidate_set.update( - self.original_indices_[i][start:stop].tolist()) - max_depth -= 1 - - candidates = np.fromiter(candidate_set, count=len(candidate_set), - dtype=np.intp) - # For insufficient candidates, candidates are filled. - # Candidates are filled from unselected indices uniformly. - if candidates.shape[0] < n_neighbors: - warnings.warn( - "Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (n_neighbors, self.min_hash_match)) - remaining = np.setdiff1d(np.arange(0, index_size), candidates) - to_fill = n_neighbors - candidates.shape[0] - candidates = np.concatenate((candidates, remaining[:to_fill])) - - ranks, distances = self._compute_distances(query, - candidates.astype(int)) - - return (candidates[ranks[:n_neighbors]], - distances[:n_neighbors]) - - def _get_radius_neighbors(self, query, max_depth, bin_queries, radius): - """Finds radius neighbors from the candidates obtained. - - Their distances from query are smaller than radius. - Returns radius neighbors and distances. - """ - ratio_within_radius = 1 - threshold = 1 - self.radius_cutoff_ratio - total_candidates = np.array([], dtype=int) - total_neighbors = np.array([], dtype=int) - total_distances = np.array([], dtype=float) - - while (max_depth > self.min_hash_match and - ratio_within_radius > threshold): - left_mask = self._left_mask[max_depth] - right_mask = self._right_mask[max_depth] - candidates = [] - for i in range(self.n_estimators): - start, stop = _find_matching_indices(self.trees_[i], - bin_queries[i], - left_mask, right_mask) - candidates.extend( - self.original_indices_[i][start:stop].tolist()) - candidates = np.setdiff1d(candidates, total_candidates) - total_candidates = np.append(total_candidates, candidates) - ranks, distances = self._compute_distances(query, candidates) - m = np.searchsorted(distances, radius, side='right') - positions = np.searchsorted(total_distances, distances[:m]) - total_neighbors = np.insert(total_neighbors, positions, - candidates[ranks[:m]]) - total_distances = np.insert(total_distances, positions, - distances[:m]) - ratio_within_radius = (total_neighbors.shape[0] / - float(total_candidates.shape[0])) - max_depth = max_depth - 1 - return total_neighbors, total_distances - - def fit(self, X, y=None): - """Fit the LSH forest on the data. - - This creates binary hashes of input data points by getting the - dot product of input points and hash_function then - transforming the projection into a binary string array based - on the sign (positive/negative) of the projection. - A sorted array of binary hashes is created. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. 
Each row - corresponds to a single data point. - - Returns - ------- - self : object - """ - - self._fit_X = check_array(X, accept_sparse='csr') - - # Creates a g(p,x) for each tree - self.hash_functions_ = [] - self.trees_ = [] - self.original_indices_ = [] - - rng = check_random_state(self.random_state) - int_max = np.iinfo(np.int32).max - - for i in range(self.n_estimators): - # This is g(p,x) for a particular tree. - # Builds a single tree. Hashing is done on an array of data points. - # `GaussianRandomProjection` is used for hashing. - # `n_components=hash size and n_features=n_dim. - hasher = GaussianRandomProjectionHash(MAX_HASH_SIZE, - rng.randint(0, int_max)) - hashes = hasher.fit_transform(self._fit_X)[:, 0] - original_index = np.argsort(hashes) - bin_hashes = hashes[original_index] - self.original_indices_.append(original_index) - self.trees_.append(bin_hashes) - self.hash_functions_.append(hasher) - - self._generate_masks() - - return self - - def _query(self, X): - """Performs descending phase to find maximum depth.""" - # Calculate hashes of shape (n_samples, n_estimators, [hash_size]) - bin_queries = np.asarray([hasher.transform(X)[:, 0] - for hasher in self.hash_functions_]) - bin_queries = np.rollaxis(bin_queries, 1) - - # descend phase - depths = [_find_longest_prefix_match(tree, tree_queries, MAX_HASH_SIZE, - self._left_mask, self._right_mask) - for tree, tree_queries in zip(self.trees_, - np.rollaxis(bin_queries, 1))] - - return bin_queries, np.max(depths, axis=0) - - def kneighbors(self, X, n_neighbors=None, return_distance=True): - """Returns n_neighbors of approximate nearest neighbors. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single query. - - n_neighbors : int, optional (default = None) - Number of neighbors required. If not provided, this will - return the number specified at the initialization. - - return_distance : boolean, optional (default = True) - Returns the distances of neighbors if set to True. - - Returns - ------- - dist : array, shape (n_samples, n_neighbors) - Array representing the cosine distances to each point, - only present if return_distance=True. - - ind : array, shape (n_samples, n_neighbors) - Indices of the approximate nearest points in the population - matrix. - """ - if not hasattr(self, 'hash_functions_'): - raise ValueError("estimator should be fitted.") - - if n_neighbors is None: - n_neighbors = self.n_neighbors - - X = check_array(X, accept_sparse='csr') - - neighbors, distances = [], [] - bin_queries, max_depth = self._query(X) - for i in range(X.shape[0]): - - neighs, dists = self._get_candidates(X[[i]], max_depth[i], - bin_queries[i], - n_neighbors) - neighbors.append(neighs) - distances.append(dists) - - if return_distance: - return np.array(distances), np.array(neighbors) - else: - return np.array(neighbors) - - def radius_neighbors(self, X, radius=None, return_distance=True): - """Finds the neighbors within a given radius of a point or points. - - Return the indices and distances of some points from the dataset - lying in a ball with size ``radius`` around the points of the query - array. Points lying on the boundary are included in the results. - - The result points are *not* necessarily sorted by distance to their - query point. - - LSH Forest being an approximate method, some true neighbors from the - indexed dataset might be missing from the results. 
- - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single query. - - radius : float - Limiting distance of neighbors to return. - (default is the value passed to the constructor). - - return_distance : boolean, optional (default = False) - Returns the distances of neighbors if set to True. - - Returns - ------- - dist : array, shape (n_samples,) of arrays - Each element is an array representing the cosine distances - to some points found within ``radius`` of the respective query. - Only present if ``return_distance=True``. - - ind : array, shape (n_samples,) of arrays - Each element is an array of indices for neighbors within ``radius`` - of the respective query. - """ - if not hasattr(self, 'hash_functions_'): - raise ValueError("estimator should be fitted.") - - if radius is None: - radius = self.radius - - X = check_array(X, accept_sparse='csr') - - neighbors, distances = [], [] - bin_queries, max_depth = self._query(X) - for i in range(X.shape[0]): - - neighs, dists = self._get_radius_neighbors(X[[i]], max_depth[i], - bin_queries[i], radius) - neighbors.append(neighs) - distances.append(dists) - - if return_distance: - return _array_of_arrays(distances), _array_of_arrays(neighbors) - else: - return _array_of_arrays(neighbors) - - def partial_fit(self, X, y=None): - """ - Inserts new data into the already fitted LSH Forest. - Cost is proportional to new total size, so additions - should be batched. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - New data point to be inserted into the LSH Forest. - """ - X = check_array(X, accept_sparse='csr') - if not hasattr(self, 'hash_functions_'): - return self.fit(X) - - if X.shape[1] != self._fit_X.shape[1]: - raise ValueError("Number of features in X and" - " fitted array does not match.") - n_samples = X.shape[0] - n_indexed = self._fit_X.shape[0] - - for i in range(self.n_estimators): - bin_X = self.hash_functions_[i].transform(X)[:, 0] - # gets the position to be added in the tree. - positions = self.trees_[i].searchsorted(bin_X) - # adds the hashed value into the tree. - self.trees_[i] = np.insert(self.trees_[i], - positions, bin_X) - # add the entry into the original_indices_. - self.original_indices_[i] = np.insert(self.original_indices_[i], - positions, - np.arange(n_indexed, - n_indexed + - n_samples)) - - # adds the entry into the input_array. - if sparse.issparse(X) or sparse.issparse(self._fit_X): - self._fit_X = sparse.vstack((self._fit_X, X)) - else: - self._fit_X = np.row_stack((self._fit_X, X)) - - return self diff --git a/sklearn/neighbors/tests/test_approximate.py b/sklearn/neighbors/tests/test_approximate.py deleted file mode 100644 index 1536271897625..0000000000000 --- a/sklearn/neighbors/tests/test_approximate.py +++ /dev/null @@ -1,498 +0,0 @@ -""" -Testing for the approximate neighbor search using -Locality Sensitive Hashing Forest module -(sklearn.neighbors.LSHForest). 
-""" - -# Author: Maheshakya Wijewardena, Joel Nothman - -import numpy as np -import scipy.sparse as sp - -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing import assert_greater -from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import ignore_warnings - -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.neighbors import LSHForest -from sklearn.neighbors import NearestNeighbors - - -def test_lsh_forest_deprecation(): - assert_warns_message(DeprecationWarning, - "LSHForest has poor performance and has been " - "deprecated in 0.19. It will be removed " - "in version 0.21.", LSHForest) - - -def test_neighbors_accuracy_with_n_candidates(): - # Checks whether accuracy increases as `n_candidates` increases. - n_candidates_values = np.array([.1, 50, 500]) - n_samples = 100 - n_features = 10 - n_iter = 10 - n_points = 5 - rng = np.random.RandomState(42) - accuracies = np.zeros(n_candidates_values.shape[0], dtype=float) - X = rng.rand(n_samples, n_features) - - for i, n_candidates in enumerate(n_candidates_values): - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_candidates=n_candidates, random_state=0) - ignore_warnings(lshf.fit)(X) - for j in range(n_iter): - query = X[rng.randint(0, n_samples)].reshape(1, -1) - - neighbors = lshf.kneighbors(query, n_neighbors=n_points, - return_distance=False) - distances = pairwise_distances(query, X, metric='cosine') - ranks = np.argsort(distances)[0, :n_points] - - intersection = np.intersect1d(ranks, neighbors).shape[0] - ratio = intersection / float(n_points) - accuracies[i] = accuracies[i] + ratio - - accuracies[i] = accuracies[i] / float(n_iter) - # Sorted accuracies should be equal to original accuracies - print('accuracies:', accuracies) - assert_true(np.all(np.diff(accuracies) >= 0), - msg="Accuracies are not non-decreasing.") - # Highest accuracy should be strictly greater than the lowest - assert_true(np.ptp(accuracies) > 0, - msg="Highest accuracy is not strictly greater than lowest.") - - -def test_neighbors_accuracy_with_n_estimators(): - # Checks whether accuracy increases as `n_estimators` increases. 
- n_estimators = np.array([1, 10, 100]) - n_samples = 100 - n_features = 10 - n_iter = 10 - n_points = 5 - rng = np.random.RandomState(42) - accuracies = np.zeros(n_estimators.shape[0], dtype=float) - X = rng.rand(n_samples, n_features) - - for i, t in enumerate(n_estimators): - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_candidates=500, n_estimators=t) - ignore_warnings(lshf.fit)(X) - for j in range(n_iter): - query = X[rng.randint(0, n_samples)].reshape(1, -1) - neighbors = lshf.kneighbors(query, n_neighbors=n_points, - return_distance=False) - distances = pairwise_distances(query, X, metric='cosine') - ranks = np.argsort(distances)[0, :n_points] - - intersection = np.intersect1d(ranks, neighbors).shape[0] - ratio = intersection / float(n_points) - accuracies[i] = accuracies[i] + ratio - - accuracies[i] = accuracies[i] / float(n_iter) - # Sorted accuracies should be equal to original accuracies - assert_true(np.all(np.diff(accuracies) >= 0), - msg="Accuracies are not non-decreasing.") - # Highest accuracy should be strictly greater than the lowest - assert_true(np.ptp(accuracies) > 0, - msg="Highest accuracy is not strictly greater than lowest.") - - -@ignore_warnings -def test_kneighbors(): - # Checks whether desired number of neighbors are returned. - # It is guaranteed to return the requested number of neighbors - # if `min_hash_match` is set to 0. Returned distances should be - # in ascending order. - n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0) - # Test unfitted estimator - assert_raises(ValueError, lshf.kneighbors, X[0]) - - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - n_neighbors = rng.randint(0, n_samples) - query = X[rng.randint(0, n_samples)].reshape(1, -1) - neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors, - return_distance=False) - # Desired number of neighbors should be returned. - assert_equal(neighbors.shape[1], n_neighbors) - - # Multiple points - n_queries = 5 - queries = X[rng.randint(0, n_samples, n_queries)] - distances, neighbors = lshf.kneighbors(queries, - n_neighbors=1, - return_distance=True) - assert_equal(neighbors.shape[0], n_queries) - assert_equal(distances.shape[0], n_queries) - # Test only neighbors - neighbors = lshf.kneighbors(queries, n_neighbors=1, - return_distance=False) - assert_equal(neighbors.shape[0], n_queries) - # Test random point(not in the data set) - query = rng.randn(n_features).reshape(1, -1) - lshf.kneighbors(query, n_neighbors=1, - return_distance=False) - # Test n_neighbors at initialization - neighbors = lshf.kneighbors(query, return_distance=False) - assert_equal(neighbors.shape[1], 5) - # Test `neighbors` has an integer dtype - assert_true(neighbors.dtype.kind == 'i', - msg="neighbors are not in integer dtype.") - - -def test_radius_neighbors(): - # Checks whether Returned distances are less than `radius` - # At least one point should be returned when the `radius` is set - # to mean distance from the considering point to other points in - # the database. - # Moreover, this test compares the radius neighbors of LSHForest - # with the `sklearn.neighbors.NearestNeighbors`. 
- n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - # Test unfitted estimator - assert_raises(ValueError, lshf.radius_neighbors, X[0]) - - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - # Select a random point in the dataset as the query - query = X[rng.randint(0, n_samples)].reshape(1, -1) - - # At least one neighbor should be returned when the radius is the - # mean distance from the query to the points of the dataset. - mean_dist = np.mean(pairwise_distances(query, X, metric='cosine')) - neighbors = lshf.radius_neighbors(query, radius=mean_dist, - return_distance=False) - - assert_equal(neighbors.shape, (1,)) - assert_equal(neighbors.dtype, object) - assert_greater(neighbors[0].shape[0], 0) - # All distances to points in the results of the radius query should - # be less than mean_dist - distances, neighbors = lshf.radius_neighbors(query, - radius=mean_dist, - return_distance=True) - assert_array_less(distances[0], mean_dist) - - # Multiple points - n_queries = 5 - queries = X[rng.randint(0, n_samples, n_queries)] - distances, neighbors = lshf.radius_neighbors(queries, - return_distance=True) - - # dists and inds should not be 1D arrays or arrays of variable lengths - # hence the use of the object dtype. - assert_equal(distances.shape, (n_queries,)) - assert_equal(distances.dtype, object) - assert_equal(neighbors.shape, (n_queries,)) - assert_equal(neighbors.dtype, object) - - # Compare with exact neighbor search - query = X[rng.randint(0, n_samples)].reshape(1, -1) - mean_dist = np.mean(pairwise_distances(query, X, metric='cosine')) - nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X) - - distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist) - distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist) - - # The following fails on some platforms. See #10244 - - # # Radius-based queries do not sort the result points and the order - # # depends on the method, the random_state and the dataset order. - # # We need to sort the results ourselves before performing any comparison. - # sorted_dists_exact = np.sort(distances_exact[0]) - # sorted_dists_approx = np.sort(distances_approx[0]) - # - # # Distances to exact neighbors are less than or equal to approximate - # # counterparts as the approximate radius query might have missed some - # # closer neighbors. - # - # assert_true(np.all(np.less_equal(sorted_dists_exact, - # sorted_dists_approx))) - - -@ignore_warnings -def test_radius_neighbors_boundary_handling(): - X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]] - n_points = len(X) - - # Build an exact nearest neighbors model as reference model to ensure - # consistency between exact and approximate methods - nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X) - - # Build a LSHForest model with hyperparameter values that always guarantee - # exact results on this toy dataset. 
- lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0, n_candidates=n_points, random_state=42).fit(X) - - # define a query aligned with the first axis - query = [[1., 0.]] - - # Compute the exact cosine distances of the query to the four points of - # the dataset - dists = pairwise_distances(query, X, metric='cosine').ravel() - - # The first point is almost aligned with the query (very small angle), - # the cosine distance should therefore be almost null: - assert_almost_equal(dists[0], 0, decimal=5) - - # The second point form an angle of 45 degrees to the query vector - assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4)) - - # The third point is orthogonal from the query vector hence at a distance - # exactly one: - assert_almost_equal(dists[2], 1) - - # The last point is almost colinear but with opposite sign to the query - # therefore it has a cosine 'distance' very close to the maximum possible - # value of 2. - assert_almost_equal(dists[3], 2, decimal=5) - - # If we query with a radius of one, all the samples except the last sample - # should be included in the results. This means that the third sample - # is lying on the boundary of the radius query: - exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1) - approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1) - - assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2]) - assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2]) - assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1]) - assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1]) - - # If we perform the same query with a slightly lower radius, the third - # point of the dataset that lay on the boundary of the previous query - # is now rejected: - eps = np.finfo(np.float64).eps - exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps) - approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps) - - assert_array_equal(np.sort(exact_idx[0]), [0, 1]) - assert_array_equal(np.sort(approx_idx[0]), [0, 1]) - assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2]) - assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2]) - - -def test_distances(): - # Checks whether returned neighbors are from closest to farthest. - n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - n_neighbors = rng.randint(0, n_samples) - query = X[rng.randint(0, n_samples)].reshape(1, -1) - distances, neighbors = lshf.kneighbors(query, - n_neighbors=n_neighbors, - return_distance=True) - - # Returned neighbors should be from closest to farthest, that is - # increasing distance values. - assert_true(np.all(np.diff(distances[0]) >= 0)) - - # Note: the radius_neighbors method does not guarantee the order of - # the results. - - -def test_fit(): - # Checks whether `fit` method sets all attribute values correctly. 
- n_samples = 12 - n_features = 2 - n_estimators = 5 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_estimators=n_estimators) - ignore_warnings(lshf.fit)(X) - - # _input_array = X - assert_array_equal(X, lshf._fit_X) - # A hash function g(p) for each tree - assert_equal(n_estimators, len(lshf.hash_functions_)) - # Hash length = 32 - assert_equal(32, lshf.hash_functions_[0].components_.shape[0]) - # Number of trees_ in the forest - assert_equal(n_estimators, len(lshf.trees_)) - # Each tree has entries for every data point - assert_equal(n_samples, len(lshf.trees_[0])) - # Original indices after sorting the hashes - assert_equal(n_estimators, len(lshf.original_indices_)) - # Each set of original indices in a tree has entries for every data point - assert_equal(n_samples, len(lshf.original_indices_[0])) - - -def test_partial_fit(): - # Checks whether inserting array is consistent with fitted data. - # `partial_fit` method should set all attribute values correctly. - n_samples = 12 - n_samples_partial_fit = 3 - n_features = 2 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - X_partial_fit = rng.rand(n_samples_partial_fit, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - - # Test unfitted estimator - ignore_warnings(lshf.partial_fit)(X) - assert_array_equal(X, lshf._fit_X) - - ignore_warnings(lshf.fit)(X) - - # Insert wrong dimension - assert_raises(ValueError, lshf.partial_fit, - np.random.randn(n_samples_partial_fit, n_features - 1)) - - ignore_warnings(lshf.partial_fit)(X_partial_fit) - - # size of _input_array = samples + 1 after insertion - assert_equal(lshf._fit_X.shape[0], - n_samples + n_samples_partial_fit) - # size of original_indices_[1] = samples + 1 - assert_equal(len(lshf.original_indices_[0]), - n_samples + n_samples_partial_fit) - # size of trees_[1] = samples + 1 - assert_equal(len(lshf.trees_[1]), - n_samples + n_samples_partial_fit) - - -def test_hash_functions(): - # Checks randomness of hash functions. - # Variance and mean of each hash function (projection vector) - # should be different from flattened array of hash functions. - # If hash functions are not randomly built (seeded with - # same value), variances and means of all functions are equal. - n_samples = 12 - n_features = 2 - n_estimators = 5 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_estimators=n_estimators, - random_state=rng.randint(0, np.iinfo(np.int32).max)) - ignore_warnings(lshf.fit)(X) - - hash_functions = [] - for i in range(n_estimators): - hash_functions.append(lshf.hash_functions_[i].components_) - - for i in range(n_estimators): - assert_not_equal(np.var(hash_functions), - np.var(lshf.hash_functions_[i].components_)) - - for i in range(n_estimators): - assert_not_equal(np.mean(hash_functions), - np.mean(lshf.hash_functions_[i].components_)) - - -def test_candidates(): - # Checks whether candidates are sufficient. - # This should handle the cases when number of candidates is 0. - # User should be warned when number of candidates is less than - # requested number of neighbors. 
- X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], - [6, 10, 2]], dtype=np.float32) - X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1) - - # For zero candidates - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=32) - ignore_warnings(lshf.fit)(X_train) - - message = ("Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (3, 32)) - assert_warns_message(UserWarning, message, lshf.kneighbors, - X_test, n_neighbors=3) - distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3) - assert_equal(distances.shape[1], 3) - - # For candidates less than n_neighbors - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=31) - ignore_warnings(lshf.fit)(X_train) - - message = ("Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (5, 31)) - assert_warns_message(UserWarning, message, lshf.kneighbors, - X_test, n_neighbors=5) - distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5) - assert_equal(distances.shape[1], 5) - - -def test_graphs(): - # Smoke tests for graph methods. - n_samples_sizes = [5, 10, 20] - n_features = 3 - rng = np.random.RandomState(42) - - for n_samples in n_samples_sizes: - X = rng.rand(n_samples, n_features) - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0) - ignore_warnings(lshf.fit)(X) - - kneighbors_graph = lshf.kneighbors_graph(X) - radius_neighbors_graph = lshf.radius_neighbors_graph(X) - - assert_equal(kneighbors_graph.shape[0], n_samples) - assert_equal(kneighbors_graph.shape[1], n_samples) - assert_equal(radius_neighbors_graph.shape[0], n_samples) - assert_equal(radius_neighbors_graph.shape[1], n_samples) - - -def test_sparse_input(): - X1 = sp.rand(50, 100, random_state=0) - X2 = sp.rand(10, 100, random_state=1) - forest_sparse = ignore_warnings(LSHForest, category=DeprecationWarning)( - radius=1, random_state=0).fit(X1) - forest_dense = ignore_warnings(LSHForest, category=DeprecationWarning)( - radius=1, random_state=0).fit(X1.A) - - d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True) - d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True) - - assert_almost_equal(d_sparse, d_dense) - assert_almost_equal(i_sparse, i_dense) - - d_sparse, i_sparse = forest_sparse.radius_neighbors(X2, - return_distance=True) - d_dense, i_dense = forest_dense.radius_neighbors(X2.A, - return_distance=True) - assert_equal(d_sparse.shape, d_dense.shape) - for a, b in zip(d_sparse, d_dense): - assert_almost_equal(a, b) - for a, b in zip(i_sparse, i_dense): - assert_almost_equal(a, b) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 15905bf37d2e5..d1d69bde6f4a8 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -37,8 +37,6 @@ from .imputation import Imputer -# stub, remove in version 0.21 -from .data import CategoricalEncoder # noqa __all__ = [ 'Binarizer', diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 0c79543338212..93afcc646e3fb 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -136,7 +136,7 @@ def fit(self, X, y=None): self._check_inverse_transform(X) return self - def 
transform(self, X, y='deprecated'): + def transform(self, X): """Transform X using the forward function. Parameters @@ -144,22 +144,14 @@ def transform(self, X, y='deprecated'): X : array-like, shape (n_samples, n_features) Input array. - y : (ignored) - .. deprecated::0.19 - Returns ------- X_out : array-like, shape (n_samples, n_features) Transformed input. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - - return self._transform(X, y=y, func=self.func, kw_args=self.kw_args) + return self._transform(X, func=self.func, kw_args=self.kw_args) - def inverse_transform(self, X, y='deprecated'): + def inverse_transform(self, X): """Transform X using the inverse function. Parameters @@ -167,35 +159,18 @@ def inverse_transform(self, X, y='deprecated'): X : array-like, shape (n_samples, n_features) Input array. - y : (ignored) - .. deprecated::0.19 - Returns ------- X_out : array-like, shape (n_samples, n_features) Transformed input. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on inverse_transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - return self._transform(X, y=y, func=self.inverse_func, + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) - def _transform(self, X, y=None, func=None, kw_args=None): + def _transform(self, X, func=None, kw_args=None): X = self._check_input(X) if func is None: func = _identity - if (not isinstance(self.pass_y, string_types) or - self.pass_y != 'deprecated'): - # We do this to know if pass_y was set to False / True - pass_y = self.pass_y - warnings.warn("The parameter pass_y is deprecated since 0.19 and " - "will be removed in 0.21", DeprecationWarning) - else: - pass_y = False - - return func(X, *((y,) if pass_y else ()), - **(kw_args if kw_args else {})) + return func(X, **(kw_args if kw_args else {})) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0a33f9140f902..9b3eaa98e4c08 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -718,24 +718,16 @@ def partial_fit(self, X, y=None): return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Perform standardization by centering and scaling Parameters ---------- X : array-like, shape [n_samples, n_features] The data used to scale along the features axis. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. copy : bool, optional (default: None) Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy @@ -1655,7 +1647,7 @@ def fit(self, X, y=None): X = check_array(X, accept_sparse='csr') return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Scale each non zero row of X to unit norm Parameters @@ -1663,17 +1655,9 @@ def transform(self, X, y='deprecated', copy=None): X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to normalize, row by row. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. 
copy : bool, optional (default: None) Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr') return normalize(X, norm=self.norm, axis=1, copy=copy) @@ -1794,7 +1778,7 @@ def fit(self, X, y=None): check_array(X, accept_sparse='csr') return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Binarize each element of X Parameters @@ -1803,17 +1787,10 @@ def transform(self, X, y='deprecated', copy=None): The data to binarize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : bool Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy) @@ -1872,16 +1849,14 @@ def fit(self, K, y=None): self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples return self - def transform(self, K, y='deprecated', copy=True): + def transform(self, K, copy=True): """Center kernel matrix. Parameters ---------- K : numpy array of shape [n_samples1, n_samples2] Kernel matrix. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : boolean, optional, default True Set to False to perform inplace computation. @@ -1889,11 +1864,6 @@ def transform(self, K, y='deprecated', copy=True): ------- K_new : numpy array of shape [n_samples1, n_samples2] """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'K_fit_all_') K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) @@ -2902,18 +2872,4 @@ def power_transform(X, method='box-cox', standardize=True, copy=True): Royal Statistical Society B, 26, 211-252 (1964). """ pt = PowerTransformer(method=method, standardize=standardize, copy=copy) - return pt.fit_transform(X) - - -class CategoricalEncoder: - """ - CategoricalEncoder briefly existed in 0.20dev. Its functionality - has been rolled into the OneHotEncoder and OrdinalEncoder. - This stub will be removed in version 0.21. - """ - - def __init__(*args, **kwargs): - raise RuntimeError( - "CategoricalEncoder briefly existed in 0.20dev. Its functionality " - "has been rolled into the OneHotEncoder and OrdinalEncoder. " - "This stub will be removed in version 0.21.") + return pt.fit_transform(X) \ No newline at end of file diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index ff32005399fe2..081e54fbb0dfb 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -322,13 +322,6 @@ class LabelPropagation(BaseLabelPropagation): n_neighbors : integer > 0 Parameter for knn kernel - alpha : float - Clamping factor. - - .. deprecated:: 0.19 - This parameter will be removed in 0.21. - 'alpha' is fixed to zero in 'LabelPropagation'. 
- max_iter : integer Change maximum number of iterations allowed @@ -388,10 +381,10 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, - alpha=None, max_iter=1000, tol=1e-3, n_jobs=None): + max_iter=1000, tol=1e-3, n_jobs=None): super(LabelPropagation, self).__init__( - kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, - max_iter=max_iter, tol=tol, n_jobs=n_jobs) + kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, + max_iter=max_iter, tol=tol, n_jobs=n_jobs, alpha=None) def _build_graph(self): """Matrix representing a fully connected graph between each sample @@ -410,12 +403,6 @@ class distributions will exceed 1 (normalization may be desired). return affinity_matrix def fit(self, X, y): - if self.alpha is not None: - warnings.warn( - "alpha is deprecated since 0.19 and will be removed in 0.21.", - DeprecationWarning - ) - self.alpha = None return super(LabelPropagation, self).fit(X, y) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 6e509949b0a88..4cb8f5d148b04 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -316,20 +316,6 @@ def test_qda_store_covariance(): ) -def test_qda_deprecation(): - # Test the deprecation - clf = QuadraticDiscriminantAnalysis(store_covariances=True) - assert_warns_message(DeprecationWarning, "'store_covariances' was renamed" - " to store_covariance in version 0.19 and will be " - "removed in 0.21.", clf.fit, X, y) - - # check that covariance_ (and covariances_ with warning) is stored - assert_warns_message(DeprecationWarning, "Attribute ``covariances_`` was " - "deprecated in version 0.19 and will be removed " - "in 0.21. Use ``covariance_`` instead", getattr, clf, - 'covariances_') - - def test_qda_regularization(): # the default is reg_param=0. and will cause issues # when there is a constant variable diff --git a/sklearn/utils/arpack.py b/sklearn/utils/arpack.py deleted file mode 100644 index 0343f7243ebdb..0000000000000 --- a/sklearn/utils/arpack.py +++ /dev/null @@ -1,23 +0,0 @@ -# Remove this module in version 0.21 - -from scipy.sparse.linalg import eigs as _eigs, eigsh as _eigsh, svds as _svds - -from .deprecation import deprecated - - -@deprecated("sklearn.utils.arpack.eigs was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.eigs instead.") -def eigs(A, *args, **kwargs): - return _eigs(A, *args, **kwargs) - - -@deprecated("sklearn.utils.arpack.eigsh was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.eigsh instead.") -def eigsh(A, *args, **kwargs): - return _eigsh(A, *args, **kwargs) - - -@deprecated("sklearn.utils.arpack.svds was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.svds instead.") -def svds(A, *args, **kwargs): - return _svds(A, *args, **kwargs) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 95e464f071644..07a83a17377b5 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -26,17 +26,6 @@ from .validation import check_array -@deprecated("sklearn.utils.extmath.norm was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.linalg.norm instead.") -def norm(x): - """Compute the Euclidean or Frobenius norm of x. - - Returns the Euclidean norm when x is a vector, the Frobenius norm when x - is a matrix (2-d array). More precise than sqrt(squared_norm(x)). 
- """ - return linalg.norm(x) - - def squared_norm(x): """Squared Euclidean or Frobenius norm of x. @@ -119,12 +108,6 @@ def _impose_f_order(X): return check_array(X, copy=False, order='F'), False -@deprecated("sklearn.utils.extmath.fast_dot was deprecated in version 0.19 " - "and will be removed in 0.21. Use the equivalent np.dot instead.") -def fast_dot(a, b, out=None): - return np.dot(a, b, out) - - def density(w, **kwargs): """Compute density of a sparse vector @@ -388,25 +371,6 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', return U[:, :n_components], s[:n_components], V[:n_components, :] -@deprecated("sklearn.utils.extmath.logsumexp was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.misc.logsumexp instead.") -def logsumexp(arr, axis=0): - """Computes the sum of arr assuming arr is in the log domain. - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - Examples - -------- - >>> import numpy as np - >>> from sklearn.utils.extmath import logsumexp - >>> a = np.arange(10) - >>> np.log(np.sum(np.exp(a))) - 9.458... - >>> logsumexp(a) # doctest: +SKIP - 9.458... - """ - return scipy_logsumexp(arr, axis) - - def weighted_mode(a, w, axis=0): """Returns an array of the weighted modal (most common) value in a @@ -480,12 +444,6 @@ def weighted_mode(a, w, axis=0): return mostfrequent, oldcounts -@deprecated("sklearn.utils.extmath.pinvh was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.linalg.pinvh instead.") -def pinvh(a, cond=None, rcond=None, lower=True): - return linalg.pinvh(a, cond, rcond, lower) - - def cartesian(arrays, out=None): """Generate a cartesian product of input arrays. diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 8bbebbd377451..17caa4fa2cb0d 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -68,17 +68,3 @@ def single_source_shortest_path_length(graph, source, cutoff=None): break level += 1 return seen # return all path lengths as dictionary - - -@deprecated("sklearn.utils.graph.connected_components was deprecated in " - "version 0.19 and will be removed in 0.21. Use " - "scipy.sparse.csgraph.connected_components instead.") -def connected_components(*args, **kwargs): - return csgraph.connected_components(*args, **kwargs) - - -@deprecated("sklearn.utils.graph.graph_laplacian was deprecated in version " - "0.19 and will be removed in 0.21. Use " - "scipy.sparse.csgraph.laplacian instead.") -def graph_laplacian(*args, **kwargs): - return csgraph.laplacian(*args, **kwargs) diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 24ddf4680c742..29d465fff8705 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -13,106 +13,6 @@ __all__ = ['sample_without_replacement', 'choice'] -# This is a backport of np.random.choice from numpy 1.7 -# The function can be removed when we bump the requirements to >=1.7 -@deprecated("sklearn.utils.random.choice was deprecated in version 0.19 " - "and will be removed in 0.21. Use np.random.choice or " - "np.random.RandomState.choice instead.") -def choice(a, size=None, replace=True, p=None, random_state=None): - """ - choice(a, size=None, replace=True, p=None) - - Generates a random sample from a given 1-D array - - .. versionadded:: 1.7.0 - - Parameters - ----------- - a : 1-D array-like or int - If an ndarray, a random sample is generated from its elements. 
- If an int, the random sample is generated as if a was np.arange(n) - - size : int or tuple of ints, optional - Output shape. Default is None, in which case a single value is - returned. - - replace : boolean, optional - Whether the sample is with or without replacement. - - p : 1-D array-like, optional - The probabilities associated with each entry in a. - If not given the sample assumes a uniform distribution over all - entries in a. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - - Returns - -------- - samples : 1-D ndarray, shape (size,) - The generated random samples - - Raises - ------- - ValueError - If a is an int and less than zero, if a or p are not 1-dimensional, - if a is an array-like of size 0, if p is not a vector of - probabilities, if a and p have different lengths, or if - replace=False and the sample size is greater than the population - size - - See Also - --------- - randint, shuffle, permutation - - Examples - --------- - Generate a uniform random sample from np.arange(5) of size 3: - - >>> np.random.choice(5, 3) # doctest: +SKIP - array([0, 3, 4]) - >>> #This is equivalent to np.random.randint(0,5,3) - - Generate a non-uniform random sample from np.arange(5) of size 3: - - >>> np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]) # doctest: +SKIP - array([3, 3, 0]) - - Generate a uniform random sample from np.arange(5) of size 3 without - replacement: - - >>> np.random.choice(5, 3, replace=False) # doctest: +SKIP - array([3,1,0]) - >>> #This is equivalent to np.random.shuffle(np.arange(5))[:3] - - Generate a non-uniform random sample from np.arange(5) of size - 3 without replacement: - - >>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]) - ... # doctest: +SKIP - array([2, 3, 0]) - - Any of the above can be repeated with an arbitrary array-like - instead of just integers. For instance: - - >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher'] - >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3]) - ... # doctest: +SKIP - array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], - dtype='|S11') - - """ - if random_state is not None: - random_state = check_random_state(random_state) - return random_state.choice(a, size, replace, p) - else: - return np.random.choice(a, size, replace, p) - - def random_choice_csc(n_samples, classes, class_probability=None, random_state=None): """Generate a sparse random matrix given column class distributions diff --git a/sklearn/utils/sparsetools/__init__.py b/sklearn/utils/sparsetools/__init__.py deleted file mode 100644 index a86598410e7fe..0000000000000 --- a/sklearn/utils/sparsetools/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Remove in version 0.21 - -from scipy.sparse.csgraph import connected_components as \ - scipy_connected_components - -from sklearn.utils.deprecation import deprecated - - -@deprecated("sklearn.utils.sparsetools.connected_components was deprecated in " - "version 0.19 and will be removed in 0.21. 
Use " - "scipy.sparse.csgraph.connected_components instead.") -def connected_components(*args, **kwargs): - return scipy_connected_components(*args, **kwargs) diff --git a/sklearn/utils/sparsetools/setup.py b/sklearn/utils/sparsetools/setup.py deleted file mode 100644 index 1ff3097b0db73..0000000000000 --- a/sklearn/utils/sparsetools/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -# Remove in version 0.21 - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration('sparsetools', parent_package, top_path) - config.add_subpackage('tests') - - return config - - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/sklearn/utils/sparsetools/tests/__init__.py b/sklearn/utils/sparsetools/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 82b8912b78824..458669e23eb3a 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -5,13 +5,6 @@ from sklearn.utils.deprecation import deprecated -# Remove in sklearn 0.21 -@deprecated("sklearn.utils.stats.rankdata was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.stats.rankdata instead.") -def rankdata(*args, **kwargs): - return scipy_rankdata(*args, **kwargs) - - def _weighted_percentile(array, sample_weight, percentile=50): """ Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 3de67e5a2130c..07431ed11c3bf 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -88,18 +88,6 @@ def test_random_weights(): assert_array_almost_equal(score.ravel(), w[:, :5].sum(1)) -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_logsumexp(): - # Try to add some smallish numbers in logspace - x = np.array([1e-40] * 1000000) - logx = np.log(x) - assert_almost_equal(np.exp(logsumexp(logx)), x.sum()) - - X = np.vstack([x, x]) - logX = np.vstack([logx, logx]) - assert_array_almost_equal(np.exp(logsumexp(logX, axis=0)), X.sum(axis=0)) - assert_array_almost_equal(np.exp(logsumexp(logX, axis=1)), X.sum(axis=1)) - def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd @@ -179,22 +167,6 @@ def test_randomized_svd_low_rank_all_dtypes(dtype): check_randomized_svd_low_rank(dtype) -@ignore_warnings # extmath.norm is deprecated to be removed in 0.21 -def test_norm_squared_norm(): - X = np.random.RandomState(42).randn(50, 63) - X *= 100 # check stability - X += 200 - - assert_almost_equal(np.linalg.norm(X.ravel()), norm(X)) - assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6) - assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6) - # Check the warning with an int array and np.dot potential overflow - assert_warns_message( - UserWarning, 'Array type is integer, np.dot may ' - 'overflow. 
Data should be float type to avoid this issue', - squared_norm, X.astype(int)) - - @pytest.mark.parametrize('dtype', (np.float32, np.float64)) def test_row_norms(dtype): diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py index 36e3bf72b609b..b6b43644e476e 100644 --- a/sklearn/utils/tests/test_stats.py +++ b/sklearn/utils/tests/test_stats.py @@ -12,12 +12,3 @@ ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]), ) - - -@pytest.mark.parametrize("values, method, expected", _cases) -def test_cases_rankdata(values, method, expected): - - # Test deprecated backport to be removed in 0.21 - with ignore_warnings(): - r = rankdata(values, method=method) - assert_array_equal(r, expected) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index c2474c58c13f7..840e08524c384 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -108,56 +108,6 @@ def test_safe_mask(): assert_equal(X_csr[mask].shape[0], 3) -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_simple_real(): - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]], dtype=np.float64) - a = np.dot(a, a.T) - a_pinv = pinvh(a) - assert_almost_equal(np.dot(a, a_pinv), np.eye(3)) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_nonpositive(): - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float64) - a = np.dot(a, a.T) - u, s, vt = np.linalg.svd(a) - s[0] *= -1 - a = np.dot(u * s, vt) # a is now symmetric non-positive and singular - a_pinv = pinv2(a) - a_pinvh = pinvh(a) - assert_almost_equal(a_pinv, a_pinvh) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_simple_complex(): - a = (np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]]) - + 1j * np.array([[10, 8, 7], [6, 5, 4], [3, 2, 1]])) - a = np.dot(a, a.conj().T) - a_pinv = pinvh(a) - assert_almost_equal(np.dot(a, a_pinv), np.eye(3)) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_arpack_eigsh_initialization(): - # Non-regression test that shows null-space computation is better with - # initialization of eigsh from [-1,1] instead of [0,1] - random_state = check_random_state(42) - - A = random_state.rand(50, 50) - A = np.dot(A.T, A) # create s.p.d. matrix - A = laplacian(A) + 1e-7 * np.identity(A.shape[0]) - k = 5 - - # Test if eigsh is working correctly - # New initialization [-1,1] (as in original ARPACK) - # Was [0,1] before, with which this test could fail - v0 = random_state.uniform(-1, 1, A.shape[0]) - w, _ = eigsh(A, k=k, sigma=0.0, v0=v0) - - # Eigenvalues of s.p.d. 
matrix should be nonnegative, w[0] is smallest - assert_greater_equal(w[0], 0) - - def test_column_or_1d(): EXAMPLES = [ ("binary", ["spam", "egg", "spam"]), From acf7659fdf565708db8ef971c8008befade19a05 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:11:32 -0400 Subject: [PATCH 02/11] fixed some inits and test imports --- sklearn/datasets/__init__.py | 2 -- sklearn/linear_model/__init__.py | 6 ------ sklearn/neighbors/__init__.py | 2 -- sklearn/utils/tests/test_extmath.py | 3 +-- sklearn/utils/tests/test_graph.py | 26 -------------------------- sklearn/utils/tests/test_stats.py | 14 -------------- sklearn/utils/tests/test_utils.py | 2 -- 7 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 sklearn/utils/tests/test_graph.py delete mode 100644 sklearn/utils/tests/test_stats.py diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c7d78e633493d..77dac99c1d970 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -17,7 +17,6 @@ from .base import clear_data_home from .covtype import fetch_covtype from .kddcup99 import fetch_kddcup99 -from .mlcomp import load_mlcomp from .lfw import fetch_lfw_pairs from .lfw import fetch_lfw_people from .twenty_newsgroups import fetch_20newsgroups @@ -75,7 +74,6 @@ 'load_iris', 'load_breast_cancer', 'load_linnerud', - 'load_mlcomp', 'load_sample_image', 'load_sample_images', 'load_svmlight_file', diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index f3100d45e2e66..2e01990ccce8c 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -31,9 +31,6 @@ from .passive_aggressive import PassiveAggressiveRegressor from .perceptron import Perceptron -from .randomized_l1 import (RandomizedLasso, RandomizedLogisticRegression, - lasso_stability_path) - from .ransac import RANSACRegressor from .theil_sen import TheilSenRegressor @@ -65,8 +62,6 @@ 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', 'Perceptron', - 'RandomizedLasso', - 'RandomizedLogisticRegression', 'Ridge', 'RidgeCV', 'RidgeClassifier', @@ -78,7 +73,6 @@ 'enet_path', 'lars_path', 'lasso_path', - 'lasso_stability_path', 'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram', diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 93c1bbbba0ba8..51116b3f470e6 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -12,7 +12,6 @@ from .regression import KNeighborsRegressor, RadiusNeighborsRegressor from .nearest_centroid import NearestCentroid from .kde import KernelDensity -from .approximate import LSHForest from .lof import LocalOutlierFactor from .base import VALID_METRICS, VALID_METRICS_SPARSE @@ -28,7 +27,6 @@ 'kneighbors_graph', 'radius_neighbors_graph', 'KernelDensity', - 'LSHForest', 'LocalOutlierFactor', 'VALID_METRICS', 'VALID_METRICS_SPARSE'] diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 07431ed11c3bf..d22ec5b886c89 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -27,8 +27,7 @@ from sklearn.utils.fixes import np_version from sklearn.utils.extmath import density -from sklearn.utils.extmath import logsumexp -from sklearn.utils.extmath import norm, squared_norm +from sklearn.utils.extmath import squared_norm from sklearn.utils.extmath import randomized_svd from sklearn.utils.extmath import row_norms from sklearn.utils.extmath import weighted_mode diff --git 
a/sklearn/utils/tests/test_graph.py b/sklearn/utils/tests/test_graph.py deleted file mode 100644 index ae1ce4a56cb8e..0000000000000 --- a/sklearn/utils/tests/test_graph.py +++ /dev/null @@ -1,26 +0,0 @@ -# Author: Gael Varoquaux -# License: BSD 3 clause - -import numpy as np -from scipy import sparse - -from sklearn.utils.graph import graph_laplacian -from sklearn.utils.testing import ignore_warnings - - -@ignore_warnings(category=DeprecationWarning) -def test_graph_laplacian(): - for mat in (np.arange(10) * np.arange(10)[:, np.newaxis], - np.ones((7, 7)), - np.eye(19), - np.vander(np.arange(4)) + np.vander(np.arange(4)).T,): - sp_mat = sparse.csr_matrix(mat) - for normed in (True, False): - laplacian = graph_laplacian(mat, normed=normed) - n_nodes = mat.shape[0] - if not normed: - np.testing.assert_array_almost_equal(laplacian.sum(axis=0), - np.zeros(n_nodes)) - np.testing.assert_array_almost_equal(laplacian.T, laplacian) - np.testing.assert_array_almost_equal( - laplacian, graph_laplacian(sp_mat, normed=normed).toarray()) diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py deleted file mode 100644 index b6b43644e476e..0000000000000 --- a/sklearn/utils/tests/test_stats.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from sklearn.utils.testing import assert_array_equal, ignore_warnings - -from sklearn.utils.stats import rankdata - - -_cases = ( - # values, method, expected - ([100], 'max', [1.0]), - ([100, 100, 100], 'max', [3.0, 3.0, 3.0]), - ([100, 300, 200], 'max', [1.0, 3.0, 2.0]), - ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), - ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]), -) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 840e08524c384..ce69b70cb1cbb 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -22,8 +22,6 @@ from sklearn.utils import gen_even_slices from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan -from sklearn.utils.extmath import pinvh -from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame from sklearn import config_context From cd48d6d43ce317d4c1805ecac89119948105ac4b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:14:49 -0400 Subject: [PATCH 03/11] remove some more deprecation tests --- sklearn/decomposition/tests/test_sparse_pca.py | 7 ------- sklearn/discriminant_analysis.py | 9 ++------- .../tests/test_feature_hasher.py | 14 +++++++------- sklearn/metrics/tests/test_pairwise.py | 4 ---- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index 5365ccb8f0d36..11ef39869d02a 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -78,13 +78,6 @@ def test_fit_transform(norm_comp): spca_lasso.fit(Y) assert_array_almost_equal(spca_lasso.components_, spca_lars.components_) - # Test that deprecated ridge_alpha parameter throws warning - warning_msg = "The ridge_alpha parameter on transform()" - assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform, - Y, ridge_alpha=0.01) - assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform, - Y, ridge_alpha=None) - @pytest.mark.filterwarnings("ignore:normalize_components") @pytest.mark.parametrize("norm_comp", [False, True]) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 
bf6b3a4f44631..ff8b6833cc557 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -567,9 +567,6 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): .. versionadded:: 0.17 - store_covariances : boolean - Deprecated, use `store_covariance`. - Attributes ---------- covariance_ : list of array-like, shape = [n_features, n_features] @@ -602,8 +599,7 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): >>> clf.fit(X, y) ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, - store_covariance=False, - store_covariances=None, tol=0.0001) + store_covariance=False, tol=0.0001) >>> print(clf.predict([[-0.8, -1]])) [1] @@ -653,8 +649,7 @@ def fit(self, X, y): self.priors_ = self.priors cov = None - store_covariance = self.store_covariance or self.store_covariances - + store_covariance = self.store_covariance if store_covariance: cov = [] means = [] diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 77a21ff4364a7..41fc027a2b1b4 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -33,7 +33,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, non_negative=True, input_type="string") + h = FeatureHasher(n_features, input_type="string") X = h.transform(it) assert_equal(X.shape[0], len(raw_X)) @@ -120,11 +120,11 @@ def test_hasher_alternate_sign(): input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) assert Xt.data.min() > 0 Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, @@ -144,11 +144,11 @@ def test_hash_collisions(): # with an opposite sign and cancel out assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, n_features=1, input_type='string').fit_transform(X) assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, n_features=1, input_type='string').fit_transform(X) assert Xt.data[0] == len(X[0]) @@ -159,12 +159,12 @@ def test_hasher_negative(): Xt = FeatureHasher(alternate_sign=False, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e28453ee70086..571e764a2c48a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -82,10 +82,6 @@ def test_pairwise_distances(): 
assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) - # Using size_threshold argument should raise - # a deprecation warning - assert_warns(DeprecationWarning, - manhattan_distances, X, Y, size_threshold=10) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial From d4450f3f761937a2defcc8b9e802e76f4dace5b2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:46:50 -0400 Subject: [PATCH 04/11] many more test fixes --- sklearn/cluster/tests/test_hierarchical.py | 15 ------------- .../decomposition/tests/test_online_lda.py | 21 +------------------ .../tests/test_feature_hasher.py | 10 ++++----- sklearn/kernel_approximation.py | 4 ++-- sklearn/preprocessing/tests/test_encoders.py | 7 +------ .../tests/test_function_transformer.py | 11 ++++------ .../tests/test_label_propagation.py | 13 ------------ sklearn/utils/setup.py | 1 - 8 files changed, 13 insertions(+), 69 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 6f03f9aa32106..2456f61c872c5 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -38,21 +38,6 @@ from sklearn.datasets import make_moons, make_circles -def test_deprecation_of_n_components_in_linkage_tree(): - rng = np.random.RandomState(0) - X = rng.randn(50, 100) - # Test for warning of deprecation of n_components in linkage_tree - children, n_nodes, n_leaves, parent = assert_warns(DeprecationWarning, - linkage_tree, - X.T, - n_components=10) - children_t, n_nodes_t, n_leaves_t, parent_t = linkage_tree(X.T) - assert_array_equal(children, children_t) - assert_equal(n_nodes, n_nodes_t) - assert_equal(n_leaves, n_leaves_t) - assert_equal(parent, parent_t) - - def test_linkage_misc(): # Misc tests on linkage rng = np.random.RandomState(42) diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index f3354cba375c3..655b367e0735a 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -347,19 +347,6 @@ def test_lda_fit_perplexity(): assert_almost_equal(perplexity1, perplexity2) -def test_doc_topic_distr_deprecation(): - # Test that the appropriate warning message is displayed when a user - # attempts to pass the doc_topic_distr argument to the perplexity method - n_components, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, - learning_method='batch', - total_samples=100, random_state=0) - distr1 = lda.fit_transform(X) - distr2 = None - assert_warns(DeprecationWarning, lda.perplexity, X, distr1) - assert_warns(DeprecationWarning, lda.perplexity, X, distr2) - - def test_lda_empty_docs(): """Test LDA on empty document (all-zero rows).""" Z = np.zeros((5, 4)) @@ -414,10 +401,4 @@ def check_verbosity(verbose, evaluate_every, expected_lines, def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, - expected_perplexities) - - -def test_lda_n_topics_deprecation(): - n_components, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=10, learning_method='batch') - assert_warns(DeprecationWarning, lda.fit, X) + expected_perplexities) \ No newline at end of file diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py 
b/sklearn/feature_extraction/tests/test_feature_hasher.py index 41fc027a2b1b4..dff5f090c9b28 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -116,7 +116,7 @@ def test_hasher_zeros(): def test_hasher_alternate_sign(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 @@ -127,7 +127,7 @@ def test_hasher_alternate_sign(): Xt = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, + Xt_2 = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should # have no impact when alternate_sign=False @@ -138,7 +138,7 @@ def test_hasher_alternate_sign(): def test_hash_collisions(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, n_features=1, input_type='string').fit_transform(X) # check that some of the hashed tokens are added # with an opposite sign and cancel out @@ -156,13 +156,13 @@ def test_hash_collisions(): @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()] - Xt = FeatureHasher(alternate_sign=False, non_negative=False, + Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) Xt = FeatureHasher(alternate_sign=True, diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 585f453e389b2..d44c67b417764 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -619,7 +619,7 @@ def _get_kernel_params(self): if (self.gamma is not None or self.coef0 is not None or self.degree is not None): - raise ValueErrror("Don't pass gamma, coef0 or degree to " - "Nystroem if using a callable kernel.") + raise ValueError("Don't pass gamma, coef0 or degree to " + "Nystroem if using a callable kernel.") return params diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 67169432defdc..13dfe08201c1e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -607,9 +607,4 @@ def test_encoder_dtypes_pandas(): def test_one_hot_encoder_warning(): enc = OneHotEncoder() X = [['Male', 1], ['Female', 3]] - np.testing.assert_no_warnings(enc.fit_transform, X) - - -def test_categorical_encoder_stub(): - from sklearn.preprocessing import CategoricalEncoder - assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') + np.testing.assert_no_warnings(enc.fit_transform, X) \ No newline at end of file diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 464581e5e9c2c..b65d6614e2fbd 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py 
@@ -50,20 +50,17 @@ def test_delegate_to_func(): # reset the argument stores. args_store[:] = [] # python2 compatible inplace list clear. kwargs_store.clear() - y = object() - transformed = assert_warns_message( - DeprecationWarning, "pass_y is deprecated", - FunctionTransformer( + FunctionTransformer( _make_func(args_store, kwargs_store), - pass_y=True, validate=False).transform, X, y) + validate=False).transform(X) assert_array_equal(transformed, X, err_msg='transform should have returned X unchanged') - # The function should have received X and y. + # The function should have received X assert_equal( args_store, - [X, y], + [X], 'Incorrect positional arguments passed to func: {args}'.format( args=args_store, ), diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 51b725030cb64..ef594fccb7076 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -66,19 +66,6 @@ def test_predict_proba(): np.array([[0.5, 0.5]])) -def test_alpha_deprecation(): - X, y = make_classification(n_samples=100) - y[::3] = -1 - - lp_default = label_propagation.LabelPropagation(kernel='rbf', gamma=0.1) - lp_default_y = lp_default.fit(X, y).transduction_ - - lp_0 = label_propagation.LabelPropagation(alpha=0, kernel='rbf', gamma=0.1) - lp_0_y = assert_warns(DeprecationWarning, lp_0.fit, X, y).transduction_ - - assert_array_equal(lp_default_y, lp_0_y) - - def test_label_spreading_closed_form(): n_classes = 2 X, y = make_classification(n_classes=n_classes, n_samples=200, diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 9590692b0dff0..13d772a5a53b7 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -9,7 +9,6 @@ def configuration(parent_package='', top_path=None): from numpy.distutils.misc_util import Configuration config = Configuration('utils', parent_package, top_path) - config.add_subpackage('sparsetools') cblas_libs, blas_info = get_blas_info() cblas_compile_args = blas_info.pop('extra_compile_args', []) From 3d1fae5fa26a7ff2a85dc71925a35326c8f3a620 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:51:28 -0400 Subject: [PATCH 05/11] more test fixes --- sklearn/decomposition/online_lda.py | 6 ------ sklearn/preprocessing/tests/test_function_transformer.py | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 51c199e50c2d7..cfeba87ef3ce3 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -795,12 +795,6 @@ def perplexity(self, X, sub_sampling=False): X : array-like or sparse matrix, [n_samples, n_features] Document word matrix. - doc_topic_distr : None or array, shape=(n_samples, n_components) - Document topic distribution. - This argument is deprecated and is currently being ignored. - - .. deprecated:: 0.19 - sub_sampling : bool Do sub-sampling or not. diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index b65d6614e2fbd..663b4e6b4c9f0 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -50,9 +50,9 @@ def test_delegate_to_func(): # reset the argument stores. args_store[:] = [] # python2 compatible inplace list clear. 
kwargs_store.clear() - FunctionTransformer( - _make_func(args_store, kwargs_store), - validate=False).transform(X) + transformed = FunctionTransformer( + _make_func(args_store, kwargs_store), + validate=False).transform(X) assert_array_equal(transformed, X, err_msg='transform should have returned X unchanged') From b2b0136934870e66123cea7e76414b05eda9f954 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:52:31 -0400 Subject: [PATCH 06/11] undo non-negative stuff for now, seems annoying --- sklearn/feature_extraction/hashing.py | 22 +++++++++++++++-- .../tests/test_feature_hasher.py | 24 +++++++++---------- sklearn/feature_extraction/text.py | 13 ++++++++-- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index f670e9cbec89e..744a073090bad 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -57,7 +57,8 @@ class FeatureHasher(BaseEstimator, TransformerMixin): feature_name should be a string, while value should be a number. In the case of "string", a value of 1 is implied. The feature_name is hashed to find the appropriate column for the - feature. The value's sign might be flipped in the output. + feature. The value's sign might be flipped in the output (but see + non_negative, below). dtype : numpy type, optional, default np.float64 The type of feature values. Passed to scipy.sparse matrix constructors as the dtype argument. Do not set this to bool, np.boolean or any @@ -67,6 +68,15 @@ class FeatureHasher(BaseEstimator, TransformerMixin): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. + non_negative : boolean, optional, default False + When True, an absolute value is applied to the features matrix prior to + returning it. When used in conjunction with alternate_sign=True, this + significantly reduces the inner product preservation property. + + .. deprecated:: 0.19 + This option will be removed in 0.21. 
+ + Examples -------- >>> from sklearn.feature_extraction import FeatureHasher @@ -84,12 +94,18 @@ class FeatureHasher(BaseEstimator, TransformerMixin): """ def __init__(self, n_features=(2 ** 20), input_type="dict", - dtype=np.float64, alternate_sign=True): + dtype=np.float64, alternate_sign=True, non_negative=False): self._validate_params(n_features, input_type) + if non_negative: + warnings.warn("the option non_negative=True has been deprecated" + " in 0.19 and will be removed" + " in version 0.21.", DeprecationWarning) + self.dtype = dtype self.input_type = input_type self.n_features = n_features self.alternate_sign = alternate_sign + self.non_negative = non_negative @staticmethod def _validate_params(n_features, input_type): @@ -159,4 +175,6 @@ def transform(self, raw_X): shape=(n_samples, self.n_features)) X.sum_duplicates() # also sorts the indices + if self.non_negative: + np.abs(X.data, X.data) return X diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index dff5f090c9b28..77a21ff4364a7 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -33,7 +33,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, input_type="string") + h = FeatureHasher(n_features, non_negative=True, input_type="string") X = h.transform(it) assert_equal(X.shape[0], len(raw_X)) @@ -116,18 +116,18 @@ def test_hasher_zeros(): def test_hasher_alternate_sign(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt_2 = FeatureHasher(alternate_sign=False, + Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should # have no impact when alternate_sign=False @@ -138,17 +138,17 @@ def test_hasher_alternate_sign(): def test_hash_collisions(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, n_features=1, input_type='string').fit_transform(X) # check that some of the hashed tokens are added # with an opposite sign and cancel out assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, n_features=1, input_type='string').fit_transform(X) assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, n_features=1, input_type='string').fit_transform(X) assert Xt.data[0] == len(X[0]) @@ -156,15 +156,15 @@ def test_hash_collisions(): @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()] - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = 
FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index a4de38d959db1..05f60d2805c7c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -491,6 +491,13 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): .. versionadded:: 0.19 + non_negative : boolean, optional, default False + When True, an absolute value is applied to the features matrix prior to + returning it. When used in conjunction with alternate_sign=True, this + significantly reduces the inner product preservation property. + + .. deprecated:: 0.19 + This option will be removed in 0.21. dtype : type, optional Type of the matrix returned by fit_transform() or transform(). @@ -519,7 +526,7 @@ def __init__(self, input='content', encoding='utf-8', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), binary=False, norm='l2', alternate_sign=True, - dtype=np.float64): + non_negative=False, dtype=np.float64): self.input = input self.encoding = encoding self.decode_error = decode_error @@ -535,6 +542,7 @@ def __init__(self, input='content', encoding='utf-8', self.binary = binary self.norm = norm self.alternate_sign = alternate_sign + self.non_negative = non_negative self.dtype = dtype def partial_fit(self, X, y=None): @@ -622,7 +630,8 @@ def fit_transform(self, X, y=None): def _get_hasher(self): return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, - alternate_sign=self.alternate_sign) + alternate_sign=self.alternate_sign, + non_negative=self.non_negative) def _document_frequency(X): From a750abc6266ab7412aaf3d5969cec22918be3dcb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:52:47 -0400 Subject: [PATCH 07/11] fix kernel_approximation test --- sklearn/tests/test_kernel_approximation.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 8a2208b20af99..71ec07452abeb 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -245,13 +245,3 @@ def logging_histogram_kernel(x, y, log): n_components=(n_samples - 1), kernel_params={'log': kernel_log}).fit(X) assert_equal(len(kernel_log), n_samples * (n_samples - 1) / 2) - - def linear_kernel(X, Y): - return np.dot(X, Y.T) - - # if degree, gamma or coef0 is passed, we raise a warning - msg = "Passing gamma, coef0 or degree to Nystroem" - params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2}) - for param in params: - ny = Nystroem(kernel=linear_kernel, **param) - assert_warns_message(DeprecationWarning, msg, ny.fit, X) From 5069dcf2196ffcbd79683ee3aa825d7e8ef7cc05 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 19:10:22 -0400 Subject: [PATCH 08/11] remove unused imports --- sklearn/covariance/tests/test_graph_lasso.py | 5 +---- sklearn/decomposition/fastica_.py | 1 - sklearn/decomposition/online_lda.py 
| 1 - sklearn/discriminant_analysis.py | 1 - sklearn/gaussian_process/gpr.py | 1 - sklearn/linear_model/least_angle.py | 2 +- sklearn/manifold/t_sne.py | 2 -- sklearn/preprocessing/_function_transformer.py | 1 - sklearn/preprocessing/data.py | 3 +-- sklearn/tests/test_calibration.py | 3 +-- sklearn/tests/test_discriminant_analysis.py | 1 - sklearn/tests/test_kernel_approximation.py | 1 - sklearn/utils/extmath.py | 3 +-- sklearn/utils/graph.py | 2 -- sklearn/utils/random.py | 3 +-- sklearn/utils/stats.py | 2 -- sklearn/utils/tests/test_utils.py | 5 +---- 17 files changed, 7 insertions(+), 30 deletions(-) diff --git a/sklearn/covariance/tests/test_graph_lasso.py b/sklearn/covariance/tests/test_graph_lasso.py index 33c724df781d4..d368356100a4f 100644 --- a/sklearn/covariance/tests/test_graph_lasso.py +++ b/sklearn/covariance/tests/test_graph_lasso.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings from sklearn.covariance import (graph_lasso, GraphLasso, GraphLassoCV, @@ -19,8 +18,6 @@ from sklearn.utils import check_random_state from sklearn import datasets -from numpy.testing import assert_equal - @ignore_warnings(category=DeprecationWarning) def test_graph_lasso(random_state=0): @@ -140,4 +137,4 @@ def test_graph_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file + GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 2eead18b2678d..693d46d31fab5 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -18,7 +18,6 @@ from ..exceptions import ConvergenceWarning from ..externals import six from ..externals.six import moves -from ..externals.six import string_types from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index cfeba87ef3ce3..4c0f8625771c7 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -14,7 +14,6 @@ import numpy as np import scipy.sparse as sp from scipy.special import gammaln -import warnings from ..base import BaseEstimator, TransformerMixin from ..utils import (check_random_state, check_array, diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index ff8b6833cc557..e0084741e583f 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -12,7 +12,6 @@ from __future__ import print_function import warnings import numpy as np -from .utils import deprecated from scipy import linalg from .externals.six import string_types from .externals.six.moves import xrange diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index ebe8dd3b65ade..c5ff9674bc575 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -15,7 +15,6 @@ from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C from sklearn.utils import check_random_state from sklearn.utils.validation import check_X_y, check_array -from sklearn.utils.deprecation import deprecated from sklearn.exceptions import ConvergenceWarning diff --git 
a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 0e923a424c221..bdee91f468737 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -20,7 +20,7 @@ from .base import LinearModel from ..base import RegressorMixin -from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated +from ..utils import arrayfuncs, as_float_array, check_X_y from ..model_selection import check_cv from ..exceptions import ConvergenceWarning from ..utils import Parallel, delayed diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 5ddda56491564..213d75c2a4730 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -26,7 +26,6 @@ from . import _utils from . import _barnes_hut_tsne from ..externals.six import string_types -from ..utils import deprecated MACHINE_EPSILON = np.finfo(np.double).eps @@ -805,7 +804,6 @@ def _fit(self, X, skip_num_points=0): neighbors=neighbors_nn, skip_num_points=skip_num_points) - def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors=None, skip_num_points=0): """Runs t-SNE.""" diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 93afcc646e3fb..66034f6740a8e 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,7 +3,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.testing import assert_allclose_dense_sparse -from ..externals.six import string_types def _identity(X): diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9b3eaa98e4c08..23eecacfda163 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -21,7 +21,6 @@ from ..base import BaseEstimator, TransformerMixin from ..externals import six -from ..externals.six import string_types from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var @@ -2872,4 +2871,4 @@ def power_transform(X, method='box-cox', standardize=True, copy=True): Royal Statistical Society B, 26, 211-252 (1964). 
""" pt = PowerTransformer(method=method, standardize=standardize, copy=copy) - return pt.fit_transform(X) \ No newline at end of file + return pt.fit_transform(X) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index e454633a3a294..8b18e5aafeb93 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -11,8 +11,7 @@ assert_greater, assert_almost_equal, assert_greater_equal, assert_array_equal, - assert_raises, - ignore_warnings) + assert_raises) from sklearn.datasets import make_classification, make_blobs from sklearn.naive_bayes import MultinomialNB from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 4cb8f5d148b04..789b274f8f7bf 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_greater from sklearn.utils.testing import ignore_warnings diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 71ec07452abeb..be0d249f1a4a7 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -5,7 +5,6 @@ from sklearn.utils.testing import assert_not_equal from sklearn.utils.testing import assert_array_almost_equal, assert_raises from sklearn.utils.testing import assert_less_equal -from sklearn.utils.testing import assert_warns_message from sklearn.metrics.pairwise import kernel_metrics from sklearn.kernel_approximation import RBFSampler diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 07a83a17377b5..80bcfd5585c2c 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -17,9 +17,8 @@ import numpy as np from scipy import linalg, sparse -from . import check_random_state, deprecated +from . 
import check_random_state from .fixes import np_version -from .fixes import logsumexp as scipy_logsumexp from ._logistic_sigmoid import _log_logistic_sigmoid from ..externals.six.moves import xrange from .sparsefuncs_fast import csr_row_norms diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 17caa4fa2cb0d..b030af2fed81c 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -11,10 +11,8 @@ # License: BSD 3 clause from scipy import sparse -from scipy.sparse import csgraph from .graph_shortest_path import graph_shortest_path # noqa -from .deprecation import deprecated ############################################################################### diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 29d465fff8705..61be8214dd1f1 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -8,9 +8,8 @@ from sklearn.utils import check_random_state from ._random import sample_without_replacement -from .deprecation import deprecated -__all__ = ['sample_without_replacement', 'choice'] +__all__ = ['sample_without_replacement'] def random_choice_csc(n_samples, classes, class_probability=None, diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 458669e23eb3a..ff770afa55ad6 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -1,8 +1,6 @@ import numpy as np -from scipy.stats import rankdata as scipy_rankdata from sklearn.utils.extmath import stable_cumsum -from sklearn.utils.deprecation import deprecated def _weighted_percentile(array, sample_weight, percentile=50): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index ce69b70cb1cbb..2c3b22a4f38f7 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -4,13 +4,10 @@ import pytest import numpy as np import scipy.sparse as sp -from scipy.linalg import pinv2 -from scipy.sparse.csgraph import laplacian from sklearn.utils.testing import (assert_equal, assert_raises, assert_true, - assert_almost_equal, assert_array_equal, + assert_array_equal, SkipTest, assert_raises_regex, - assert_greater_equal, ignore_warnings, assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state from sklearn.utils import deprecated From cfa9216bfdbd622f8a03ab3a67c56825189caf32 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 19:13:41 -0400 Subject: [PATCH 09/11] remove more unused imports --- sklearn/covariance/tests/test_graphical_lasso.py | 6 +----- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- sklearn/decomposition/tests/test_online_lda.py | 3 +-- sklearn/decomposition/tests/test_pca.py | 2 -- sklearn/linear_model/tests/test_huber.py | 1 - sklearn/metrics/tests/test_classification.py | 1 - sklearn/metrics/tests/test_pairwise.py | 1 - sklearn/metrics/tests/test_score_objects.py | 1 - sklearn/model_selection/tests/test_split.py | 1 - sklearn/neighbors/tests/test_kd_tree.py | 2 +- sklearn/neighbors/tests/test_lof.py | 2 +- sklearn/preprocessing/tests/test_data.py | 1 - sklearn/utils/tests/test_extmath.py | 3 --- 13 files changed, 5 insertions(+), 21 deletions(-) diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index 25e2f191d3ec8..47f15f4a762ac 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -8,7 +8,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing 
import assert_warns_message from sklearn.covariance import (graphical_lasso, GraphicalLasso, GraphicalLassoCV, empirical_covariance) @@ -16,9 +15,6 @@ from sklearn.externals.six.moves import StringIO from sklearn.utils import check_random_state from sklearn import datasets -from sklearn.utils.fixes import PY3_OR_LATER - -from numpy.testing import assert_equal def test_graphical_lasso(random_state=0): @@ -136,4 +132,4 @@ def test_graphical_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file + GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index b0f2c5aeae52a..040f9e49d590b 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, ignore_warnings) + assert_raises) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 655b367e0735a..0abc2efe75ec2 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -19,7 +19,6 @@ from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import if_safe_multiprocessing_with_blas -from sklearn.utils.testing import assert_warns from sklearn.exceptions import NotFittedError from sklearn.externals.six.moves import xrange @@ -401,4 +400,4 @@ def check_verbosity(verbose, evaluate_every, expected_lines, def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, - expected_perplexities) \ No newline at end of file + expected_perplexities) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 7484367127157..c852e4bed0e58 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -13,7 +13,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_less @@ -685,7 +684,6 @@ def test_svd_solver_auto(): assert_array_almost_equal(pca.components_, pca_test.components_) - @pytest.mark.parametrize('svd_solver', solver_list) def test_pca_sparse_input(svd_solver): X = np.random.RandomState(0).rand(5, 4) diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index d7658396b3f22..3bc77ee8c1778 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -4,7 +4,6 @@ import numpy as np from scipy import optimize, sparse -from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 
c07f9d66aa0f9..8e18af7128350 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -14,7 +14,6 @@ from sklearn.datasets import make_multilabel_classification from sklearn.preprocessing import label_binarize from sklearn.utils.validation import check_random_state -from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import assert_raises, clean_warning_registry from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_equal diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 571e764a2c48a..62aaec5fdc9a6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index da04b4215dce0..9033a2b2d86ee 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_warns_message from sklearn.base import BaseEstimator from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 28286bf2402fd..637b4dca5537f 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -49,7 +49,6 @@ from sklearn.linear_model import Ridge from sklearn.model_selection._split import _validate_shuffle_split -from sklearn.model_selection._split import _CVIterableWrapper from sklearn.model_selection._split import _build_repr from sklearn.model_selection._split import CV_WARNING from sklearn.model_selection._split import NSPLIT_WARNING diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 18d2138021605..0b9c612624cff 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -8,7 +8,7 @@ nodeheap_sort, DTYPE, ITYPE) from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.utils import check_random_state -from sklearn.utils.testing import SkipTest, assert_allclose +from sklearn.utils.testing import assert_allclose rng = np.random.RandomState(42) V = rng.random_sample((3, 3)) diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index ed57a1d0fba29..ef833024d5cb8 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -14,7 +14,7 @@ from sklearn.metrics import roc_auc_score from sklearn.utils import check_random_state -from sklearn.utils.testing import assert_greater, ignore_warnings +from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_warns_message diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f4d0b5af9799f..3279387dcce7a 
100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -6,7 +6,6 @@ from __future__ import division import warnings -import re import itertools import numpy as np diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index d22ec5b886c89..7586bbfd1eeb1 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -23,11 +23,9 @@ from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import skip_if_32bit from sklearn.utils.testing import SkipTest -from sklearn.utils.testing import ignore_warnings from sklearn.utils.fixes import np_version from sklearn.utils.extmath import density -from sklearn.utils.extmath import squared_norm from sklearn.utils.extmath import randomized_svd from sklearn.utils.extmath import row_norms from sklearn.utils.extmath import weighted_mode @@ -87,7 +85,6 @@ def test_random_weights(): assert_array_almost_equal(score.ravel(), w[:, :5].sum(1)) - def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 From 45289e88bf4ba83453a9fd20200c0979499c34b9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Oct 2018 12:33:48 -0400 Subject: [PATCH 10/11] fix no newline at end of file in test_encoders.py --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 13dfe08201c1e..d3833ed97c79d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -607,4 +607,4 @@ def test_encoder_dtypes_pandas(): def test_one_hot_encoder_warning(): enc = OneHotEncoder() X = [['Male', 1], ['Female', 3]] - np.testing.assert_no_warnings(enc.fit_transform, X) \ No newline at end of file + np.testing.assert_no_warnings(enc.fit_transform, X) From 3afc42bb8512119be5c99e270421746c72fb73dc Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Oct 2018 13:39:07 -0400 Subject: [PATCH 11/11] fix kernel approximation test --- sklearn/tests/test_kernel_approximation.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index be0d249f1a4a7..c2ba50f3728ff 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -1,5 +1,6 @@ import numpy as np from scipy.sparse import csr_matrix +import pytest from sklearn.utils.testing import assert_array_equal, assert_equal, assert_true from sklearn.utils.testing import assert_not_equal @@ -244,3 +245,14 @@ def logging_histogram_kernel(x, y, log): n_components=(n_samples - 1), kernel_params={'log': kernel_log}).fit(X) assert_equal(len(kernel_log), n_samples * (n_samples - 1) / 2) + + def linear_kernel(X, Y): + return np.dot(X, Y.T) + + # if degree, gamma or coef0 is passed, we raise a warning + msg = "Don't pass gamma, coef0 or degree to Nystroem" + params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2}) + for param in params: + ny = Nystroem(kernel=linear_kernel, **param) + with pytest.raises(ValueError, match=msg): + ny.fit(X)
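
For reference, a minimal usage sketch (illustration only, not part of the patch series) of the behaviour the re-added Nystroem test asserts: with a callable kernel, passing gamma, coef0 or degree is now rejected at fit time with the corrected ValueError. The kernel function and data below are made up for the example.

    import numpy as np
    from sklearn.kernel_approximation import Nystroem

    def linear_kernel(X, Y):
        # example callable kernel; any callable triggers the check
        return np.dot(X, Y.T)

    X = np.random.RandomState(0).rand(10, 4)

    # Combining a callable kernel with gamma (or coef0/degree) raises ValueError.
    try:
        Nystroem(kernel=linear_kernel, gamma=1, n_components=5).fit(X)
    except ValueError as exc:
        print(exc)  # "Don't pass gamma, coef0 or degree to Nystroem if using a callable kernel."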