From 7d9dcc4f03a13eb5116f6c0299e2822a9b0011a9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 9 Apr 2019 14:10:47 -0400 Subject: [PATCH 01/53] Basic validate_X and validate_X_y methods for _n_features_in attribute --- sklearn/base.py | 26 ++++++++++++++++++++++++++ sklearn/preprocessing/data.py | 14 ++++++++------ sklearn/tests/test_base.py | 15 +++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 167baaf2b7ebd..ba380f2a70902 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -293,6 +293,32 @@ def _get_tags(self): tags.update(collected_tags) return tags + def _validate_n_features(self, X, check_n_features): + if check_n_features: + if not hasattr(self, '_n_features_in'): + raise RuntimeError( + "check_n_features is True but there is no _n_features_in " + "attribute." + ) + if X.shape[1] != self._n_features_in: + raise ValueError( + 'X has {} features, but this {} is expecting {} features ' + 'as input.'.format(X.shape[1], self.__class__.__name__, + self._n_features_in) + ) + self._n_features_in = X.shape[1] + + def validate_X(self, X, check_n_features=False, **check_array_params): + from .utils.validation import check_array + X = check_array(X, **check_array_params) + self._validate_n_features(X, check_n_features) + return X + + def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): + from .utils.validation import check_X_y + X, y = check_X_y(X, **check_X_y_params) + self._validate_n_features(X, check_n_features) + return X, y class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bab41f3bdd492..a6ad1ddd930ce 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,9 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, + warn_on_dtype=False, estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -753,9 +753,11 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=False, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self.validate_X(X, check_n_features=True, + accept_sparse='csr', copy=copy, + warn_on_dtype=False, estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 316b01ff33415..b808b2190f238 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -19,6 +19,7 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -509,3 +510,17 @@ def test_regressormixin_score_multioutput(): "built-in scorer 'r2' uses " "multioutput='uniform_average').") assert_warns_message(FutureWarning, msg, reg.score, X, y) + + +def test_validate_X(): + # Make sure ValueError is raised when there is a 
n_features mismatch + # between fit and predict/transform + + X = [[0, 1], [2, 3]] + + ss = StandardScaler().fit(X) + ss.transform(X) # All good + + with pytest.raises(ValueError, match="X has 3 features, but"): + X_more_features = [[0, 1, 4], [2, 3, 5]] + ss.transform(X_more_features) From f117745a01415fefd678cfc48cf2ce9694d036d2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 15:36:24 -0400 Subject: [PATCH 02/53] created NonRectangularInputMixin --- sklearn/base.py | 23 +++++++++++++------ sklearn/feature_extraction/dict_vectorizer.py | 5 ++-- sklearn/feature_extraction/text.py | 4 ++-- sklearn/tests/test_base.py | 22 ++++++++++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index ba380f2a70902..94ba53ffa7789 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -13,6 +13,8 @@ from . import __version__ from sklearn.utils import _IS_32BIT +from .utils.validation import check_X_y +from .utils.validation import check_array _DEFAULT_TAGS = { 'non_deterministic': False, @@ -295,27 +297,25 @@ def _get_tags(self): def _validate_n_features(self, X, check_n_features): if check_n_features: - if not hasattr(self, '_n_features_in'): + if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "check_n_features is True but there is no _n_features_in " + "check_n_features is True but there is no n_features_in_ " "attribute." ) - if X.shape[1] != self._n_features_in: + if X.shape[1] != self.n_features_in_: raise ValueError( 'X has {} features, but this {} is expecting {} features ' 'as input.'.format(X.shape[1], self.__class__.__name__, - self._n_features_in) + self.n_features_in_) ) - self._n_features_in = X.shape[1] + self.n_features_in_ = X.shape[1] def validate_X(self, X, check_n_features=False, **check_array_params): - from .utils.validation import check_array X = check_array(X, **check_array_params) self._validate_n_features(X, check_n_features) return X def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - from .utils.validation import check_X_y X, y = check_X_y(X, **check_X_y_params) self._validate_n_features(X, check_n_features) return X, y @@ -551,6 +551,15 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) +class NonRectangularInputMixin: + """Mixin class for all estimators with non-rectangular input. + + For now only vectorizers are relevant for this mixin. + """ + + n_features_in_ = None + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" _estimator_type = "DensityEstimator" diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 8273834acdb20..f21b3ede1f931 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -9,7 +9,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin from ..utils import check_array, tosequence @@ -21,7 +21,8 @@ def _tosequence(X): return tosequence(X) -class DictVectorizer(BaseEstimator, TransformerMixin): +class DictVectorizer(BaseEstimator, TransformerMixin, + NonRectangularInputMixin): """Transforms lists of feature-value mappings to vectors. 
This transformer turns lists of mappings (dict-like objects) of feature diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 9cdbace6224aa..1e0db090cf456 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -24,7 +24,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS @@ -113,7 +113,7 @@ def _check_stop_list(stop): return frozenset(stop) -class VectorizerMixin: +class VectorizerMixin(NonRectangularInputMixin): """Provides common code for text vectorizers (tokenization logic).""" _white_spaces = re.compile(r"\s\s+") diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index b808b2190f238..32b9aab3da42d 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -20,6 +20,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler +from sklearn.feature_extraction import DictVectorizer from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -524,3 +525,24 @@ def test_validate_X(): with pytest.raises(ValueError, match="X has 3 features, but"): X_more_features = [[0, 1, 4], [2, 3, 5]] ss.transform(X_more_features) + + +def test_n_features_in_attribute(): + # Make sure n_features_in_ is correctly set. + # Note that n_features_in_ is always None for vectorizers, while for other + # estimators the attribute doesn't exist until fit() is called. + X_2 = [[0, 1], [2, 3]] + X_3 = [[0, 1, 4], [2, 3, 5]] + + ss = StandardScaler() + assert not hasattr(ss, 'n_features_in_') + ss.fit(X_2) + assert ss.n_features_in_ == 2 + ss = ss.fit(X_3) + assert ss.n_features_in_ == 3 + + dv = DictVectorizer() + assert dv.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert dv.n_features_in_ is None From e56592b7f3bc3103eb7326e83605e8858f0d5b6f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 15:38:20 -0400 Subject: [PATCH 03/53] resolved conflicts --- sklearn/preprocessing/data.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index fdd04087e40cd..225f7c2794cf8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,15 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ -<<<<<<< HEAD X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') -======= - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') ->>>>>>> upstream/master + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -759,17 +753,10 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy -<<<<<<< HEAD X = self.validate_X(X, check_n_features=True, accept_sparse='csr', copy=copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, + estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') 
-======= - X = check_array(X, accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') ->>>>>>> upstream/master if sparse.issparse(X): if self.with_mean: From 8ecc690d47da03e08ca2b4b1326940cecd4296c7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 15:26:10 -0400 Subject: [PATCH 04/53] _validate** is not private --- sklearn/base.py | 6 +++--- sklearn/preprocessing/data.py | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 9cc73d3c7998a..3aa28b210fdc4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -342,13 +342,13 @@ def _validate_n_features(self, X, check_n_features): ) self.n_features_in_ = X.shape[1] - def validate_X(self, X, check_n_features=False, **check_array_params): + def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) self._validate_n_features(X, check_n_features) return X - def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - X, y = check_X_y(X, **check_X_y_params) + def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): + X, y = check_X_y(X, y, **check_X_y_params) self._validate_n_features(X, check_n_features) return X, y diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 45b7aed4ae605..1b924aa8ecf4d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,9 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ - X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -753,10 +753,10 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy - X = self.validate_X(X, check_n_features=True, - accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_X(X, check_n_features=True, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: From 60e4cea50ae74ccc8e91659eb51fcf2ff5c4dfe1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 17:02:05 -0400 Subject: [PATCH 05/53] Added support for pipeline and grid search --- .../gradient_boosting.py | 2 +- sklearn/model_selection/_search.py | 5 +++ sklearn/pipeline.py | 4 +++ sklearn/tests/test_base.py | 31 +++++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 466181f445ee8..6eb692d743af0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -95,7 +95,7 @@ def fit(self, X, y): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
- X, y = check_X_y(X, y, dtype=[X_DTYPE]) + X, y = self._validate_X_y(X, y, dtype=[X_DTYPE]) y = self._encode_y(y) rng = check_random_state(self.random_state) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0447f4857fa9d..cc1f9fc2a6f1c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -540,6 +540,11 @@ def inverse_transform(self, Xt): self._check_is_fitted('inverse_transform') return self.best_estimator_.inverse_transform(Xt) + @property + def n_features_in_(self): + check_is_fitted(self, 'best_estimator_') + return self.best_estimator_.n_features_in_ + @property def classes_(self): self._check_is_fitted("classes_") diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 55df0de701db4..9eddfd9e51d30 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -609,6 +609,10 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + @property + def n_features_in_(self): + return self.steps[0][1].n_features_in_ + def _name_estimators(estimators): """Generate names for estimators.""" diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 32b9aab3da42d..ae09dfd501bd2 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -21,6 +21,11 @@ from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import DictVectorizer +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.datasets import make_classification +from sklearn.pipeline import make_pipeline +from sklearn.exceptions import NotFittedError from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -546,3 +551,29 @@ def test_n_features_in_attribute(): d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] dv.fit(d) assert dv.n_features_in_ is None + + # meta estimator need specific ways of dealing with the attribute: + # grid search delegates this to # the best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {'max_iter': [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + assert hasattr(ss, 'n_features_in_') # that might be a bit unintuitive + with pytest.raises(NotFittedError): + gs.n_features_in_ + gs.fit(X, y) + assert gs.n_features_in_ == n_features + + # pipelines delegate to the first step + pipe = make_pipeline(gbdt) + assert not hasattr(pipe, 'n_features_in_') + pipe.fit(X, y) + assert pipe.n_features_in_ == n_features + + dv = DictVectorizer() + pipe = make_pipeline(dv) + assert pipe.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert pipe.n_features_in_ is None From ff19f2226d150b127f698c0c2f90e09caae8a8f3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 May 2019 08:53:30 -0400 Subject: [PATCH 06/53] pep8 --- sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index ae09dfd501bd2..036cf69a3d24d 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -21,7 +21,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import DictVectorizer -from sklearn.experimental import enable_hist_gradient_boosting +from 
sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.pipeline import make_pipeline From a44318b3b6b8b0abef81cfbd3c583763dab9c22d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 May 2019 11:35:05 -0400 Subject: [PATCH 07/53] Trigger CI?? From abdc94e3b8e92851ed63c0467cf7b6c4218071e5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 26 Jun 2019 13:23:52 -0400 Subject: [PATCH 08/53] Added to decision tree for gridsearch tests to pass --- sklearn/tree/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 77b0ad6f6592f..367359805fce6 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -125,7 +125,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, random_state = check_random_state(self.random_state) if check_input: - X = check_array(X, dtype=DTYPE, accept_sparse="csc") + X = self._validate_X(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() From 62fc42e2065edfa4e2ae9cc7c4feb754a6e665a8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Aug 2019 17:34:20 -0400 Subject: [PATCH 09/53] Added support for ColumnTransformer and FeatureUnion --- sklearn/compose/_column_transformer.py | 2 + .../compose/tests/test_column_transformer.py | 12 +++++ .../tests/test_dict_vectorizer.py | 10 +++++ sklearn/model_selection/tests/test_search.py | 15 +++++++ sklearn/pipeline.py | 5 +++ sklearn/tests/test_base.py | 45 +++++-------------- sklearn/tests/test_pipeline.py | 45 +++++++++++++++++++ 7 files changed, 99 insertions(+), 35 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c0f537776cb6a..25e473dbc51ac 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -471,6 +471,7 @@ def fit_transform(self, X, y=None): """ X = _check_X(X) + self._validate_n_features(X, check_n_features=False) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -518,6 +519,7 @@ def transform(self, X): """ check_is_fitted(self, 'transformers_') X = _check_X(X) + self._validate_n_features(X, check_n_features=True) if self._n_features > X.shape[1]: raise ValueError('Number of features of the input must be equal ' diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae7ef31d6c7f1..9d133fbf0a4d7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1108,3 +1108,15 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): err_msg = 'Specifying the columns' with pytest.raises(ValueError, match=err_msg): tf.transform(X_array) + + +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. 
+ + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([('a', DoubleTrans(), [0]), + ('b', DoubleTrans(), [1])]) + assert not hasattr(ct, 'n_features_in_') + ct.fit(X) + assert ct.n_features_in_ == 2 diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 7e7481a369646..32a14fe82be5b 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -110,3 +110,13 @@ def test_deterministic_vocabulary(): v_2 = DictVectorizer().fit([d_shuffled]) assert v_1.vocabulary_ == v_2.vocabulary_ + + +def test_n_features_in(): + # For vectorizers, n_features_in_ does not make sense and it is always + # None + dv = DictVectorizer() + assert dv.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert dv.n_features_in_ is None diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 90a837e7f49f1..42841dcb248a8 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -63,6 +63,8 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter @@ -1762,3 +1764,16 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_n_features_in(): + # make sure grid search delegates n_features_in to the best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {'max_iter': [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + with pytest.raises(NotFittedError): + gs.n_features_in_ + gs.fit(X, y) + assert gs.n_features_in_ == n_features diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 344a0d11210ad..7649c9225a390 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -985,6 +985,11 @@ def _update_transformer_list(self, transformers): else next(transformers)) for name, old in self.transformer_list] + @property + def n_features_in_(self): + # X is passed to all transformers so we just delegate to the first one + return self.transformer_list[0][1].n_features_in_ + def make_union(*transformers, **kwargs): """Construct a FeatureUnion from the given transformers. diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 6d2bf83b8b6e1..198570c588cc5 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -514,7 +514,7 @@ def test_regressormixin_score_multioutput(): assert_warns_message(FutureWarning, msg, reg.score, X, y) -def test_validate_X(): +def test_validate_X_n_feature_mismatch(): # Make sure ValueError is raised when there is a n_features mismatch # between fit and predict/transform @@ -528,10 +528,17 @@ def test_validate_X(): ss.transform(X_more_features) +def test_validate_X_bad_kwargs(): + + est = BaseEstimator() + with pytest.raises(TypeError, + match="check_array\(\) got an unexpected keyword"): + est._validate_X([1], bad_param=4) + + def test_n_features_in_attribute(): # Make sure n_features_in_ is correctly set. 
- # Note that n_features_in_ is always None for vectorizers, while for other - # estimators the attribute doesn't exist until fit() is called. + # TODO: eventually move this in estimator_checks X_2 = [[0, 1], [2, 3]] X_3 = [[0, 1, 4], [2, 3, 5]] @@ -542,38 +549,6 @@ def test_n_features_in_attribute(): ss = ss.fit(X_3) assert ss.n_features_in_ == 3 - dv = DictVectorizer() - assert dv.n_features_in_ is None - d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] - dv.fit(d) - assert dv.n_features_in_ is None - - # meta estimator need specific ways of dealing with the attribute: - # grid search delegates this to # the best estimator - n_features = 4 - X, y = make_classification(n_features=n_features) - gbdt = HistGradientBoostingClassifier() - param_grid = {'max_iter': [3, 4]} - gs = GridSearchCV(gbdt, param_grid) - assert hasattr(ss, 'n_features_in_') # that might be a bit unintuitive - with pytest.raises(NotFittedError): - gs.n_features_in_ - gs.fit(X, y) - assert gs.n_features_in_ == n_features - - # pipelines delegate to the first step - pipe = make_pipeline(gbdt) - assert not hasattr(pipe, 'n_features_in_') - pipe.fit(X, y) - assert pipe.n_features_in_ == n_features - - dv = DictVectorizer() - pipe = make_pipeline(dv) - assert pipe.n_features_in_ is None - d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] - dv.fit(d) - assert pipe.n_features_in_ is None - def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index e02b5ef96b7b0..bfdac25f50e32 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -34,6 +34,8 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier JUNK_FOOD_DOCS = ( @@ -1161,3 +1163,46 @@ def test_verbose(est, method, pattern, capsys): est.set_params(verbose=True) func(X, y) assert re.match(pattern, capsys.readouterr().out) + + +def test_n_features_in_pipeline(): + # make sure pipelines delegate n_features_in to the first step + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + assert not hasattr(pipe, 'n_features_in_') + pipe.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the pipeline also + # has it, even though it isn't fitted. + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + ss.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + assert not hasattr(gbdt, 'n_features_in_') + + +def test_n_features_in_feature_union(): + # make sure FeatureUnion delegates n_features_in to the first transformer + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + fu = make_union(ss) + assert not hasattr(fu, 'n_features_in_') + fu.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the feature_union + # also has it, even though it isn't fitted. 
+ ss = StandardScaler() + fu = make_union(ss) + ss.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 From 6845788aa8c665ea444517d8856177cb404b6c9d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Aug 2019 17:39:35 -0400 Subject: [PATCH 10/53] pep8 --- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/tests/test_base.py | 8 +------- sklearn/tests/test_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 42841dcb248a8..17a7493c8675a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -63,7 +63,7 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 198570c588cc5..4096680362d2b 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -17,12 +17,6 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler -from sklearn.feature_extraction import DictVectorizer -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.pipeline import make_pipeline -from sklearn.exceptions import NotFittedError from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -532,7 +526,7 @@ def test_validate_X_bad_kwargs(): est = BaseEstimator() with pytest.raises(TypeError, - match="check_array\(\) got an unexpected keyword"): + match="got an unexpected keyword"): est._validate_X([1], bad_param=4) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index bfdac25f50e32..4fffcc0a4dc70 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -34,7 +34,7 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier From ee2598bf143da1696f9cc380480a9d496f6db4b2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 12 Aug 2019 09:01:30 -0400 Subject: [PATCH 11/53] BaseSearchCV now raises AttributeError --- sklearn/model_selection/_search.py | 11 ++++++++++- sklearn/model_selection/tests/test_search.py | 3 +-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 590d6e5d3558a..1f12bf4096708 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -565,7 +565,16 @@ def inverse_transform(self, Xt): @property def n_features_in_(self): - check_is_fitted(self, 'best_estimator_') + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. 
+ try: + check_is_fitted(self, 'best_estimator_') + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__ ) + ) from nfe + return self.best_estimator_.n_features_in_ @property diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 7a74438f2a5bc..d6a31ca96bbe4 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1786,7 +1786,6 @@ def test_n_features_in(): gbdt = HistGradientBoostingClassifier() param_grid = {'max_iter': [3, 4]} gs = GridSearchCV(gbdt, param_grid) - with pytest.raises(NotFittedError): - gs.n_features_in_ + assert not hasattr(gs, 'n_features_in_') gs.fit(X, y) assert gs.n_features_in_ == n_features From 25fda0ff2b2633b4455c111d5791099731f5bcb7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 2 Sep 2019 18:12:22 -0400 Subject: [PATCH 12/53] Added common test + used _validate_XXX on most estimators --- sklearn/base.py | 3 +- sklearn/calibration.py | 4 +- sklearn/cluster/affinity_propagation_.py | 2 +- sklearn/cluster/birch.py | 2 +- sklearn/cluster/dbscan_.py | 2 +- sklearn/cluster/hierarchical.py | 2 +- sklearn/cluster/mean_shift_.py | 2 +- sklearn/cluster/optics_.py | 2 +- sklearn/cluster/spectral.py | 4 +- sklearn/compose/_target.py | 15 ++++++ sklearn/covariance/empirical_covariance_.py | 2 +- sklearn/covariance/graph_lasso_.py | 6 +-- sklearn/covariance/robust_covariance.py | 2 +- sklearn/covariance/shrunk_covariance_.py | 6 +-- sklearn/cross_decomposition/pls_.py | 8 +-- sklearn/decomposition/dict_learning.py | 4 +- sklearn/decomposition/factor_analysis.py | 2 +- sklearn/decomposition/incremental_pca.py | 4 +- sklearn/decomposition/kernel_pca.py | 2 +- sklearn/decomposition/nmf.py | 2 +- sklearn/decomposition/pca.py | 4 +- sklearn/decomposition/sparse_pca.py | 4 +- sklearn/decomposition/truncated_svd.py | 4 +- sklearn/discriminant_analysis.py | 6 +-- sklearn/ensemble/bagging.py | 6 +-- sklearn/ensemble/forest.py | 2 +- sklearn/ensemble/gradient_boosting.py | 3 +- sklearn/feature_selection/from_model.py | 15 ++++++ sklearn/feature_selection/rfe.py | 6 ++- .../feature_selection/univariate_selection.py | 3 +- .../feature_selection/variance_threshold.py | 2 +- sklearn/gaussian_process/gpc.py | 2 +- sklearn/gaussian_process/gpr.py | 2 +- sklearn/impute/_iterative.py | 4 +- sklearn/kernel_approximation.py | 8 +-- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/base.py | 4 +- sklearn/linear_model/bayes.py | 6 +-- sklearn/linear_model/coordinate_descent.py | 19 ++++--- sklearn/linear_model/huber.py | 2 +- sklearn/linear_model/least_angle.py | 6 +-- sklearn/linear_model/logistic.py | 11 ++-- sklearn/linear_model/omp.py | 6 +-- sklearn/linear_model/ransac.py | 2 +- sklearn/linear_model/ridge.py | 21 ++++---- sklearn/linear_model/stochastic_gradient.py | 10 ++-- sklearn/linear_model/theil_sen.py | 2 +- sklearn/manifold/isomap.py | 2 +- sklearn/manifold/locally_linear.py | 2 +- sklearn/manifold/mds.py | 2 +- sklearn/manifold/spectral_embedding_.py | 2 +- sklearn/manifold/t_sne.py | 8 +-- sklearn/mixture/base.py | 1 + sklearn/model_selection/_search.py | 2 +- sklearn/multiclass.py | 18 ++++++- sklearn/multioutput.py | 6 +-- sklearn/naive_bayes.py | 4 +- sklearn/neighbors/base.py | 2 +- sklearn/neighbors/kde.py | 2 +- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/nearest_centroid.py | 4 +- .../neural_network/multilayer_perceptron.py | 8 +-- sklearn/neural_network/rbm.py | 2 +- 
sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/data.py | 10 ++-- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/label_propagation.py | 2 +- sklearn/svm/base.py | 6 +-- sklearn/svm/classes.py | 12 ++--- sklearn/utils/estimator_checks.py | 54 +++++++++++++++++++ 70 files changed, 252 insertions(+), 143 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index e39e3f2ec94df..dbe96a6365450 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -350,7 +350,8 @@ def _validate_n_features(self, X, check_n_features): 'as input.'.format(X.shape[1], self.__class__.__name__, self.n_features_in_) ) - self.n_features_in_ = X.shape[1] + else: + self.n_features_in_ = X.shape[1] def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b88a8b8eb37ef..73d56d40af9fa 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -130,8 +130,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. """ - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 89c6ce9fe8b34..0ae5496d0ff3f 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -372,7 +372,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = check_array(X, accept_sparse=accept_sparse) + X = self._validate_X(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 11bb0f17a1dc6..87bb6e1695be8 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -445,7 +445,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = check_array(X, accept_sparse='csr', copy=self.copy) + X = self._validate_X(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 9f4a55d3ad5b3..3927a532d17bd 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -352,7 +352,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') clust = dbscan(X, sample_weight=sample_weight, **self.get_params()) self.core_sample_indices_, self.labels_ = clust diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index edf4dae76cd49..3c183c24f9a95 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -790,7 +790,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index e588ccd6df1c8..0fc6e8cdc9292 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -414,7 +414,7 @@ def fit(self, X, y=None): y : Ignored """ - X 
= check_array(X) + X = self._validate_X(X) self.cluster_centers_, self.labels_ = \ mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds, min_bin_freq=self.min_bin_freq, diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index ecf5fa6a2bcc0..a4b97323a86c5 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -233,7 +233,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, dtype=np.float) + X = self._validate_X(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 0398ec0df006f..262275e6ab1ba 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -466,8 +466,8 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) if X.shape[0] == X.shape[1] and self.affinity != "precomputed": warnings.warn("The spectral clustering API has changed. ``fit``" "now constructs an affinity matrix from data. To use" diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 35b7ed6af962a..4f0cc12a272f1 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, safe_indexing from ..preprocessing import FunctionTransformer +from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -236,3 +237,17 @@ def predict(self, X): def _more_tags(self): return {'poor_score': True, 'no_validation': True} + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.regressor_.n_features_in_ diff --git a/sklearn/covariance/empirical_covariance_.py b/sklearn/covariance/empirical_covariance_.py index 924f7edd7ffee..aa78d788142e2 100644 --- a/sklearn/covariance/empirical_covariance_.py +++ b/sklearn/covariance/empirical_covariance_.py @@ -191,7 +191,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index e78950bd60421..874b5d5576c50 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -378,8 +378,8 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_X(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -645,7 +645,7 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, estimator=self) + X = self._validate_X(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py index 173794e5340c2..6057c4c6058d0 100644 --- a/sklearn/covariance/robust_covariance.py +++ b/sklearn/covariance/robust_covariance.py @@ -636,7 +636,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_X(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/shrunk_covariance_.py b/sklearn/covariance/shrunk_covariance_.py index 6a0c80d2e4ff6..26b8ce237cbb5 100644 --- a/sklearn/covariance/shrunk_covariance_.py +++ b/sklearn/covariance/shrunk_covariance_.py @@ -143,7 +143,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -419,7 +419,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = check_array(X) + X = self._validate_X(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -572,7 +572,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index af45d4fa53a09..c2714db3af1ba 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -252,8 +252,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_X(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if 
Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -828,8 +828,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_X(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 1a5a42d526917..6ed224c344b3d 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1217,7 +1217,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1423,7 +1423,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index ba624140ce1fc..c42c0ef617d60 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -167,7 +167,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, copy=self.copy, dtype=np.float64) + X = self._validate_X(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index c6d611dcd5fea..815a912f92f5d 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -192,8 +192,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = check_array(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 59785fed3ac0e..ea907210fd5d3 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -271,7 +271,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = check_array(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_X(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 0cc8713679136..3b28f6c638961 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1266,7 +1266,7 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data. 
""" - X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=float) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 1bf3d6e6b19e6..001bf5b0c3953 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -385,8 +385,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. See ' 'TruncatedSVD for a possible alternative.') - X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_X(X, dtype=[np.float64, np.float32], ensure_2d=True, + copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 3ca14cb528bb8..3e9afd4df598d 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -166,7 +166,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -364,7 +364,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index ce79fba2fad1d..3211fc39f0eec 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -155,8 +155,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_X(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index efe39b8c3fb9a..bf33d1803493e 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -424,8 +424,8 @@ def fit(self, X, y): Target values. 
""" # FIXME: Future warning to be removed in 0.23 - X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_X_y(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -656,7 +656,7 @@ def fit(self, X, y): y : array, shape = [n_samples] Target values (integers) """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py index 37dc5a97b4e67..c2fe356452a06 100644 --- a/sklearn/ensemble/bagging.py +++ b/sklearn/ensemble/bagging.py @@ -277,9 +277,9 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = check_X_y( - X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, - multi_output=True + X, y = self._validate_X_y( + X, y, accept_sparse=['csr', 'csc'], dtype=None, + force_all_finite=False, multi_output=True ) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index df24411c4a974..25c75064d279e 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -240,7 +240,7 @@ def fit(self, X, y, sample_weight=None): self : object """ # Validate or convert input data - X = check_array(X, accept_sparse="csc", dtype=DTYPE) + X = self._validate_X(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index ec5f9a111ccf1..cdb57fc70dfd3 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1436,7 +1436,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index fb26f9d685688..e258a56e8a771 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -6,6 +6,7 @@ from .base import SelectorMixin from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..utils.validation import check_is_fitted from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method @@ -227,3 +228,17 @@ def partial_fit(self, X, y=None, **fit_params): self.estimator_ = clone(self.estimator) self.estimator_.partial_fit(X, y, **fit_params) return self + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.estimator_.n_features_in_ diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 4e957e8463a7c..9562148227c05 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -150,7 +150,8 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X, y = check_X_y(X, y, "csc", ensure_min_features=2) + X, y = self._validate_X_y(X, y, accept_sparse="csc", + ensure_min_features=2) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -479,7 +480,8 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" `cv` instance (e.g., `GroupKFold`). """ - X, y = check_X_y(X, y, "csr", ensure_min_features=2) + X, y = self._validate_X_y(X, y, accept_sparse="csr", + ensure_min_features=2) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 5b1cae1823e9c..970bda28b7e46 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -338,7 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index c9eb973dc86c3..77d2bf8ee14b0 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -61,7 +61,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ('csr', 'csc'), dtype=np.float64) + X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=np.float64) if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py index 5421f7e408472..1adf6af0148b3 100644 --- a/sklearn/gaussian_process/gpc.py +++ b/sklearn/gaussian_process/gpc.py @@ -612,7 +612,7 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = check_X_y(X, y, multi_output=False) + X, y = self._validate_X_y(X, y, multi_output=False) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index cc9806cd1c41e..673a558b2d566 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -182,7 +182,7 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 05e2f1484fccf..bb74580517e59 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -472,8 +472,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_X(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 7a2b404304daf..bb04fa85998ca 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -91,7 +91,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -197,7 +197,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X) + X = self._validate_X(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -324,7 +324,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') if self.sample_interval is None: # See reference, figure 2 c) if self.sample_steps == 1: @@ -540,7 +540,7 @@ def fit(self, X, y=None): X : array-like, shape=(n_samples, n_feature) Training data. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 3d69066e342d6..ba1b9867956fa 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -148,8 +148,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. 
""" # Convert data - X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, - y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index d2af98d07ac09..8def021d68974 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -464,8 +464,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py index 7b19ed3ce607f..72d956b555945 100644 --- a/sklearn/linear_model/bayes.py +++ b/sklearn/linear_model/bayes.py @@ -189,7 +189,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True) X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) @@ -516,8 +516,8 @@ def fit(self, X, y): ------- self : returns an instance of self. """ - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 35bfcb692ca2f..24366a5ee0463 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -696,9 +696,11 @@ def fit(self, X, y, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = check_X_y(X, y, accept_sparse='csc', - order='F', dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1120,7 +1122,7 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = check_array(X, 'csc', copy=False) + X = self._validate_X(X, accept_sparse='csc', copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1131,8 +1133,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) + X = self._validate_X(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1752,8 +1755,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that 
format. """ - X = check_array(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_X(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index e518feae29b78..7ac5b3d0e19c4 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y( + X, y = self._validate_X_y( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 2df43cca9365f..54abd87b27b7d 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -954,7 +954,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_X_y(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1374,7 +1374,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1752,7 +1752,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 5ba3eb99fa25e..50a2a6c91a535 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1511,8 +1511,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) n_samples, n_features = X.shape @@ -1981,9 +1982,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index df6e44f5708e0..ca45c72879380 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = check_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_X_y(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index e868a31d17c8d..a3dc3fb24a983 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -251,7 +251,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index b1c24a5860227..425293c9ee8ca 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -541,10 +541,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -921,7 +921,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True) + self._validate_X_y(X, y, accept_sparse=_accept_sparse, + multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -1418,9 +1419,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], - dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if np.any(self.alphas <= 0): raise ValueError( @@ -1829,8 +1830,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index c56792de96172..809aae841ba5b 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -509,8 +509,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1079,8 +1080,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = 
check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py index 941c51196cc4a..ab4e840376e75 100644 --- a/sklearn/linear_model/theil_sen.py +++ b/sklearn/linear_model/theil_sen.py @@ -358,7 +358,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 88c979c0e1fdb..d6f0f6a1bf2a9 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -114,7 +114,7 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.n_jobs = n_jobs def _fit_transform(self, X): - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 4e90d4876f4df..0186fcac53a39 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -656,7 +656,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = check_array(X, dtype=float) + X = self._validate_X(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/mds.py b/sklearn/manifold/mds.py index 5238c67e93dfd..0ddf8dda7f31c 100644 --- a/sklearn/manifold/mds.py +++ b/sklearn/manifold/mds.py @@ -414,7 +414,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = check_array(X) + X = self._validate_X(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 9142237fd5042..e6a646d13ffd0 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -522,7 +522,7 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, ensure_min_samples=2, estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 987f3af05a941..e460438d9641a 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -682,11 +682,11 @@ def _fit(self, X, skip_num_points=0): 'memory. Otherwise consider dimensionality ' 'reduction techniques (e.g. 
TruncatedSVD)') if self.method == 'barnes_hut': - X = check_array(X, ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_X(X, ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self.validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index 26410fc5256af..ad79972f21263 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -216,6 +216,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) + self._validate_n_features(X, check_n_features=False) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index a10f10d077b4f..f65c8292031ce 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -572,7 +572,7 @@ def n_features_in_(self): except NotFittedError as nfe: raise AttributeError( "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__ ) + .format(self.__class__.__name__) ) from nfe return self.best_estimator_.n_features_in_ diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 217f6ce87cba6..fdb768e25502f 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -52,6 +52,7 @@ check_classification_targets, _ovr_decision_function) from .utils.metaestimators import _safe_split, if_delegate_has_method +from .exceptions import NotFittedError from joblib import Parallel, delayed @@ -409,6 +410,19 @@ def _pairwise(self): def _first_estimator(self): return self.estimators_[0] + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the OVR estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + return self.estimators_[0].n_features_in_ + def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -497,7 +511,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -724,7 +738,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 3b5a95349868e..b54b951b451ea 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -148,9 +148,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The base estimator should implement" " a fit method") - X, y = check_X_y(X, y, - multi_output=True, - accept_sparse=True) + X, y = self._validate_X_y(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -431,7 +429,7 @@ def fit(self, X, Y): ------- self : object """ - X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_X_y(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 904a5afecc67e..509fe78d150ea 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -192,7 +192,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -591,7 +591,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, 'csr') + X, y = self._validate_X_y(X, y, accept_sparse='csr') _, n_features = X.shape labelbin = LabelBinarizer() diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 4f7ef38a4ae14..ab6aa296aec74 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -206,7 +206,7 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/kde.py b/sklearn/neighbors/kde.py index be5002e579423..baac14518954a 100644 --- a/sklearn/neighbors/kde.py +++ b/sklearn/neighbors/kde.py @@ -125,7 +125,7 @@ def fit(self, X, y=None, sample_weight=None): List of sample weights attached to the data X. """ algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = check_array(X, order='C', dtype=DTYPE) + X = self._validate_X(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = check_array(sample_weight, order='C', dtype=DTYPE, diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 68a72c92da865..67c94185030a7 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -297,7 +297,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. 
- X, y = check_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_X_y(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py index 3e1577469c920..a765141cd5c6f 100644 --- a/sklearn/neighbors/nearest_centroid.py +++ b/sklearn/neighbors/nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = check_X_y(X, y, ['csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csc']) else: - X, y = check_X_y(X, y, ['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index 11e682a448240..5293937c8c0a1 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -928,8 +928,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1336,8 +1336,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index 3018e31f7d04d..43e7eba48db61 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -336,7 +336,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 1be7499f783ec..4bfa0631be77a 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -133,7 +133,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, dtype='numeric') + X = self._validate_X(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ed352da361007..bf7dcfd2e17a7 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1159,8 +1159,8 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = check_array(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_X(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1468,7 +1468,7 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = check_array(X, accept_sparse=True).shape + n_samples, n_features = self._validate_X(X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1774,7 +1774,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1908,7 +1908,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') return self def transform(self, X, copy=None): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 4f8c8af1283b2..7e087bcbd2b84 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -341,7 +341,7 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = self._validate_X(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 704a075d95932..c4cc523336433 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -220,7 +220,7 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index b2723cc7e0c2b..43afcbc1602f6 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -142,9 +142,9 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = check_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) sample_weight = np.asarray([] diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 39c7d2f334de2..7967bf34207e1 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -229,9 +229,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -418,9 +418,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) penalty = 'l2' # SVR only accepts l2 penalty self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( X, y, self.C, self.fit_intercept, self.intercept_scaling, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 458e0818def4a..9d00cb826f58d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -271,6 +271,7 @@ def _yield_all_checks(name, estimator): yield check_dict_unchanged yield check_dont_overwrite_parameters yield check_fit_idempotent + yield check_n_features_in if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2648,3 +2649,56 @@ def check_fit_idempotent(name, estimator_orig): atol=max(tol, 1e-9), rtol=max(tol, 1e-7), err_msg="Idempotency check failed for method {}".format(method) ) + + +def check_n_features_in(name, estimator_orig): + # Make sure that n_features_in_ attribute doesn't exist until fit is + # called. + + if 'Dummy' in name: + # Dummy estimators don't validate X at all + return + if any(x in name for x in ('FastICA', 'KMeans')): + # fit calls public function helper and validates there. No way to + # access `self` from the helper. + return + if 'FunctionTransformer' in name: + # Validation is optional and False by default + return + if 'KernelCenterer' in name: + # Takes kernel K with shape (n_samples, n_samples) as input, not X + return + if any(x in name for x in ('LatentDirichlet', 'MissingIndicator', + 'PowerTransformer', 'QuantileTransformer', + 'SimpleImputer', 'AdaBoost')): + # fit calls private validation method, which is also called for + # predict, transform, etc + return + if any(x in name for x in ('MaxAbsScaler', 'MinMaxScaler')): + # Fit directly calls partial_fit. Don't know what to do with + # partial_fit. + return + if name in 'RidgeCV': + # Uses aggregation from an estimator that is not an attribute. There is + # no way to delegate to this estimator. 
+ return + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + if 'warm_start' in estimator.get_params().keys(): + estimator.set_params(warm_start=False) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = pairwise_estimator_convert_X(X, estimator) + if is_regressor(estimator_orig): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = enforce_estimator_tags_y(estimator, y) + + assert not hasattr(estimator, 'n_features_in_') + estimator.fit(X, y) + assert hasattr(estimator, 'n_features_in_') From 9bdfb65c0506838fcb8f356e71cf82a3d8c2ad48 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 2 Sep 2019 18:46:35 -0400 Subject: [PATCH 13/53] Fixed some test --- sklearn/ensemble/voting.py | 15 +++++++++++++++ sklearn/manifold/t_sne.py | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 69381a39d9ce3..48d4fe413c5d2 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -29,6 +29,7 @@ from ..utils.metaestimators import _BaseComposition from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..exceptions import NotFittedError def _parallel_fit_estimator(estimator, X, y, sample_weight=None): @@ -144,6 +145,20 @@ def get_params(self, deep=True): """ return self._get_params('estimators', deep=deep) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimators_[0].n_features_in_ + class VotingClassifier(_BaseVoting, ClassifierMixin): """Soft Voting/Majority Rule classifier for unfitted estimators. 
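# A minimal sketch, not part of the patch itself, of the delegation pattern the
# hunks above add to meta-estimators (search, OvR/OvO, voting): n_features_in_
# is exposed as a property that turns NotFittedError into AttributeError so
# hasattr(est, 'n_features_in_') stays False until fit has been called.
# ToyMeta and its 'base' parameter are hypothetical names used only here.
from sklearn.base import BaseEstimator, clone
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted


class ToyMeta(BaseEstimator):
    def __init__(self, base):
        self.base = base

    def fit(self, X, y):
        # The wrapped estimator does the real fitting (and input validation).
        self.base_ = clone(self.base).fit(X, y)
        return self

    @property
    def n_features_in_(self):
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute."
                .format(self.__class__.__name__)
            ) from nfe
        return self.base_.n_features_in_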
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index e460438d9641a..70732b8d6ac16 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -685,8 +685,8 @@ def _fit(self, X, skip_num_points=0): X = self._validate_X(X, ensure_min_samples=2, dtype=[np.float32, np.float64]) else: - X = self.validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " From be76ef49c4cd8fc72b05111a8b9d4f769b6f0733 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 4 Sep 2019 18:04:48 -0400 Subject: [PATCH 14/53] fixed issues for some estimators --- sklearn/cluster/hierarchical.py | 11 +++- sklearn/decomposition/online_lda.py | 26 +++++--- sklearn/ensemble/weight_boosting.py | 29 +++------ sklearn/impute/_base.py | 25 ++++---- sklearn/linear_model/ridge.py | 1 + .../preprocessing/_function_transformer.py | 4 +- sklearn/preprocessing/data.py | 59 +++++++++++-------- sklearn/utils/estimator_checks.py | 28 +-------- 8 files changed, 88 insertions(+), 95 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 3c183c24f9a95..0da4a6b2fc0fa 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -1034,9 +1034,14 @@ def fit(self, X, y=None, **params): ------- self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) - return AgglomerativeClustering.fit(self, X.T, **params) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) + n_features_in_ = self.n_features_in_ + AgglomerativeClustering.fit(self, X.T, **params) + # Need to restore n_features_in_ attribute that was overridden in + # AgglomerativeClustering since we passed it X.T. + self.n_features_in_ = n_features_in_ + return self @property def fit_predict(self): diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 694893b6b2dc4..36d4d8a4f785b 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -469,7 +469,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, whom): + def _check_non_neg_array(self, X, check_n_features, whom): """check X format check X format and make sure no negative value in X. 
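# A rough sketch of what the check_n_features switch threaded through the
# validation helpers above amounts to; ToyEstimator and the exact error text
# are hypothetical, only the flag's semantics are taken from the hunks:
# fit-time validation records the input width, transform-time validation
# compares against it.
import numpy as np


class ToyEstimator:
    def _validate_n_features(self, X, check_n_features):
        if check_n_features:
            if X.shape[1] != self.n_features_in_:
                raise ValueError(
                    'X has a different number of features than during fit: '
                    'got {}, expected {}.'.format(X.shape[1],
                                                  self.n_features_in_))
        else:
            self.n_features_in_ = X.shape[1]

    def fit(self, X, y=None):
        X = np.asarray(X)
        self._validate_n_features(X, check_n_features=False)
        return self

    def transform(self, X):
        X = np.asarray(X)
        self._validate_n_features(X, check_n_features=True)
        return X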
@@ -479,7 +479,8 @@ def _check_non_neg_array(self, X, whom): X : array-like or sparse matrix """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -498,13 +499,15 @@ def partial_fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, + first_time = not hasattr(self, 'components_') + check_n_features = not first_time + X = self._check_non_neg_array(X, check_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size # initialize parameters or check - if not hasattr(self, 'components_'): + if first_time: self._init_latent_vars(n_features) if n_features != self.components_.shape[1]: @@ -542,7 +545,8 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit") + X = self._check_non_neg_array(X, check_n_features=False, + whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter evaluate_every = self.evaluate_every @@ -611,7 +615,9 @@ def _unnormalized_transform(self, X): check_is_fitted(self) # make sure feature size is the same in fitted model and in X - X = self._check_non_neg_array(X, "LatentDirichletAllocation.transform") + X = self._check_non_neg_array( + X, check_n_features=True, + whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: raise ValueError( @@ -735,7 +741,8 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, "LatentDirichletAllocation.score") + X = self._check_non_neg_array(X, check_n_features=True, + whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) @@ -764,8 +771,9 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, """ check_is_fitted(self) - X = self._check_non_neg_array(X, - "LatentDirichletAllocation.perplexity") + X = self._check_non_neg_array( + X, check_n_features=True, + whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index b0a634ce1be6f..f437f5a8dafd1 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -70,25 +70,9 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X, y=None): - - # Accept or convert to these sparse matrix formats so we can - # use safe_indexing - accept_sparse = ['csr', 'csc'] - if y is None: - ret = check_array(X, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None) - else: - ret = check_X_y(X, y, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) - return ret + def _validate_data(self, X): + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=False, + allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
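# A condensed sketch of the partial_fit convention used above (toy class,
# hypothetical names): the presence of an already-fitted attribute decides
# whether the call defines n_features_in_ or has to match it.
import numpy as np


class ToyIncremental:
    def partial_fit(self, X):
        X = np.asarray(X)
        first_call = not hasattr(self, 'components_')
        if first_call:
            # First call: remember the input width and initialise state.
            self.n_features_in_ = X.shape[1]
            self.components_ = np.zeros(X.shape[1])
        elif X.shape[1] != self.n_features_in_:
            raise ValueError(
                'partial_fit was first called with {} features, got {}.'
                .format(self.n_features_in_, X.shape[1]))
        # ...incremental statistics would be updated here...
        return self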
@@ -115,7 +99,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_data(X, y) + X, y = self._validate_X_y(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=False, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) if sample_weight is None: # Initialize weights to 1 / n_samples diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index f7be7fd49cb64..9b7b239d6b7d3 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -177,7 +177,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.copy = copy self.add_indicator = add_indicator - def _validate_input(self, X): + def _validate_input(self, X, in_fit): allowed_strategies = ["mean", "median", "most_frequent", "constant"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " @@ -195,8 +195,11 @@ def _validate_input(self, X): force_all_finite = "allow-nan" try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): raise ValueError("Cannot use {0} strategy with non-numeric " @@ -229,7 +232,7 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" # otherwise @@ -374,7 +377,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) statistics = self.statistics_ @@ -560,13 +563,15 @@ def _get_missing_features_info(self, X): return imputer_mask, features_indices - def _validate_input(self, X): + def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " @@ -601,7 +606,7 @@ def _fit(self, X, y=None): The imputer mask of the original data. 
""" - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -653,7 +658,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 425293c9ee8ca..eab0370050abd 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -1574,6 +1574,7 @@ def fit(self, X, y, sample_weight=None): self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ return self diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 589a45a1e63d1..6680882bc2694 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -83,7 +83,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return check_array(X, accept_sparse=self.accept_sparse) + return self._validate_X(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): @@ -156,5 +156,5 @@ def _transform(self, X, func=None, kw_args=None): return func(X, **(kw_args if kw_args else {})) def _more_tags(self): - return {'no_validation': True, + return {'no_validation': not self.validate, 'stateless': True} diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bf7dcfd2e17a7..afed782a29dee 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -354,17 +354,17 @@ def partial_fit(self, X, y=None): raise TypeError("MinMaxScaler does no support sparse input. " "You may consider to use MaxAbsScaler instead.") - X = check_array(X, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + first_pass = not hasattr(self, 'n_samples_seen_') + check_n_features = not first_pass + X = self._validate_X(X, check_n_features=check_n_features, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next steps else: data_min = np.minimum(self.data_min_, data_min) data_max = np.maximum(self.data_max_, data_max) @@ -928,9 +928,11 @@ def partial_fit(self, X, y=None): y Ignored """ - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + first_pass = not hasattr(self, 'n_samples_seen_') + check_n_features = not first_pass + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -938,10 +940,8 @@ def partial_fit(self, X, y=None): else: max_abs = np.nanmax(np.abs(X), axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next passes else: max_abs = np.maximum(self.max_abs_, max_abs) self.n_samples_seen_ += X.shape[0] @@ -1988,7 +1988,7 @@ def fit(self, K, y=None): self : returns an instance of self. 
""" - K = check_array(K, dtype=FLOAT_DTYPES) + K = self._validate_X(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2298,7 +2298,7 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - X = self._check_inputs(X, copy=False) + X = self._check_inputs(X, in_fit=True, copy=False) n_samples = X.shape[0] if self.n_quantiles > n_samples: @@ -2389,11 +2389,13 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _check_inputs(self, X, accept_sparse_negative=False, copy=False): + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, + copy=False): """Check inputs before fit and transform""" - X = check_array(X, accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -2469,7 +2471,7 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, copy=self.copy) + X = self._check_inputs(X, in_fit=False, copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2490,7 +2492,8 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, accept_sparse_negative=True, copy=self.copy) + X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, + copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -2746,7 +2749,8 @@ def fit_transform(self, X, y=None): return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): - X = self._check_input(X, check_positive=True, check_method=True) + X = self._check_input(X, in_fit=True, check_positive=True, + check_method=True) if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace @@ -2788,7 +2792,8 @@ def transform(self, X): The transformed data. """ check_is_fitted(self) - X = self._check_input(X, check_positive=True, check_shape=True) + X = self._check_input(X, in_fit=False, check_positive=True, + check_shape=True) transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform @@ -2834,7 +2839,7 @@ def inverse_transform(self, X): The original data """ check_is_fitted(self) - X = self._check_input(X, check_shape=True) + X = self._check_input(X, in_fit=False, check_shape=True) if self.standardize: X = self._scaler.inverse_transform(X) @@ -2939,7 +2944,7 @@ def _neg_log_likelihood(lmbda): # choosing bracket -2, 2 like for boxcox return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) - def _check_input(self, X, check_positive=False, check_shape=False, + def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2957,8 +2962,10 @@ def _check_input(self, X, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, - force_all_finite='allow-nan') + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9d00cb826f58d..4f09cbcc646cf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -271,7 +271,8 @@ def _yield_all_checks(name, estimator): yield check_dict_unchanged yield check_dont_overwrite_parameters yield check_fit_idempotent - yield check_n_features_in + if not tags["no_validation"]: + yield check_n_features_in if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2655,33 +2656,10 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called. - if 'Dummy' in name: - # Dummy estimators don't validate X at all - return if any(x in name for x in ('FastICA', 'KMeans')): # fit calls public function helper and validates there. No way to # access `self` from the helper. return - if 'FunctionTransformer' in name: - # Validation is optional and False by default - return - if 'KernelCenterer' in name: - # Takes kernel K with shape (n_samples, n_samples) as input, not X - return - if any(x in name for x in ('LatentDirichlet', 'MissingIndicator', - 'PowerTransformer', 'QuantileTransformer', - 'SimpleImputer', 'AdaBoost')): - # fit calls private validation method, which is also called for - # predict, transform, etc - return - if any(x in name for x in ('MaxAbsScaler', 'MinMaxScaler')): - # Fit directly calls partial_fit. Don't know what to do with - # partial_fit. - return - if name in 'RidgeCV': - # Uses aggregation from an estimator that is not an attribute. There is - # no way to delegate to this estimator. - return rng = np.random.RandomState(0) @@ -2701,4 +2679,4 @@ def check_n_features_in(name, estimator_orig): assert not hasattr(estimator, 'n_features_in_') estimator.fit(X, y) - assert hasattr(estimator, 'n_features_in_') + assert estimator.n_features_in_ == X.shape[1] From 70dc4ed8d5d5e3e58b86c97e80515d35e2b08da1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 11:56:26 -0400 Subject: [PATCH 15/53] fixed tests in test_data.py --- sklearn/preprocessing/data.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 011a3c2ea6a58..1e4c421e68ae8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2392,7 +2392,13 @@ def _transform_col(self, X_col, quantiles, inverse): def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): """Check inputs before fit and transform""" - check_n_features = not in_fit + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False + X = self._validate_X(X, check_n_features=check_n_features, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -2962,7 +2968,12 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - check_n_features = not in_fit + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False X = self._validate_X(X, check_n_features=check_n_features, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, force_all_finite='allow-nan') From 988f9c4d93e4dafc14de7c9a35847ccfff8eebf1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 15:26:54 -0400 Subject: [PATCH 16/53] Fixed some tests --- sklearn/decomposition/online_lda.py | 7 ++- sklearn/ensemble/weight_boosting.py | 4 +- sklearn/impute/_knn.py | 5 +- sklearn/utils/tests/test_estimator_checks.py | 55 +++++++++++--------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index b288eb4e512f2..2a1bc9ac27f73 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -500,7 +500,12 @@ def partial_fit(self, X, y=None): """ self._check_params() first_time = not hasattr(self, 'components_') - check_n_features = not first_time + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False X = self._check_non_neg_array(X, check_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index ff321de00186b..77f970926553c 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -71,7 +71,7 @@ def __init__(self, self.random_state = random_state def _validate_data(self, X): - return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=False, + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): @@ -101,7 +101,7 @@ def fit(self, X, y, sample_weight=None): X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - ensure_2d=False, + ensure_2d=True, allow_nd=True, dtype=None, y_numeric=is_regressor(self)) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 0837cc9750e0a..55cc5072969f4 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -157,8 +157,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. 
Got {}".format(self.n_neighbors)) - X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_X(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) _check_weights(self.weights) self._fit_X = X diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index e26a508566871..8c3a2a0bd4bf1 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -56,7 +56,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self def predict(self, X): @@ -71,7 +71,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -81,14 +81,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -105,7 +105,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -122,7 +122,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -141,19 +141,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -165,7 +165,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -178,10 +178,11 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) # Function is only called after we verify that pandas is installed from pandas import Series if isinstance(sample_weight, Series): @@ -218,7 +219,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def fit(self, X, y=None): - X = check_array(X) + X = self._validate_X(X) return self def transform(self, X): @@ -229,10 +230,11 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) return self def predict(self, X): @@ -245,11 +247,12 @@ def predict(self, X): class 
LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc", "coo"), - accept_large_sparse=True, - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc", "coo"), + accept_large_sparse=True, + multi_output=True, + y_numeric=True) if sp.issparse(X): if X.getformat() == "coo": if X.row.dtype == "int64" or X.col.dtype == "int64": @@ -265,7 +268,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = check_array(X).shape + self.X_shape_ = self._validate_X(X).shape return self def fit_transform(self, X, y=None): @@ -296,7 +299,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y) From fd9b72ce28ab333d4e38628c6dc0ee406a6648e1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 16:58:36 -0400 Subject: [PATCH 17/53] validate twice for Kmeans and FastICA --- sklearn/cluster/k_means_.py | 11 +++++++++-- sklearn/decomposition/fastica_.py | 3 +++ sklearn/utils/estimator_checks.py | 7 +------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index a83df9c836b86..fe332ff06c98c 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -953,6 +953,13 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) + # This validates twice but there is not clean way to avoid validation + # in k_means. + order = "C" if self.copy_x else None + X = self._validate_X(X, accept_sparse='csr', + dtype=[np.float64, np.float32], order=order, + copy=self.copy_x) + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ k_means( X, n_clusters=self.n_clusters, sample_weight=sample_weight, @@ -1482,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index dffce0dc0d8bc..c4dc9114eb5d6 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -501,6 +501,9 @@ def _fit(self, X, compute_sources=False): ------- X_new : array-like, shape (n_samples, n_components) """ + + X = self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args whitening, unmixing, sources, X_mean, self.n_iter_ = fastica( X=X, n_components=self.n_components, algorithm=self.algorithm, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4f09cbcc646cf..b29e8800b2dbf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2654,12 +2654,7 @@ def check_fit_idempotent(name, estimator_orig): def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is - # called. 
- - if any(x in name for x in ('FastICA', 'KMeans')): - # fit calls public function helper and validates there. No way to - # access `self` from the helper. - return + # called, and that its value is correct. rng = np.random.RandomState(0) From 4f3d6fff2a578d3e36f81dc55eab9212ca16f47a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 17:45:46 -0400 Subject: [PATCH 18/53] again --- sklearn/cluster/k_means_.py | 4 ++-- sklearn/decomposition/fastica_.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index fe332ff06c98c..4ad3980775ae6 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -1489,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - X = self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index c4dc9114eb5d6..b32ce59715fbe 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -502,8 +502,8 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ - X = self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T + self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args whitening, unmixing, sources, X_mean, self.n_iter_ = fastica( X=X, n_components=self.n_components, algorithm=self.algorithm, From 08f71924ae54f23054a468667877d48d54089e58 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 18:09:42 -0400 Subject: [PATCH 19/53] and again --- sklearn/cluster/k_means_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 4ad3980775ae6..50af23b8d046c 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -956,9 +956,9 @@ def fit(self, X, y=None, sample_weight=None): # This validates twice but there is not clean way to avoid validation # in k_means. 
order = "C" if self.copy_x else None - X = self._validate_X(X, accept_sparse='csr', - dtype=[np.float64, np.float32], order=order, - copy=self.copy_x) + self._validate_X(X, accept_sparse='csr', + dtype=[np.float64, np.float32], order=order, + copy=self.copy_x) self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ k_means( @@ -1489,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" From f0e7b413b9d8fb9ebe0b1f3a185a7e2b41df0c0d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 6 Sep 2019 09:17:57 -0400 Subject: [PATCH 20/53] should fix dep warning error --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 798c581add8d4..259fe89e712f7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -568,7 +568,7 @@ def n_features_in_(self): # For consistency with other estimators we raise a AttributeError so # that hasattr() fails if the search estimator isn't fitted. try: - check_is_fitted(self, 'best_estimator_') + check_is_fitted(self) except NotFittedError as nfe: raise AttributeError( "{} object has no n_features_in_ attribute." From 193fda1598011ea5f206c8857b0caa91954995ec Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:07:14 -0400 Subject: [PATCH 21/53] removed superfluous tests --- sklearn/tests/test_base.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 372ed7d6ea47d..e25517b5122b7 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -514,20 +514,6 @@ def test_regressormixin_score_multioutput(): assert_warns_message(FutureWarning, msg, reg.score, X, y) -def test_validate_X_n_feature_mismatch(): - # Make sure ValueError is raised when there is a n_features mismatch - # between fit and predict/transform - - X = [[0, 1], [2, 3]] - - ss = StandardScaler().fit(X) - ss.transform(X) # All good - - with pytest.raises(ValueError, match="X has 3 features, but"): - X_more_features = [[0, 1, 4], [2, 3, 5]] - ss.transform(X_more_features) - - def test_validate_X_bad_kwargs(): est = BaseEstimator() @@ -536,20 +522,6 @@ def test_validate_X_bad_kwargs(): est._validate_X([1], bad_param=4) -def test_n_features_in_attribute(): - # Make sure n_features_in_ is correctly set. 
- # TODO: eventually move this in estimator_checks - X_2 = [[0, 1], [2, 3]] - X_3 = [[0, 1, 4], [2, 3, 5]] - - ss = StandardScaler() - assert not hasattr(ss, 'n_features_in_') - ss.fit(X_2) - assert ss.n_features_in_ == 2 - ss = ss.fit(X_3) - assert ss.n_features_in_ == 3 - - def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): def __init__(self, param=5): From 5b20a4c0b20352a15c61bdb448f64d7b2644fe91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:28:25 -0400 Subject: [PATCH 22/53] Added specific tests for vectorizers --- sklearn/feature_extraction/tests/test_text.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 7b7697ff47fff..a6170e4efbec2 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1343,3 +1343,16 @@ def test_unused_parameters_warn(Vectorizer, stop_words, ) with pytest.warns(UserWarning, match=msg): vect.fit(train_data) + + +@pytest.mark.parametrize('Vectorizer, X', ( + (HashingVectorizer, [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]), + (CountVectorizer, JUNK_FOOD_DOCS)) +) +def test_n_features_in(Vectorizer, X): + # For vectorizers, n_features_in_ does not make sense and it is always + # None + vectorizer = Vectorizer() + assert vectorizer.n_features_in_ is None + vectorizer.fit(X) + assert vectorizer.n_features_in_ is None From a49e5eaee5c3b1325353e6c4853ccda6cf1c2bd3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:29:28 -0400 Subject: [PATCH 23/53] flake8 --- sklearn/tests/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index e25517b5122b7..0d365b9ba882a 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -16,7 +16,6 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV -from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor From 968fbff2deb0eb95ada2ad4efc80521d9a9d5ab8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Sep 2019 11:24:30 -0400 Subject: [PATCH 24/53] Dummies now have n_feautures_in_ to None and raise error if not fitted --- sklearn/dummy.py | 29 +++++++++++++++++++++++++++++ sklearn/tests/test_dummy.py | 10 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index ab79321bd4fa3..b4d173b76bc41 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,6 +17,7 @@ from .utils.random import random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution +from .exceptions import NotFittedError class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): @@ -352,6 +353,20 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return None # Dummies don't validate the input + class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): """ @@ -551,3 +566,17 @@ def score(self, X, y, sample_weight=None): if X is None: X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return None # Dummies don't validate the input diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 4301a4c07654f..1e9623a27e59d 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -737,3 +737,13 @@ def test_dtype_of_classifier_probas(strategy): probas = model.fit(X, y).predict_proba(X) assert probas.dtype == np.float64 + + +@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier)) +def test_n_features_in_(Dummy): + X = [[1, 2]] + y = [0] + d = Dummy() + assert not hasattr(d, 'n_features_in_') + d.fit(X, y) + assert d.n_features_in_ is None From e4faf13bdb63f037fa4e0bd8d03efe1c0d006cdf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Sep 2019 11:31:13 -0400 Subject: [PATCH 25/53] still don't check n_features_in_ for LDA (will be done in later PR) --- sklearn/decomposition/online_lda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 2a1bc9ac27f73..3c0bcb9372bd9 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -621,7 +621,7 @@ def _unnormalized_transform(self, X): # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array( - X, check_n_features=True, + X, check_n_features=False, whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: @@ -746,7 +746,7 @@ def score(self, X, y=None): score : float Use approximate bound as score. 
""" - X = self._check_non_neg_array(X, check_n_features=True, + X = self._check_non_neg_array(X, check_n_features=False, whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) @@ -777,7 +777,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, check_is_fitted(self) X = self._check_non_neg_array( - X, check_n_features=True, + X, check_n_features=False, whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: From a88a4c5f40a9ab0e7d2774918fa44ae688224d54 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Sep 2019 13:37:01 -0400 Subject: [PATCH 26/53] Added tests for some estimators --- sklearn/cluster/bicluster.py | 2 +- sklearn/cluster/tests/test_bicluster.py | 10 ++++++++++ sklearn/ensemble/tests/test_voting.py | 20 ++++++++++++++++++++ sklearn/model_selection/tests/test_search.py | 7 ++++++- sklearn/utils/estimator_checks.py | 1 + 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 5bfd335549012..3b54df43fe295 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -115,7 +115,7 @@ def fit(self, X, y=None): y : Ignored """ - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 1d88769f238aa..6044c90d11412 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -256,3 +256,13 @@ def test_wrong_shape(): data = np.arange(27).reshape((3, 3, 3)) with pytest.raises(ValueError): model.fit(data) + + +@pytest.mark.parametrize('est', (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, 'n_features_in_') + est.fit(X) + assert est.n_features_in_ == 3 diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index a02efe4d925d8..bbfb91751726a 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -528,3 +528,23 @@ def test_check_estimators_voting_estimator(estimator): # their testing parameters (for required parameters). 
check_estimator(estimator) check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) + + +@pytest.mark.parametrize( + "est", + [VotingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))]), + VotingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))])], + ids=['VotingRegressor', 'VotingClassifier'] +) +def test_n_features_in(est): + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, 'n_features_in_') + est.fit(X, y) + assert est.n_features_in_ == 2 diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index dd92f144fac32..3ca0cf4f4cc5a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1780,12 +1780,17 @@ def get_n_splits(self, *args, **kw): def test_n_features_in(): - # make sure grid search delegates n_features_in to the best estimator + # make sure grid search and random search delegate n_features_in to the + # best estimator n_features = 4 X, y = make_classification(n_features=n_features) gbdt = HistGradientBoostingClassifier() param_grid = {'max_iter': [3, 4]} gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) assert not hasattr(gs, 'n_features_in_') + assert not hasattr(rs, 'n_features_in_') gs.fit(X, y) + rs.fit(X, y) assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2cc9daf54be32..c3a0fb3d28c4e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2664,6 +2664,7 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called, and that its value is correct. + print(name) rng = np.random.RandomState(0) estimator = clone(estimator_orig) From f3fb539fbdbb61215cc12d1f61586cd63278049e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Sep 2019 13:59:37 -0400 Subject: [PATCH 27/53] removed NonRectangularInputMixin and set n_features_in to SparseCoder --- sklearn/base.py | 9 --------- sklearn/cluster/tests/test_bicluster.py | 3 ++- sklearn/decomposition/dict_learning.py | 1 + sklearn/decomposition/tests/test_dict_learning.py | 6 ++++++ sklearn/feature_extraction/dict_vectorizer.py | 5 ++--- sklearn/feature_extraction/tests/test_dict_vectorizer.py | 4 ++-- sklearn/feature_extraction/tests/test_text.py | 7 +++---- sklearn/feature_extraction/text.py | 4 ++-- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 5b8f7637c5261..36a849d2e2c1e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -583,15 +583,6 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) -class NonRectangularInputMixin: - """Mixin class for all estimators with non-rectangular input. - - For now only vectorizers are relevant for this mixin. 
- """ - - n_features_in_ = None - - class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" _estimator_type = "DensityEstimator" diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 6044c90d11412..5057480572a6b 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -258,7 +258,8 @@ def test_wrong_shape(): model.fit(data) -@pytest.mark.parametrize('est', (SpectralBiclustering(), SpectralCoclustering())) +@pytest.mark.parametrize('est', + (SpectralBiclustering(), SpectralCoclustering())) def test_n_features_in_(est): X, _, _ = make_biclusters((3, 3), 3, random_state=0) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index ce3ed2aa44978..9ed705d680059 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1024,6 +1024,7 @@ def __init__(self, dictionary, transform_algorithm='omp', transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter) self.components_ = dictionary + self.n_features_in_ = dictionary.shape[1] def fit(self, X, y=None): """Do nothing and return the estimator unchanged diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 54c5ece561f18..af8a1869626f3 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -498,3 +498,9 @@ def test_sparse_coder_parallel_mmap(): sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 8ba68ea19efaf..857806c892806 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -9,7 +9,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin +from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence @@ -21,8 +21,7 @@ def _tosequence(X): return tosequence(X) -class DictVectorizer(NonRectangularInputMixin, TransformerMixin, - BaseEstimator): +class DictVectorizer(TransformerMixin, BaseEstimator): """Transforms lists of feature-value mappings to vectors. 
This transformer turns lists of mappings (dict-like objects) of feature diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 32a14fe82be5b..a65feb2d7590b 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -116,7 +116,7 @@ def test_n_features_in(): # For vectorizers, n_features_in_ does not make sense and it is always # None dv = DictVectorizer() - assert dv.n_features_in_ is None + assert not hasattr(dv, 'n_features_in_') d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] dv.fit(d) - assert dv.n_features_in_ is None + assert not hasattr(dv, 'n_features_in_') diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index a6170e4efbec2..f775589fb6a8a 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1350,9 +1350,8 @@ def test_unused_parameters_warn(Vectorizer, stop_words, (CountVectorizer, JUNK_FOOD_DOCS)) ) def test_n_features_in(Vectorizer, X): - # For vectorizers, n_features_in_ does not make sense and it is always - # None + # For vectorizers, n_features_in_ does not make sense vectorizer = Vectorizer() - assert vectorizer.n_features_in_ is None + assert not hasattr(vectorizer, 'n_features_in_') vectorizer.fit(X) - assert vectorizer.n_features_in_ is None + assert not hasattr(vectorizer, 'n_features_in_') diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 908cfcd526677..610a9f66c58ff 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,7 +25,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin +from ..base import BaseEstimator, TransformerMixin from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS @@ -181,7 +181,7 @@ def _check_stop_list(stop): return frozenset(stop) -class VectorizerMixin(NonRectangularInputMixin): +class VectorizerMixin: """Provides common code for text vectorizers (tokenization logic).""" _white_spaces = re.compile(r"\s\s+") From 4b7b7581ab0fd3c379d17fe29b331ebedce05f3e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Sep 2019 09:13:23 -0400 Subject: [PATCH 28/53] simpler logic for dummies --- sklearn/dummy.py | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index d4c9ed730caa0..f0d665704377e 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -129,6 +129,7 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] check_consistent_length(X, y, sample_weight) + self.n_features_in_ = None # No input validation is done for X if self.strategy == "constant": if self.constant is None: @@ -356,20 +357,6 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." 
- .format(self.__class__.__name__) - ) from nfe - - return None # Dummies don't validate the input - class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): """ @@ -441,6 +428,7 @@ def fit(self, X, y, sample_weight=None): % (self.strategy, allowed_strategies)) y = check_array(y, ensure_2d=False) + self.n_features_in_ = None # No input validation is done for X if len(y) == 0: raise ValueError("y must not be empty.") @@ -569,17 +557,3 @@ def score(self, X, y, sample_weight=None): if X is None: X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__) - ) from nfe - - return None # Dummies don't validate the input From 53027d36eadb8f1df06a768238bdbac04cf060da Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Sep 2019 09:21:58 -0400 Subject: [PATCH 29/53] comments --- sklearn/cluster/k_means_.py | 2 +- sklearn/decomposition/fastica_.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 50af23b8d046c..7a398e2985e0f 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -954,7 +954,7 @@ def fit(self, X, y=None, sample_weight=None): random_state = check_random_state(self.random_state) # This validates twice but there is not clean way to avoid validation - # in k_means. + # in k_means(). Please see issue 14897. order = "C" if self.copy_x else None self._validate_X(X, accept_sparse='csr', dtype=[np.float64, np.float32], order=order, diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index b32ce59715fbe..7815ccd2b0ae8 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -502,6 +502,8 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ + # This validates twice but there is not clean way to avoid validation + # in fastica(). Please see issue 14897. 
self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args From a1aea70fc8b05b0156cbd609a89688b0aa9d9a15 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 15 Sep 2019 18:22:30 -0400 Subject: [PATCH 30/53] pep8 --- sklearn/dummy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index fad508926ae83..6dc524b778e45 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,7 +17,6 @@ from .utils.random import random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution -from .exceptions import NotFittedError from .utils import deprecated From 9ecc396079fa43f887a62414578bbddec59c130d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 15 Sep 2019 18:24:33 -0400 Subject: [PATCH 31/53] remove print --- sklearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3e8d65a584e00..285b318589d3d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2671,7 +2671,6 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called, and that its value is correct. - print(name) rng = np.random.RandomState(0) estimator = clone(estimator_orig) From 9292c8459c54a7b3f1dd190bd7f335ffc1373a73 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 16 Sep 2019 11:05:23 -0400 Subject: [PATCH 32/53] avoid dep warning --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 285b318589d3d..5a10bf6db88fe 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2685,7 +2685,7 @@ def check_n_features_in(name, estimator_orig): y = rng.normal(size=n_samples) else: y = rng.randint(low=0, high=2, size=n_samples) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) assert not hasattr(estimator, 'n_features_in_') estimator.fit(X, y) From 6846bea5babb61dacdabb69bee5bd343074e847b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 19 Sep 2019 08:50:29 -0400 Subject: [PATCH 33/53] merged (maybe) --- doc/glossary.rst | 1 + doc/modules/classes.rst | 4 + doc/modules/ensemble.rst | 138 +++- doc/modules/neighbors.rst | 91 ++- doc/whats_new/v0.22.rst | 39 + examples/ensemble/plot_stack_predictors.py | 123 +++ .../approximate_nearest_neighbors.py | 293 ++++++++ .../plot_caching_nearest_neighbors.py | 60 ++ examples/neighbors/plot_nca_illustration.py | 59 +- sklearn/cluster/dbscan_.py | 36 +- sklearn/cluster/spectral.py | 31 +- sklearn/cluster/tests/test_dbscan.py | 17 + sklearn/cluster/tests/test_spectral.py | 20 + sklearn/datasets/samples_generator.py | 32 +- .../datasets/tests/test_samples_generator.py | 30 + sklearn/ensemble/__init__.py | 3 + sklearn/ensemble/_stacking.py | 704 ++++++++++++++++++ sklearn/ensemble/base.py | 17 + sklearn/ensemble/tests/test_stacking.py | 492 ++++++++++++ sklearn/ensemble/tests/test_voting.py | 4 +- sklearn/ensemble/voting.py | 18 +- sklearn/manifold/_utils.pyx | 30 +- sklearn/manifold/isomap.py | 78 +- sklearn/manifold/locally_linear.py | 2 +- sklearn/manifold/spectral_embedding_.py | 40 +- sklearn/manifold/t_sne.py | 116 ++- sklearn/manifold/tests/test_isomap.py | 52 ++ .../manifold/tests/test_spectral_embedding.py | 34 +- 
sklearn/manifold/tests/test_t_sne.py | 183 ++--- sklearn/neighbors/__init__.py | 3 + sklearn/neighbors/base.py | 389 ++++++++-- sklearn/neighbors/classification.py | 49 +- sklearn/neighbors/graph.py | 319 +++++++- sklearn/neighbors/lof.py | 21 +- sklearn/neighbors/regression.py | 24 +- sklearn/neighbors/tests/test_graph.py | 79 ++ sklearn/neighbors/tests/test_neighbors.py | 184 ++++- .../tests/test_neighbors_pipeline.py | 221 ++++++ sklearn/neighbors/unsupervised.py | 32 +- sklearn/tests/test_common.py | 5 +- sklearn/utils/estimator_checks.py | 43 +- 41 files changed, 3603 insertions(+), 513 deletions(-) create mode 100644 examples/ensemble/plot_stack_predictors.py create mode 100644 examples/neighbors/approximate_nearest_neighbors.py create mode 100644 examples/neighbors/plot_caching_nearest_neighbors.py create mode 100644 sklearn/ensemble/_stacking.py create mode 100644 sklearn/ensemble/tests/test_stacking.py create mode 100644 sklearn/neighbors/tests/test_graph.py create mode 100644 sklearn/neighbors/tests/test_neighbors_pipeline.py diff --git a/doc/glossary.rst b/doc/glossary.rst index 99f512cc49acc..1c5535cdc2bb4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -697,6 +697,7 @@ General Concepts to :term:`unlabeled` samples in semi-supervised classification. sparse matrix + sparse graph A representation of two-dimensional numeric data that is more memory efficient the corresponding dense numpy array where almost all elements are zero. We use the :mod:`scipy.sparse` framework, which provides diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 16658a39b1612..444895245bf6b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -422,6 +422,8 @@ Samples generator ensemble.RandomForestClassifier ensemble.RandomForestRegressor ensemble.RandomTreesEmbedding + ensemble.StackingClassifier + ensemble.StackingRegressor ensemble.VotingClassifier ensemble.VotingRegressor ensemble.HistGradientBoostingRegressor @@ -1234,9 +1236,11 @@ Model validation neighbors.KernelDensity neighbors.KNeighborsClassifier neighbors.KNeighborsRegressor + neighbors.KNeighborsTransformer neighbors.LocalOutlierFactor neighbors.RadiusNeighborsClassifier neighbors.RadiusNeighborsRegressor + neighbors.RadiusNeighborsTransformer neighbors.NearestCentroid neighbors.NearestNeighbors neighbors.NeighborhoodComponentsAnalysis diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b023b4711c57f..02be4f4cff624 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -722,7 +722,7 @@ The parameter ``learning_rate`` strongly interacts with the parameter ``n_estimators``, the number of weak learners to fit. Smaller values of ``learning_rate`` require larger numbers of weak learners to maintain a constant training error. Empirical evidence suggests that small -values of ``learning_rate`` favor better test error. [HTF2009]_ +values of ``learning_rate`` favor better test error. [HTF]_ recommend to set the learning rate to a small constant (e.g. ``learning_rate <= 0.1``) and choose ``n_estimators`` by early stopping. For a more detailed discussion of the interaction between @@ -1056,7 +1056,9 @@ The following example shows how to fit the majority rule classifier:: >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1) >>> clf3 = GaussianNB() - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... 
voting='hard') >>> for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']): ... scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5) @@ -1142,7 +1144,10 @@ hyperparameters of the individual estimators:: >>> clf1 = LogisticRegression(random_state=1) >>> clf2 = RandomForestClassifier(random_state=1) >>> clf3 = GaussianNB() - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft' + ... ) >>> params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]} @@ -1156,13 +1161,17 @@ In order to predict the class labels based on the predicted class-probabilities (scikit-learn estimators in the VotingClassifier must support ``predict_proba`` method):: - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft' + ... ) Optionally, weights can be provided for the individual classifiers:: - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], - ... voting='soft', weights=[2, 5, 1]) - + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft', weights=[2,5,1] + ... ) .. _voting_regressor: @@ -1187,7 +1196,7 @@ The following example shows how to fit the VotingRegressor:: >>> # Loading some example data >>> X, y = load_boston(return_X_y=True) - + >>> # Training classifiers >>> reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10) >>> reg2 = RandomForestRegressor(random_state=1, n_estimators=10) @@ -1203,3 +1212,116 @@ The following example shows how to fit the VotingRegressor:: .. topic:: Examples: * :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py` + +.. _stacking: + +Stacked generalization +====================== + +Stacked generalization is a method for combining estimators to reduce their +biases [W1992]_ [HTF]_. More precisely, the predictions of each individual +estimator are stacked together and used as input to a final estimator to +compute the prediction. This final estimator is trained through +cross-validation. + +The :class:`StackingClassifier` and :class:`StackingRegressor` provide such +strategies which can be applied to classification and regression problems. + +The `estimators` parameter corresponds to the list of the estimators which +are stacked together in parallel on the input data. It should be given as a +list of names and estimators:: + + >>> from sklearn.linear_model import RidgeCV, LassoCV + >>> from sklearn.svm import SVR + >>> estimators = [('ridge', RidgeCV()), + ... ('lasso', LassoCV(random_state=42)), + ... ('svr', SVR(C=1, gamma=1e-6))] + +The `final_estimator` will use the predictions of the `estimators` as input. It +needs to be a classifier or a regressor when using :class:`StackingClassifier` +or :class:`StackingRegressor`, respectively:: + + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> reg = StackingRegressor( + ... estimators=estimators, + ... 
final_estimator=GradientBoostingRegressor(random_state=42))
+
+To train the `estimators` and `final_estimator`, the `fit` method needs
+to be called on the training data::
+
+    >>> from sklearn.datasets import load_boston
+    >>> X, y = load_boston(return_X_y=True)
+    >>> from sklearn.model_selection import train_test_split
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
+    ...                                                     random_state=42)
+    >>> reg.fit(X_train, y_train)
+    StackingRegressor(...)
+
+During training, the `estimators` are fitted on the whole training data
+`X_train`. They will be used when calling `predict` or `predict_proba`. To
+generalize and avoid over-fitting, the `final_estimator` is trained on
+out-of-sample predictions, using
+:func:`sklearn.model_selection.cross_val_predict` internally.
+
+For :class:`StackingClassifier`, note that the output of the ``estimators`` is
+controlled by the parameter `stack_method`, which specifies the method called
+on each estimator. This parameter is either a string giving an estimator
+method name, or `'auto'`, which automatically picks the first available
+method, tested in the order of preference: `predict_proba`,
+`decision_function` and `predict`.
+
+A :class:`StackingRegressor` and :class:`StackingClassifier` can be used as
+any other regressor or classifier, exposing `predict`, `predict_proba`, and
+`decision_function` methods, e.g.::
+
+    >>> y_pred = reg.predict(X_test)
+    >>> from sklearn.metrics import r2_score
+    >>> print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))
+    R2 score: 0.81
+
+Note that it is also possible to get the stacked outputs of the
+`estimators` using the `transform` method::
+
+    >>> reg.transform(X_test[:5])
+    array([[28.78..., 28.43... , 22.62...],
+           [35.96..., 32.58..., 23.68...],
+           [14.97..., 14.05..., 16.45...],
+           [25.19..., 25.54..., 22.92...],
+           [18.93..., 19.26..., 17.03... ]])
+
+In practice, a stacking predictor predicts as well as the best predictor of
+the base layer, and sometimes even outperforms it by combining the different
+strengths of these predictors. However, training a stacking predictor is
+computationally expensive.
+
+.. note::
+   For :class:`StackingClassifier`, when using `stack_method_='predict_proba'`,
+   the first column is dropped when the problem is a binary classification
+   problem. Indeed, both probability columns predicted by each estimator are
+   perfectly collinear.
+
+.. note::
+   Multiple stacking layers can be achieved by assigning `final_estimator` to
+   a :class:`StackingClassifier` or :class:`StackingRegressor`::
+
+     >>> final_layer = StackingRegressor(
+     ...     estimators=[('rf', RandomForestRegressor(random_state=42)),
+     ...                 ('gbrt', GradientBoostingRegressor(random_state=42))],
+     ...     final_estimator=RidgeCV()
+     ...     )
+     >>> multi_layer_regressor = StackingRegressor(
+     ...     estimators=[('ridge', RidgeCV()),
+     ...                 ('lasso', LassoCV(random_state=42)),
+     ...                 ('svr', SVR(C=1, gamma=1e-6, kernel='rbf'))],
+     ...     final_estimator=final_layer
+     ...     )
+     >>> multi_layer_regressor.fit(X_train, y_train)
+     StackingRegressor(...)
+     >>> print('R2 score: {:.2f}'
+     ...       .format(multi_layer_regressor.score(X_test, y_test)))
+     R2 score: 0.82
+
+.. topic:: References
+
+   .. [W1992] Wolpert, David H. "Stacked generalization." Neural networks 5.2
+      (1992): 241-259.
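The doctests above only exercise :class:`StackingRegressor`; the classifier
counterpart is described in prose only. As a purely illustrative sketch (the
iris data and the choice of `LinearSVC` base estimator and
`LogisticRegression` final estimator are arbitrary, not prescribed by this
patch), :class:`StackingClassifier` follows the same pattern::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    clf = StackingClassifier(
        estimators=[('rf', RandomForestClassifier(random_state=42)),
                    ('svc', LinearSVC(random_state=42))],
        # 'auto' resolves to predict_proba for the forest and to
        # decision_function for LinearSVC (which has no predict_proba)
        stack_method='auto',
        final_estimator=LogisticRegression())

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

With `stack_method='auto'`, the forest contributes `predict_proba` outputs
while `LinearSVC` falls back to `decision_function`, matching the preference
order described above.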
diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index e07e66e833919..7c7da8f0afdb7 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -135,8 +135,8 @@ have the same interface; we'll show an example of using the KD Tree here: Refer to the :class:`KDTree` and :class:`BallTree` class documentation for more information on the options available for nearest neighbors searches, -including specification of query strategies, distance metrics, etc. For a list -of available metrics, see the documentation of the :class:`DistanceMetric` +including specification of query strategies, distance metrics, etc. For a list +of available metrics, see the documentation of the :class:`DistanceMetric` class. .. _classification: @@ -160,8 +160,8 @@ training point, where :math:`r` is a floating-point value specified by the user. The :math:`k`-neighbors classification in :class:`KNeighborsClassifier` -is the most commonly used technique. The optimal choice of the value :math:`k` -is highly data-dependent: in general a larger :math:`k` suppresses the effects +is the most commonly used technique. The optimal choice of the value :math:`k` +is highly data-dependent: in general a larger :math:`k` suppresses the effects of noise, but makes the classification boundaries less distinct. In cases where the data is not uniformly sampled, radius-based neighbors @@ -320,7 +320,7 @@ To address the inefficiencies of KD Trees in higher dimensions, the *ball tree* data structure was developed. Where KD trees partition data along Cartesian axes, ball trees partition data in a series of nesting hyper-spheres. This makes tree construction more costly than that of the -KD tree, but results in a data structure which can be very efficient on +KD tree, but results in a data structure which can be very efficient on highly structured data, even in very high dimensions. A ball tree recursively divides the data into @@ -509,6 +509,87 @@ the model from 0.81 to 0.82. * :ref:`sphx_glr_auto_examples_neighbors_plot_nearest_centroid.py`: an example of classification using nearest centroid with different shrink thresholds. +.. _neighbors_transformer: + +Nearest Neighbors Transformer +============================= + +Many scikit-learn estimators rely on nearest neighbors: Several classifiers and +regressors such as :class:`KNeighborsClassifier` and +:class:`KNeighborsRegressor`, but also some clustering methods such as +:class:`~sklearn.cluster.DBSCAN` and +:class:`~sklearn.cluster.SpectralClustering`, and some manifold embeddings such +as :class:`~sklearn.manifold.TSNE` and :class:`~sklearn.manifold.Isomap`. + +All these estimators can compute internally the nearest neighbors, but most of +them also accept precomputed nearest neighbors :term:`sparse graph`, +as given by :func:`~sklearn.neighbors.kneighbors_graph` and +:func:`~sklearn.neighbors.radius_neighbors_graph`. With mode +`mode='connectivity'`, these functions return a binary adjacency sparse graph +as required, for instance, in :class:`~sklearn.cluster.SpectralClustering`. +Whereas with `mode='distance'`, they return a distance sparse graph as required, +for instance, in :class:`~sklearn.cluster.DBSCAN`. To include these functions in +a scikit-learn pipeline, one can also use the corresponding classes +:class:`KNeighborsTransformer` and :class:`RadiusNeighborsTransformer`. +The benefits of this sparse graph API are multiple. + +First, the precomputed graph can be re-used multiple times, for instance while +varying a parameter of the estimator. 
This can be done manually by the user, or
+using the caching properties of the scikit-learn pipeline:
+
+    >>> from sklearn.manifold import Isomap
+    >>> from sklearn.neighbors import KNeighborsTransformer
+    >>> from sklearn.pipeline import make_pipeline
+    >>> estimator = make_pipeline(
+    ...     KNeighborsTransformer(n_neighbors=5, mode='distance'),
+    ...     Isomap(neighbors_algorithm='precomputed'),
+    ...     memory='/path/to/cache')
+
+Second, precomputing the graph can give finer control over the nearest
+neighbors estimation, for instance enabling multiprocessing through the
+parameter `n_jobs`, which might not be available in all estimators.
+
+Finally, the precomputation can be performed by custom estimators to use
+different implementations, such as approximate nearest neighbors methods, or
+implementations with special data types. The precomputed neighbors
+:term:`sparse graph` needs to be formatted as in
+:func:`~sklearn.neighbors.radius_neighbors_graph` output:
+
+* a CSR matrix (although COO, CSC or LIL will be accepted).
+* only explicitly store nearest neighborhoods of each sample with respect to
+  the training data. This should include those at 0 distance from a query
+  point, including the matrix diagonal when computing the nearest
+  neighborhoods between the training data and itself.
+* each row's `data` should store the distances in increasing order (optional;
+  unsorted data will be stable-sorted, adding a computational overhead).
+* all values in `data` should be non-negative.
+* there should be no duplicate `indices` in any row
+  (see https://github.com/scipy/scipy/issues/5807).
+* if the algorithm being passed the precomputed matrix uses k nearest
+  neighbors (as opposed to radius neighborhood), at least k neighbors must be
+  stored in each row (or k+1, as explained in the following note).
+
+.. note::
+   When a specific number of neighbors is queried (using
+   :class:`KNeighborsTransformer`), the definition of `n_neighbors` is
+   ambiguous since it can either include each training point as its own
+   neighbor, or exclude them. Neither choice is perfect, since including them
+   leads to a different number of non-self neighbors during training and
+   testing, while excluding them leads to a difference between
+   `fit(X).transform(X)` and `fit_transform(X)`, which is against the
+   scikit-learn API. In :class:`KNeighborsTransformer` we use the definition
+   which includes each training point as its own neighbor in the count of
+   `n_neighbors`. However, for compatibility reasons with other estimators
+   which use the other definition, one extra neighbor will be computed when
+   `mode == 'distance'`. To maximise compatibility with all estimators, a
+   safe choice is to always include one extra neighbor in a custom nearest
+   neighbors estimator, since unnecessary neighbors will be filtered by
+   following estimators.
+
+.. topic:: Examples:
+
+  * :ref:`sphx_glr_auto_examples_neighbors_neighbors_in_pipeline_api.py`: an
+    example of pipelining KNeighborsTransformer and TSNE, and of two custom
+    nearest neighbors estimators based on external packages.
 
 .. _nca:
 
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 9f60a735556cd..a793c4913ca5c 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -69,6 +69,13 @@ Changelog
   `sample_weights` are not supported by the wrapped estimator).
   :pr:`13575` by :user:`William de Vazelhes `.
 
+:mod:`sklearn.cluster`
+......................
+
+- |Feature| :class:`cluster.SpectralClustering` now accepts precomputed sparse
+  neighbors graph as input.
:issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + :mod:`sklearn.compose` ...................... @@ -111,6 +118,10 @@ Changelog :func:`datasets.fetch_20newsgroups` and :func:`datasets.fetch_olivetti_faces` . :pr:`14259` by :user:`Sourav Singh `. +- |Enhancement| :func:`datasets.make_classification` now accepts array-like + `weights` parameter, i.e. list or numpy.array, instead of list only. + :pr:`14764` by :user:`Cat Chenal `. + - |Fix| Fixed a bug in :func:`datasets.fetch_openml`, which failed to load an OpenML dataset that contains an ignored feature. :pr:`14623` by :user:`Sarra Habchi `. @@ -152,6 +163,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| Added :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` to stack predictors using a final + classifier or regressor. + :pr:`11047` by :user:`Guillaume Lemaitre ` and + :user:`Caio Oliveira `. + - Many improvements were made to :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor`: @@ -314,6 +331,14 @@ Changelog :mod:`sklearn.manifold` ....................... +- |Feature| :class:`manifold.Isomap`, :class:`manifold.TSNE`, and + :class:`manifold.SpectralEmbedding` now accept precomputed sparse + neighbors graph as input. :issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + +- |API| Deprecate ``training_data_`` unused attribute in + :class:`manifold.Isomap`. :issue:`10482` by `Tom Dupre la Tour`_. + - |Fix| Fixed a bug where :func:`manifold.spectral_embedding` (and therefore :class:`manifold.SpectralEmbedding` and :class:`cluster.SpectralClustering`) computed wrong eigenvalues with ``eigen_solver='amg'`` when @@ -397,6 +422,20 @@ Changelog :mod:`sklearn.neighbors` ........................ +- |MajorFeature| Added :class:`neighbors.KNeighborsTransformer` and + :class:`neighbors.RadiusNeighborsTransformer`, which transform input dataset + into a sparse neighbors graph. They give finer control on nearest neighbors + computations and enable easy pipeline caching for multiple use. + :issue:`10482` by `Tom Dupre la Tour`_. + +- |Feature| :class:`neighbors.KNeighborsClassifier`, + :class:`neighbors.KNeighborsRegressor`, + :class:`neighbors.RadiusNeighborsClassifier`, + :class:`neighbors.RadiusNeighborsRegressor`, and + :class:`neighbors.LocalOutlierFactor` now accept precomputed sparse + neighbors graph as input. :issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + - |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports predicting probabilities by using `predict_proba` and supports more outlier_label options: 'most_frequent', or different outlier_labels diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py new file mode 100644 index 0000000000000..2c10ac1b362e7 --- /dev/null +++ b/examples/ensemble/plot_stack_predictors.py @@ -0,0 +1,123 @@ +""" +================================= +Combine predictors using stacking +================================= + +Stacking refers to a method to blend estimators. In this strategy, some +estimators are individually fitted on some training data while a final +estimator is trained using the stacked predictions of these base estimators. + +In this example, we illustrate the use case in which different regressors are +stacked together and a final linear penalized regressor is used to output the +prediction. We compare the performance of each individual regressor with the +stacking strategy. 
Stacking slightly improves the overall performance. + +""" +print(__doc__) + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +############################################################################### +# The function ``plot_regression_results`` is used to plot the predicted and +# true targets. + +import matplotlib.pyplot as plt + + +def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): + """Scatter plot of the predicted vs true targets.""" + ax.plot([y_true.min(), y_true.max()], + [y_true.min(), y_true.max()], + '--r', linewidth=2) + ax.scatter(y_true, y_pred, alpha=0.2) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + ax.set_xlim([y_true.min(), y_true.max()]) + ax.set_ylim([y_true.min(), y_true.max()]) + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False, + edgecolor='none', linewidth=0) + ax.legend([extra], [scores], loc='upper left') + title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time) + ax.set_title(title) + + +############################################################################### +# Stack of predictors on a single data set +############################################################################### +# It is sometimes tedious to find the model which will best perform on a given +# dataset. Stacking provide an alternative by combining the outputs of several +# learners, without the need to choose a model specifically. The performance of +# stacking is usually close to the best model and sometimes it can outperform +# the prediction performance of each individual model. +# +# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor +# to combine their outputs together. + +from sklearn.ensemble import StackingRegressor +from sklearn.ensemble import RandomForestRegressor +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.linear_model import LassoCV +from sklearn.linear_model import RidgeCV + +estimators = [ + ('Random Forest', RandomForestRegressor(random_state=42)), + ('Lasso', LassoCV()), + ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0)) +] +stacking_regressor = StackingRegressor( + estimators=estimators, final_estimator=RidgeCV() +) + + +############################################################################### +# We used the Boston data set (prediction of house prices). We check the +# performance of each individual predictor as well as the stack of the +# regressors. 
+
+import time
+import numpy as np
+from sklearn.datasets import load_boston
+from sklearn.model_selection import cross_validate, cross_val_predict
+
+X, y = load_boston(return_X_y=True)
+
+fig, axs = plt.subplots(2, 2, figsize=(9, 7))
+axs = np.ravel(axs)
+
+for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
+                                               stacking_regressor)]):
+    start_time = time.time()
+    score = cross_validate(est, X, y,
+                           scoring=['r2', 'neg_mean_absolute_error'],
+                           n_jobs=-1, verbose=0)
+    elapsed_time = time.time() - start_time
+
+    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
+    plot_regression_results(
+        ax, y, y_pred,
+        name,
+        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
+        .format(np.mean(score['test_r2']),
+                np.std(score['test_r2']),
+                -np.mean(score['test_neg_mean_absolute_error']),
+                np.std(score['test_neg_mean_absolute_error'])),
+        elapsed_time)
+
+plt.suptitle('Single predictors versus stacked predictors')
+plt.tight_layout()
+plt.subplots_adjust(top=0.9)
+plt.show()
+
+###############################################################################
+# The stacked regressor will combine the strengths of the different
+# regressors. However, we also see that training the stacked regressor is
+# much more computationally expensive.
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
new file mode 100644
index 0000000000000..b24087f4cb593
--- /dev/null
+++ b/examples/neighbors/approximate_nearest_neighbors.py
@@ -0,0 +1,293 @@
+"""
+=====================================
+Approximate nearest neighbors in TSNE
+=====================================
+
+This example presents how to chain KNeighborsTransformer and TSNE in a
+pipeline, and how to wrap the packages `annoy` and `nmslib` to replace
+KNeighborsTransformer and perform approximate nearest neighbors.
+These packages can be installed with `pip install annoy nmslib`.
+
+Note: Currently TSNE(metric='precomputed') does not modify the precomputed
+distances, and thus assumes that precomputed euclidean distances are squared.
+In future versions, a parameter in TSNE will control the optional squaring of
+precomputed distances (see #12401).
+
+Note: In :class:`KNeighborsTransformer` we use the definition which includes
+each training point as its own neighbor in the count of `n_neighbors`, and for
+compatibility reasons, one extra neighbor is computed when
+`mode == 'distance'`. Please note that we do the same in the proposed wrappers.
+ +Sample output: + +Benchmarking on MNIST_2000: +--------------------------- +AnnoyTransformer: 0.583 sec +NMSlibTransformer: 0.321 sec +KNeighborsTransformer: 1.225 sec +TSNE with AnnoyTransformer: 4.903 sec +TSNE with NMSlibTransformer: 5.009 sec +TSNE with KNeighborsTransformer: 6.210 sec +TSNE with internal NearestNeighbors: 6.365 sec + +Benchmarking on MNIST_10000: +---------------------------- +AnnoyTransformer: 4.457 sec +NMSlibTransformer: 2.080 sec +KNeighborsTransformer: 30.680 sec +TSNE with AnnoyTransformer: 30.225 sec +TSNE with NMSlibTransformer: 43.295 sec +TSNE with KNeighborsTransformer: 64.845 sec +TSNE with internal NearestNeighbors: 64.984 sec +""" +# Author: Tom Dupre la Tour +# +# License: BSD 3 clause +import time +import sys + +try: + import annoy +except ImportError: + print("The package 'annoy' is required to run this example.") + sys.exit() + +try: + import nmslib +except ImportError: + print("The package 'nmslib' is required to run this example.") + sys.exit() + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import NullFormatter +from scipy.sparse import csr_matrix + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.neighbors import KNeighborsTransformer +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.datasets import fetch_openml +from sklearn.pipeline import make_pipeline +from sklearn.manifold import TSNE +from sklearn.utils import shuffle + +print(__doc__) + + +class NMSlibTransformer(TransformerMixin, BaseEstimator): + """Wrapper for using nmslib as sklearn's KNeighborsTransformer""" + + def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph', + n_jobs=1): + self.n_neighbors = n_neighbors + self.method = method + self.metric = metric + self.n_jobs = n_jobs + + def fit(self, X): + self.n_samples_fit_ = X.shape[0] + + # see more metric in the manual + # https://github.com/nmslib/nmslib/tree/master/manual + space = { + 'sqeuclidean': 'l2', + 'euclidean': 'l2', + 'cosine': 'cosinesimil', + 'l1': 'l1', + 'l2': 'l2', + }[self.metric] + + self.nmslib_ = nmslib.init(method=self.method, space=space) + self.nmslib_.addDataPointBatch(X) + self.nmslib_.createIndex() + return self + + def transform(self, X): + n_samples_transform = X.shape[0] + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. 
+ n_neighbors = self.n_neighbors + 1 + + results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, + num_threads=self.n_jobs) + indices, distances = zip(*results) + indices, distances = np.vstack(indices), np.vstack(distances) + + if self.metric == 'sqeuclidean': + distances **= 2 + + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, + n_neighbors) + kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), + indptr), shape=(n_samples_transform, + self.n_samples_fit_)) + + return kneighbors_graph + + +class AnnoyTransformer(TransformerMixin, BaseEstimator): + """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer""" + + def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10, + search_k=-1): + self.n_neighbors = n_neighbors + self.n_trees = n_trees + self.search_k = search_k + self.metric = metric + + def fit(self, X): + self.n_samples_fit_ = X.shape[0] + metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean' + self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric) + for i, x in enumerate(X): + self.annoy_.add_item(i, x.tolist()) + self.annoy_.build(self.n_trees) + return self + + def transform(self, X): + return self._transform(X) + + def fit_transform(self, X, y=None): + return self.fit(X)._transform(X=None) + + def _transform(self, X): + """As `transform`, but handles X is None for faster `fit_transform`.""" + + n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0] + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. + n_neighbors = self.n_neighbors + 1 + + indices = np.empty((n_samples_transform, n_neighbors), + dtype=np.int) + distances = np.empty((n_samples_transform, n_neighbors)) + + if X is None: + for i in range(self.annoy_.get_n_items()): + ind, dist = self.annoy_.get_nns_by_item( + i, n_neighbors, self.search_k, include_distances=True) + + indices[i], distances[i] = ind, dist + else: + for i, x in enumerate(X): + indices[i], distances[i] = self.annoy_.get_nns_by_vector( + x.tolist(), n_neighbors, self.search_k, + include_distances=True) + + if self.metric == 'sqeuclidean': + distances **= 2 + + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, + n_neighbors) + kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), + indptr), shape=(n_samples_transform, + self.n_samples_fit_)) + + return kneighbors_graph + + +def test_transformers(): + """Test that AnnoyTransformer and KNeighborsTransformer give same results + """ + X = np.random.RandomState(42).randn(10, 2) + + knn = KNeighborsTransformer() + Xt0 = knn.fit_transform(X) + + ann = AnnoyTransformer() + Xt1 = ann.fit_transform(X) + + nms = NMSlibTransformer() + Xt2 = nms.fit_transform(X) + + assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5) + assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5) + + +def load_mnist(n_samples): + """Load MNIST, shuffle the data, and return only n_samples.""" + mnist = fetch_openml(data_id=41063) + X, y = shuffle(mnist.data, mnist.target, random_state=42) + return X[:n_samples], y[:n_samples] + + +def run_benchmark(): + datasets = [ + ('MNIST_2000', load_mnist(n_samples=2000)), + ('MNIST_10000', load_mnist(n_samples=10000)), + ] + + n_iter = 500 + perplexity = 30 + # TSNE requires a certain number of neighbors which depends on the + # perplexity parameter. + # Add one since we include each sample as its own neighbor. + n_neighbors = int(3. 
* perplexity + 1) + 1
+
+    transformers = [
+        ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
+                                              metric='sqeuclidean')),
+        ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
+                                                metric='sqeuclidean')),
+        ('KNeighborsTransformer', KNeighborsTransformer(
+            n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
+        ('TSNE with AnnoyTransformer', make_pipeline(
+            AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with NMSlibTransformer', make_pipeline(
+            NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with KNeighborsTransformer', make_pipeline(
+            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
+                                  metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with internal NearestNeighbors',
+         TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut",
+              random_state=42, n_iter=n_iter)),
+    ]
+
+    # init the plot
+    nrows = len(datasets)
+    ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
+    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
+                             figsize=(5 * ncols, 4 * nrows))
+    axes = axes.ravel()
+    i_ax = 0
+
+    for dataset_name, (X, y) in datasets:
+
+        msg = 'Benchmarking on %s:' % dataset_name
+        print('\n%s\n%s' % (msg, '-' * len(msg)))
+
+        for transformer_name, transformer in transformers:
+            start = time.time()
+            Xt = transformer.fit_transform(X)
+            duration = time.time() - start
+
+            # print the duration report
+            longest = np.max([len(name) for name, model in transformers])
+            whitespaces = ' ' * (longest - len(transformer_name))
+            print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))
+
+            # plot TSNE embedding which should be very similar across methods
+            if 'TSNE' in transformer_name:
+                axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
+                axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y, alpha=0.2,
+                                   cmap=plt.cm.viridis)
+                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
+                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
+                axes[i_ax].axis('tight')
+                i_ax += 1
+
+    fig.tight_layout()
+    plt.show()
+
+
+if __name__ == '__main__':
+    test_transformers()
+    run_benchmark()
diff --git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py
new file mode 100644
index 0000000000000..9fecea09f6b78
--- /dev/null
+++ b/examples/neighbors/plot_caching_nearest_neighbors.py
@@ -0,0 +1,60 @@
+"""
+=========================
+Caching nearest neighbors
+=========================
+
+This example demonstrates how to precompute the k nearest neighbors before
+using them in KNeighborsClassifier. KNeighborsClassifier can compute the
+nearest neighbors internally, but precomputing them can have several benefits,
+such as finer parameter control, caching for multiple uses, or custom
+implementations.
+
+Here we use the caching property of pipelines to cache the nearest neighbors
+graph between multiple fits of KNeighborsClassifier. The first call is slow
+since it computes the neighbors graph, while subsequent calls are faster as they
+do not need to recompute the graph. 
Here the durations are small since the +dataset is small, but the gain can be more substantial when the dataset grows +larger, or when the grid of parameter to search is large. +""" +# Author: Tom Dupre la Tour +# +# License: BSD 3 clause +import matplotlib.pyplot as plt + +from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.datasets import load_digits +from sklearn.pipeline import Pipeline + +print(__doc__) + +X, y = load_digits(return_X_y=True) +n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9] + +# The transformer computes the nearest neighbors graph using the maximum number +# of neighbors necessary in the grid search. The classifier model filters the +# nearest neighbors graph as required by its own n_neighbors parameter. +graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), + mode='distance') +classifier_model = KNeighborsClassifier(metric='precomputed') + +# Note that we give `memory` a directory to cache the graph computation. +full_model = Pipeline( + steps=[('graph', graph_model), ('classifier', classifier_model)], + memory='./cache') + +param_grid = {'classifier__n_neighbors': n_neighbors_list} +grid_model = GridSearchCV(full_model, param_grid) +grid_model.fit(X, y) + +# Plot the results of the grid search. +fig, axes = plt.subplots(1, 2, figsize=(8, 4)) +axes[0].errorbar(x=n_neighbors_list, + y=grid_model.cv_results_['mean_test_score'], + yerr=grid_model.cv_results_['std_test_score']) +axes[0].set(xlabel='n_neighbors', title='Classification accuracy') +axes[1].errorbar(x=n_neighbors_list, y=grid_model.cv_results_['mean_fit_time'], + yerr=grid_model.cv_results_['std_fit_time'], color='r') +axes[1].set(xlabel='n_neighbors', title='Fit time (with caching)') +fig.tight_layout() +plt.show() diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 38d06d1c244b7..9de22673606f2 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -3,10 +3,10 @@ Neighborhood Components Analysis Illustration ============================================= -An example illustrating the goal of learning a distance metric that maximizes -the nearest neighbors classification accuracy. The example is solely for -illustration purposes. Please refer to the :ref:`User Guide ` for -more information. +This example illustrates a learned distance metric that maximizes +the nearest neighbors classification accuracy. It provides a visual +representation of this metric compared to the original point +space. Please refer to the :ref:`User Guide ` for more information. """ # License: BSD 3 clause @@ -20,23 +20,31 @@ print(__doc__) -random_state = 0 +############################################################################## +# Original points +# --------------- +# First we create a data set of 9 samples from 3 classes, and plot the points +# in the original space. For this example, we focus on the classification of +# point no. 3. The thickness of a link between point no. 3 and another point +# is proportional to their distance. 
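(Aside, not part of this patch.) The link thickness described in the comment block above is computed by the ``link_thickness_i`` helper further down in this file as a softmax over negative squared distances, so the weight of a link shrinks as the corresponding distance grows. Below is a minimal standalone sketch of that weighting; the name ``softmax_link_weights`` and the exact numerical stabilisation are illustrative assumptions, since the helper's full body is not shown in this hunk.

import numpy as np

def softmax_link_weights(X, i):
    # squared euclidean distances from point i to every other point
    diff = X[i] - X
    sq_dist = np.einsum('ij,ij->i', diff, diff)
    sq_dist[i] = np.inf                        # a point is not linked to itself
    logits = -sq_dist
    weights = np.exp(logits - logits.max())    # stabilised exponentiation
    return weights / weights.sum()             # weights sum to one
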
-# Create a tiny data set of 9 samples from 3 classes X, y = make_classification(n_samples=9, n_features=2, n_informative=2, n_redundant=0, n_classes=3, n_clusters_per_class=1, - class_sep=1.0, random_state=random_state) + class_sep=1.0, random_state=0) -# Plot the points in the original space -plt.figure() +plt.figure(1) ax = plt.gca() - -# Draw the graph nodes for i in range(X.shape[0]): ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) -def p_i(X, i): +ax.set_title("Original points") +ax.axes.get_xaxis().set_visible(False) +ax.axes.get_yaxis().set_visible(False) +ax.axis('equal') # so that boundaries are displayed correctly as circles + + +def link_thickness_i(X, i): diff_embedded = X[i] - X dist_embedded = np.einsum('ij,ij->i', diff_embedded, diff_embedded) @@ -52,34 +60,30 @@ def p_i(X, i): def relate_point(X, i, ax): pt_i = X[i] for j, pt_j in enumerate(X): - thickness = p_i(X, i) + thickness = link_thickness_i(X, i) if i != j: line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]]) ax.plot(*line, c=cm.Set1(y[j]), linewidth=5*thickness[j]) -# we consider only point 3 i = 3 - -# Plot bonds linked to sample i in the original space relate_point(X, i, ax) -ax.set_title("Original points") -ax.axes.get_xaxis().set_visible(False) -ax.axes.get_yaxis().set_visible(False) -ax.axis('equal') +plt.show() -# Learn an embedding with NeighborhoodComponentsAnalysis -nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) +############################################################################## +# Learning an embedding +# --------------------- +# We use :class:`~sklearn.neighbors.NeighborhoodComponentsAnalysis` to learn an +# embedding and plot the points after the transformation. We then take the +# embedding and find the nearest neighbors. + +nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=0) nca = nca.fit(X, y) -# Plot the points after transformation with NeighborhoodComponentsAnalysis -plt.figure() +plt.figure(2) ax2 = plt.gca() - -# Get the embedding and find the new nearest neighbors X_embedded = nca.transform(X) - relate_point(X_embedded, i, ax2) for i in range(len(X)): @@ -88,7 +92,6 @@ def relate_point(X, i, ax): ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) -# Make axes equal so that boundaries are displayed correctly as circles ax2.set_title("NCA embedding") ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_yaxis().set_visible(False) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index fbbd912592979..9b6467d170e70 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -51,8 +51,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square. X may be a sparse matrix, in which case only "nonzero" - elements may be considered neighbors for DBSCAN. + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional Additional keyword arguments for the metric function. @@ -172,8 +172,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square. 
X may be a sparse matrix, in which case only "nonzero" - elements may be considered neighbors for DBSCAN. + must be square. X may be a :term:`Glossary `, in which + case only "nonzero" elements may be considered neighbors for DBSCAN. .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. @@ -319,32 +319,20 @@ def fit(self, X, y=None, sample_weight=None): # point in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) if self.metric == 'precomputed' and sparse.issparse(X): - neighborhoods = np.empty(X.shape[0], dtype=object) - X.sum_duplicates() # XXX: modifies X's internals in-place - # set the diagonal to explicit values, as a point is its own # neighbor with warnings.catch_warnings(): warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning) X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place - X_mask = X.data <= self.eps - masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] - masked_indptr = np.concatenate(([0], np.cumsum(X_mask))) - masked_indptr = masked_indptr[X.indptr[1:-1]] - - # split into rows - neighborhoods[:] = np.split(masked_indices, masked_indptr) - else: - neighbors_model = NearestNeighbors( - radius=self.eps, algorithm=self.algorithm, - leaf_size=self.leaf_size, metric=self.metric, - metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs - ) - neighbors_model.fit(X) - # This has worst case O(n^2) memory complexity - neighborhoods = neighbors_model.radius_neighbors( - X, self.eps, return_distance=False) + neighbors_model = NearestNeighbors( + radius=self.eps, algorithm=self.algorithm, + leaf_size=self.leaf_size, metric=self.metric, + metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs) + neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity + neighborhoods = neighbors_model.radius_neighbors(X, + return_distance=False) if sample_weight is None: n_neighbors = np.array([len(neighbors) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 207dfaeb08974..588742613938d 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -13,7 +13,7 @@ from ..utils import check_random_state, as_float_array from ..utils.validation import check_array from ..metrics.pairwise import pairwise_kernels -from ..neighbors import kneighbors_graph +from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding from .k_means_ import k_means @@ -326,10 +326,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. Ignored for ``affinity='nearest_neighbors'``. - affinity : string, array-like or callable, default 'rbf' - If a string, this may be one of 'nearest_neighbors', 'precomputed', - 'rbf' or one of the kernels supported by - `sklearn.metrics.pairwise_kernels`. + affinity : string or callable, default 'rbf' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix using a radial basis function + (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. + - one of the kernels supported by + :func:`~sklearn.metrics.pairwise_kernels`. 
Only kernels that produce similarity scores (non-negative values that increase with similarity) should be used. This property is not checked @@ -468,7 +476,9 @@ def fit(self, X, y=None): """ X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2) - if X.shape[0] == X.shape[1] and self.affinity != "precomputed": + allow_squared = self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] + if X.shape[0] == X.shape[1] and not allow_squared: warnings.warn("The spectral clustering API has changed. ``fit``" "now constructs an affinity matrix from data. To use" " a custom affinity matrix, " @@ -479,6 +489,12 @@ def fit(self, X, y=None): include_self=True, n_jobs=self.n_jobs) self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == 'precomputed_nearest_neighbors': + estimator = NearestNeighbors(n_neighbors=self.n_neighbors, + n_jobs=self.n_jobs, + metric="precomputed").fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode='connectivity') + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) elif self.affinity == 'precomputed': self.affinity_matrix_ = X else: @@ -530,4 +546,5 @@ def fit_predict(self, X, y=None): @property def _pairwise(self): - return self.affinity == "precomputed" + return self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index e74120ecb9c03..17f85b4fb0fbf 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -95,6 +95,23 @@ def test_dbscan_sparse_precomputed(include_self): assert_array_equal(labels_dense, labels_sparse) +def test_dbscan_sparse_precomputed_different_eps(): + # test that precomputed neighbors graph is filtered if computed with + # a radius larger than DBSCAN's eps. 
+ lower_eps = 0.2 + nn = NearestNeighbors(radius=lower_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode='distance') + dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed') + + higher_eps = lower_eps + 0.7 + nn = NearestNeighbors(radius=higher_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode='distance') + dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed') + + assert_array_equal(dbscan_lower[0], dbscan_higher[0]) + assert_array_equal(dbscan_lower[1], dbscan_higher[1]) + + @pytest.mark.parametrize('use_sparse', [True, False]) @pytest.mark.parametrize('metric', ['precomputed', 'minkowski']) def test_dbscan_input_not_modified(use_sparse, metric): diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 9ea9cfa7df9b8..dd1a1227a8f09 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -17,6 +17,7 @@ from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel +from sklearn.neighbors import NearestNeighbors from sklearn.datasets.samples_generator import make_blobs try: @@ -102,6 +103,25 @@ def test_spectral_clustering_sparse(): assert adjusted_rand_score(y, labels) == 1 +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + X, y = make_blobs(n_samples=200, random_state=0, + centers=[[1, 1], [-1, -1]], cluster_std=0.01) + + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors( + n_neighbors=n_neighbors + additional_neighbors).fit(X) + graph = nn.kneighbors_graph(X, mode='connectivity') + labels = SpectralClustering(random_state=0, n_clusters=2, + affinity='precomputed_nearest_neighbors', + n_neighbors=n_neighbors).fit(graph).labels_ + results.append(labels) + + assert_array_equal(results[0], results[1]) + + def test_affinities(): # Note: in the following, random_state has been selected to have # a dataset that yields a stable eigen decomposition both when built diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index f42eb9d83798c..d7e5ee52db7eb 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -91,7 +91,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, n_clusters_per_class : int, optional (default=2) The number of clusters per class. - weights : list of floats or None (default=None) + weights : array-like of shape (n_classes,) or (n_classes - 1,),\ + (default=None) The proportions of samples assigned to each class. If None, then classes are balanced. Note that if ``len(weights) == n_classes - 1``, then the last class weight is automatically inferred. 
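(Aside, not part of this patch.) A minimal sketch of how the ``weights`` parameter behaves with the change in the hunk below: lists and ndarrays are both accepted, a length of ``n_classes - 1`` lets the last class weight be inferred, and an incompatible length raises ``ValueError``. The class counts quoted in the comments are approximate because of the default ``flip_y``.

import numpy as np
from collections import Counter
from sklearn.datasets import make_classification

# len(weights) == n_classes - 1: the last class weight is inferred (here 0.5)
X, y = make_classification(n_samples=1000, n_classes=3, n_informative=3,
                           weights=[0.2, 0.3], random_state=0)
print(Counter(y))            # roughly 200 / 300 / 500 samples per class

# an ndarray with the same values yields the same draw as the list
X2, y2 = make_classification(n_samples=1000, n_classes=3, n_informative=3,
                             weights=np.array([0.2, 0.3]), random_state=0)
assert np.array_equal(y, y2)

# an incompatible length still raises the documented error
try:
    make_classification(weights=[0.2, 0.3, 0.4, 0.1])
except ValueError as exc:
    print(exc)               # Weights specified but incompatible with ...
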
@@ -160,22 +161,27 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, " features") # Use log2 to avoid overflow errors if n_informative < np.log2(n_classes * n_clusters_per_class): - raise ValueError("n_classes * n_clusters_per_class must" - " be smaller or equal 2 ** n_informative") - if weights and len(weights) not in [n_classes, n_classes - 1]: - raise ValueError("Weights specified but incompatible with number " - "of classes.") + msg = "n_classes({}) * n_clusters_per_class({}) must be" + msg += " smaller or equal 2**n_informative({})={}" + raise ValueError(msg.format(n_classes, n_clusters_per_class, + n_informative, 2**n_informative)) + + if weights is not None: + if len(weights) not in [n_classes, n_classes - 1]: + raise ValueError("Weights specified but incompatible with number " + "of classes.") + if len(weights) == n_classes - 1: + if isinstance(weights, list): + weights = weights + [1.0 - sum(weights)] + else: + weights = np.resize(weights, n_classes) + weights[-1] = 1.0 - sum(weights[:-1]) + else: + weights = [1.0 / n_classes] * n_classes n_useless = n_features - n_informative - n_redundant - n_repeated n_clusters = n_classes * n_clusters_per_class - if weights and len(weights) == (n_classes - 1): - weights = weights + [1.0 - sum(weights)] - - if weights is None: - weights = [1.0 / n_classes] * n_classes - weights[-1] = 1.0 - sum(weights[:-1]) - # Distribute samples among clusters by weight n_samples_per_cluster = [ int(n_samples * weights[k % n_classes] / n_clusters_per_class) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index ecd7e7cba1ef1..f10fd54dc681e 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -146,6 +146,36 @@ def test_make_classification_informative_features(): n_clusters_per_class=2) +@pytest.mark.parametrize( + 'weights, err_type, err_msg', + [ + ([], ValueError, + "Weights specified but incompatible with number of classes."), + ([.25, .75, .1], ValueError, + "Weights specified but incompatible with number of classes."), + (np.array([]), ValueError, + "Weights specified but incompatible with number of classes."), + (np.array([.25, .75, .1]), ValueError, + "Weights specified but incompatible with number of classes."), + (np.random.random(3), ValueError, + "Weights specified but incompatible with number of classes.") + ] +) +def test_make_classification_weights_type(weights, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + make_classification(weights=weights) + + +@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}]) +def test_make_classification_weights_array_or_list_ok(kwargs): + X1, y1 = make_classification(weights=[.1, .9], + random_state=0, **kwargs) + X2, y2 = make_classification(weights=np.array([.1, .9]), + random_state=0, **kwargs) + assert_almost_equal(X1, X2) + assert_almost_equal(y1, y2) + + def test_make_multilabel_classification_return_sequences(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=100, n_features=20, diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index e8e8f46e2dec1..3eadb76b9f744 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -18,6 +18,8 @@ from .gradient_boosting import GradientBoostingRegressor from .voting import VotingClassifier from .voting import VotingRegressor +from ._stacking import StackingClassifier 
+from ._stacking import StackingRegressor from . import bagging from . import forest @@ -32,5 +34,6 @@ "BaggingRegressor", "IsolationForest", "GradientBoostingClassifier", "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "VotingRegressor", + "StackingClassifier", "StackingRegressor", "bagging", "forest", "gradient_boosting", "partial_dependence", "weight_boosting"] diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py new file mode 100644 index 0000000000000..c2a09c54b4622 --- /dev/null +++ b/sklearn/ensemble/_stacking.py @@ -0,0 +1,704 @@ +"""Stacking classifier and regressor.""" + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from copy import deepcopy + +import numpy as np +from joblib import Parallel, delayed + +from ..base import clone +from ..base import ClassifierMixin, RegressorMixin, TransformerMixin +from ..base import is_classifier, is_regressor +from ..base import MetaEstimatorMixin + +from .base import _parallel_fit_estimator + +from ..linear_model import LogisticRegression +from ..linear_model import RidgeCV + +from ..model_selection import cross_val_predict +from ..model_selection import check_cv + +from ..preprocessing import LabelEncoder + +from ..utils import Bunch +from ..utils.metaestimators import _BaseComposition +from ..utils.metaestimators import if_delegate_has_method +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted +from ..utils.validation import column_or_1d + + +class _BaseStacking(TransformerMixin, MetaEstimatorMixin, _BaseComposition, + metaclass=ABCMeta): + """Base class for stacking method.""" + _required_parameters = ['estimators'] + + @abstractmethod + def __init__(self, estimators, final_estimator=None, cv=None, + stack_method='auto', n_jobs=None, verbose=0): + self.estimators = estimators + self.final_estimator = final_estimator + self.cv = cv + self.stack_method = stack_method + self.n_jobs = n_jobs + self.verbose = verbose + + @abstractmethod + def _validate_estimators(self): + if self.estimators is None or len(self.estimators) == 0: + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a list" + " of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + self._validate_names(names) + return names, estimators + + def _clone_final_estimator(self, default): + if self.final_estimator is not None: + self.final_estimator_ = clone(self.final_estimator) + else: + self.final_estimator_ = clone(default) + + def set_params(self, **params): + """Set the parameters for the stacking estimator. + + Valid parameter keys can be listed with `get_params()`. + + Parameters + ---------- + params : keyword arguments + Specific parameters using e.g. + `set_params(parameter_name=new_value)`. In addition, to setting the + parameters of the stacking estimator, the individual estimator of + the stacking estimators can also be set, or can be removed by + setting them to 'drop'. + + Examples + -------- + # In this example, the RandomForestClassifier is removed + clf1 = LogisticRegression() + clf2 = RandomForestClassifier() + eclf = StackingClassifier(estimators=[('lr', clf1), ('rf', clf2)] + eclf.set_params(rf='drop') + """ + super()._set_params('estimators', **params) + return self + + def get_params(self, deep=True): + """Get the parameters of the stacking estimator. 
+
+        Parameters
+        ----------
+        deep : bool
+            Setting it to True gets the various classifiers and the parameters
+            of the classifiers as well.
+        """
+        return super()._get_params('estimators', deep=deep)
+
+    def _concatenate_predictions(self, predictions):
+        """Concatenate the predictions of each first layer learner.
+
+        This helper is in charge of ensuring the predictions are 2D arrays
+        and it will drop one of the probability columns when using
+        probabilities in the binary case, since p(y|c=0) = 1 - p(y|c=1).
+        """
+        X_meta = []
+        for est_idx, preds in enumerate(predictions):
+            # case where the estimator returned a 1D array
+            if preds.ndim == 1:
+                X_meta.append(preds.reshape(-1, 1))
+            else:
+                if (self.stack_method_[est_idx] == 'predict_proba' and
+                        len(self.classes_) == 2):
+                    # Remove the first column when using probabilities in
+                    # binary classification because both features are perfectly
+                    # collinear.
+                    X_meta.append(preds[:, 1:])
+                else:
+                    X_meta.append(preds)
+        return np.concatenate(X_meta, axis=1)
+
+    @staticmethod
+    def _method_name(name, estimator, method):
+        if estimator == 'drop':
+            return None
+        if method == 'auto':
+            if getattr(estimator, 'predict_proba', None):
+                return 'predict_proba'
+            elif getattr(estimator, 'decision_function', None):
+                return 'decision_function'
+            else:
+                return 'predict'
+        else:
+            if not hasattr(estimator, method):
+                raise ValueError('Underlying estimator {} does not implement '
+                                 'the method {}.'.format(name, method))
+            return method
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit the estimators.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples,) or None
+            Sample weights. If None, then samples are equally weighted.
+            Note that this is supported only if all underlying estimators
+            support sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        # all_estimators contains all estimators, both the ones to be fitted
+        # and the 'drop' placeholders.
+        names, all_estimators = self._validate_estimators()
+        self._validate_final_estimator()
+
+        has_estimator = any(est != 'drop' for est in all_estimators)
+        if not has_estimator:
+            raise ValueError(
+                "All estimators are dropped. At least one is required "
+                "to be an estimator."
+            )
+
+        stack_method = [self.stack_method] * len(all_estimators)
+
+        # Fit the base estimators on the whole training data. Those
+        # base estimators will be used in transform, predict, and
+        # predict_proba. They are exposed publicly.
+        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
+            delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight)
+            for est in all_estimators if est != 'drop'
+        )
+
+        self.named_estimators_ = Bunch()
+        est_fitted_idx = 0
+        for name_est, org_est in zip(names, all_estimators):
+            if org_est != 'drop':
+                self.named_estimators_[name_est] = self.estimators_[
+                    est_fitted_idx]
+                est_fitted_idx += 1
+
+        # To train the meta-classifier using as much data as possible, we use
+        # cross-validation to obtain the output of the stacked estimators.
+
+        # To ensure that the data provided to each estimator are the same, we
+        # need to set the random state of the cv if there is one and we need to
+        # take a copy.
+ cv = check_cv(self.cv, y=y, classifier=is_classifier(self)) + if hasattr(cv, 'random_state') and cv.random_state is None: + cv.random_state = np.random.RandomState() + + self.stack_method_ = [ + self._method_name(name, est, meth) + for name, est, meth in zip(names, all_estimators, stack_method) + ] + + predictions = Parallel(n_jobs=self.n_jobs)( + delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), + method=meth, n_jobs=self.n_jobs, + verbose=self.verbose) + for est, meth in zip(all_estimators, self.stack_method_) + if est != 'drop' + ) + + # Only not None or not 'drop' estimators will be used in transform. + # Remove the None from the method as well. + self.stack_method_ = [ + meth for (meth, est) in zip(self.stack_method_, all_estimators) + if est != 'drop' + ] + + X_meta = self._concatenate_predictions(predictions) + if sample_weight is not None: + try: + self.final_estimator_.fit( + X_meta, y, sample_weight=sample_weight + ) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample " + "weights." + .format(self.final_estimator_.__class__.__name__) + ) from exc + raise + else: + self.final_estimator_.fit(X_meta, y) + + return self + + def _transform(self, X): + """Concatenate and return the predictions of the estimators.""" + check_is_fitted(self) + predictions = [ + getattr(est, meth)(X) + for est, meth in zip(self.estimators_, self.stack_method_) + if est != 'drop' + ] + return self._concatenate_predictions(predictions) + + @if_delegate_has_method(delegate='final_estimator_') + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + accounts for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + + check_is_fitted(self) + return self.final_estimator_.predict( + self.transform(X), **predict_params + ) + + +class StackingClassifier(ClassifierMixin, _BaseStacking): + """Stack of estimators with a final classifier. + + Stacked generalization consists in stacking the output of individual + estimator and use a classifier to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + .. versionadded:: 0.22 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A classifier which will be used to combine the base estimators. + The default classifier is a `LogisticRegression`. 
+ + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, `StratifiedKFold` is used. In all other + cases, `KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \ + default='auto' + Methods called for each base estimator. It can be: + + * if 'auto', it will try to invoke, for each estimator, + `'predict_proba'`, `'decision_function'` or `'predict'` in that + order. + * otherwise, one of `'predict_proba'`, `'decision_function'` or + `'predict'`. If the method is not implemented by the estimator, it + will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel all `estimators` `fit`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. + + named_estimators_ : Bunch + Attribute to access any fitted sub-estimators by name. + + final_estimator_ : estimator + The classifier which predicts given the output of `estimators_`. + + stack_method_ : list of str + The method used by each base estimator. + + Notes + ----- + When `predict_proba` is used by each estimator (i.e. most of the time for + `stack_method='auto'` or specifically for `stack_method='predict_proba'`), + The first column predicted by each estimator will be dropped in the case + of a binary classification problem. Indeed, both feature will be perfectly + collinear. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.svm import LinearSVC + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.ensemble import StackingClassifier + >>> X, y = load_iris(return_X_y=True) + >>> estimators = [ + ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), + ... ('svr', make_pipeline(StandardScaler(), + ... LinearSVC(random_state=42))) + ... ] + >>> clf = StackingClassifier( + ... estimators=estimators, final_estimator=LogisticRegression() + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> clf.fit(X_train, y_train).score(X_test, y_test) + 0.9... 
+ + """ + def __init__(self, estimators, final_estimator=None, cv=None, + stack_method='auto', n_jobs=None, verbose=0): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method=stack_method, + n_jobs=n_jobs, + verbose=verbose + ) + + def _validate_estimators(self): + names, estimators = super()._validate_estimators() + for est in estimators: + if est != 'drop' and not is_classifier(est): + raise ValueError( + "The estimator {} should be a classifier." + .format(est.__class__.__name__) + ) + return names, estimators + + def _validate_final_estimator(self): + self._clone_final_estimator(default=LogisticRegression()) + if not is_classifier(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a classifier. Got {}" + .format(self.final_estimator_) + ) + + def fit(self, X, y, sample_weight=None): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if all underlying estimators + support sample weights. + + Returns + ------- + self : object + """ + check_classification_targets(y) + self._le = LabelEncoder().fit(y) + self.classes_ = self._le.classes_ + return super().fit(X, self._le.transform(y), sample_weight) + + @if_delegate_has_method(delegate='final_estimator_') + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + accounts for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + y_pred = super().predict(X, **predict_params) + return self._le.inverse_transform(y_pred) + + @if_delegate_has_method(delegate='final_estimator_') + def predict_proba(self, X): + """Predict class probabilities for X using + `final_estimator_.predict_proba`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) or \ + list of ndarray of shape (n_output,) + The class probabilities of the input samples. + """ + check_is_fitted(self) + return self.final_estimator_.predict_proba(self.transform(X)) + + @if_delegate_has_method(delegate='final_estimator_') + def decision_function(self, X): + """Predict decision function for samples in X using + `final_estimator_.decision_function`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. 
+ + Returns + ------- + decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \ + or (n_samples, n_classes * (n_classes-1) / 2) + The decision function computed the final estimator. + """ + check_is_fitted(self) + return self.final_estimator_.decision_function(self.transform(X)) + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) or \ + (n_samples, n_classes * n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + +class StackingRegressor(RegressorMixin, _BaseStacking): + """Stack of estimators with a final regressor. + + Stacked generalization consists in stacking the output of individual + estimator and use a regressor to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + .. versionadded:: 0.22 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A regressor which will be used to combine the base estimators. + The default regressor is a `RidgeCV`. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, `StratifiedKFold` is used. In all other + cases, `KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + estimators_ : list of estimator + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. + + named_estimators_ : Bunch + Attribute to access any fitted sub-estimators by name. + + final_estimator_ : estimator + The regressor to stacked the base estimators fitted. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. 
+ + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> from sklearn.svm import LinearSVR + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> estimators = [ + ... ('lr', RidgeCV()), + ... ('svr', LinearSVR(random_state=42)) + ... ] + >>> reg = StackingRegressor( + ... estimators=estimators, + ... final_estimator=RandomForestRegressor(n_estimators=10, + ... random_state=42) + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=42 + ... ) + >>> reg.fit(X_train, y_train).score(X_test, y_test) + 0.3... + + """ + def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None, + verbose=0): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method="predict", + n_jobs=n_jobs, + verbose=verbose + ) + + def _validate_estimators(self): + names, estimators = super()._validate_estimators() + for est in estimators: + if est != 'drop' and not is_regressor(est): + raise ValueError( + "The estimator {} should be a regressor." + .format(est.__class__.__name__) + ) + return names, estimators + + def _validate_final_estimator(self): + self._clone_final_estimator(default=RidgeCV()) + if not is_regressor(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a regressor. Got {}" + .format(self.final_estimator_) + ) + + def fit(self, X, y, sample_weight=None): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if all underlying estimators + support sample weights. + + Returns + ------- + self : object + """ + y = column_or_1d(y, warn=True) + return super().fit(X, y, sample_weight) + + def transform(self, X): + """Return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py index 36c7b1067c381..b266c38bcbfaa 100644 --- a/sklearn/ensemble/base.py +++ b/sklearn/ensemble/base.py @@ -19,6 +19,23 @@ MAX_RAND_SEED = np.iinfo(np.int32).max +def _parallel_fit_estimator(estimator, X, y, sample_weight=None): + """Private function used to fit an estimator within a job.""" + if sample_weight is not None: + try: + estimator.fit(X, y, sample_weight=sample_weight) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample weights." 
+ .format(estimator.__class__.__name__) + ) from exc + raise + else: + estimator.fit(X, y) + return estimator + + def _set_random_states(estimator, random_state=None): """Sets fixed random_state parameters for an estimator diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py new file mode 100644 index 0000000000000..3a61456a5665e --- /dev/null +++ b/sklearn/ensemble/tests/test_stacking.py @@ -0,0 +1,492 @@ +"""Test the stacking classifier and regressor.""" + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +import pytest +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import RegressorMixin +from sklearn.base import clone + +from sklearn.exceptions import ConvergenceWarning + +from sklearn.datasets import load_iris +from sklearn.datasets import load_diabetes +from sklearn.datasets import load_breast_cancer + +from sklearn.dummy import DummyClassifier +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LinearRegression +from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVR +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor +from sklearn.preprocessing import scale + +from sklearn.ensemble import StackingClassifier +from sklearn.ensemble import StackingRegressor + +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import KFold + +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import ignore_warnings +from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.estimator_checks import check_no_attributes_set_in_init + +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_iris, y_iris = load_iris(return_X_y=True) + + +@pytest.mark.parametrize( + "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator", [None, RandomForestClassifier(random_state=42)] +) +def test_stacking_classifier_iris(cv, final_estimator): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())] + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator, cv=cv + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 6 + + clf.set_params(lr='drop') + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + if final_estimator is None: + # LogisticRegression has decision_function method + clf.decision_function(X_test) + + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 3 + + +def test_stacking_classifier_drop_column_binary_classification(): + # check that a column is dropped in binary classification + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, _ = train_test_split( + scale(X), y, stratify=y, random_state=42 + ) + + # both classifiers implement 'predict_proba' and will both drop one column + estimators = 
[('lr', LogisticRegression()), + ('rf', RandomForestClassifier(random_state=42))] + clf = StackingClassifier(estimators=estimators, cv=3) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + # LinearSVC does not implement 'predict_proba' and will not drop one column + estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())] + clf.set_params(estimators=estimators) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + +def test_stacking_classifier_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=[('svc', LinearSVC(random_state=0))], + final_estimator=rf, cv=5 + ) + clf_drop = StackingClassifier( + estimators=estimators, final_estimator=rf, cv=5 + ) + + clf.fit(X_train, y_train) + clf_drop.fit(X_train, y_train) + assert_allclose(clf.predict(X_test), clf_drop.predict(X_test)) + assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test)) + assert_allclose(clf.transform(X_test), clf_drop.transform(X_test)) + + +def test_stacking_regressor_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + reg = StackingRegressor( + estimators=[('svr', LinearSVR(random_state=0))], + final_estimator=rf, cv=5 + ) + reg_drop = StackingRegressor( + estimators=estimators, final_estimator=rf, cv=5 + ) + + reg.fit(X_train, y_train) + reg_drop.fit(X_train, y_train) + assert_allclose(reg.predict(X_test), reg_drop.predict(X_test)) + assert_allclose(reg.transform(X_test), reg_drop.transform(X_test)) + + +@pytest.mark.parametrize( + "cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator, predict_params", + [(None, {}), + (RandomForestRegressor(random_state=42), {}), + (DummyRegressor(), {'return_std': True})] +) +def test_stacking_regressor_diabetes(cv, final_estimator, predict_params): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [('lr', LinearRegression()), ('svr', LinearSVR())] + reg = StackingRegressor( + estimators=estimators, final_estimator=final_estimator, cv=cv + ) + reg.fit(X_train, y_train) + result = reg.predict(X_test, **predict_params) + expected_result_length = 2 if predict_params else 1 + if predict_params: + assert len(result) == expected_result_length + + X_trans = reg.transform(X_test) + assert X_trans.shape[1] == 2 + + reg.set_params(lr='drop') + reg.fit(X_train, y_train) + reg.predict(X_test) + + X_trans = reg.transform(X_test) + assert X_trans.shape[1] == 1 + + +def test_stacking_classifier_drop_binary_prob(): + # check that classifier will drop one of the probability column for + # binary classification problem + + # Select only the 2 first classes + X_, y_ = scale(X_iris[:100]), y_iris[:100] + + estimators = [ + ('lr', 
LogisticRegression()), ('rf', RandomForestClassifier()) + ] + clf = StackingClassifier(estimators=estimators) + clf.fit(X_, y_) + X_meta = clf.transform(X_) + assert X_meta.shape[1] == 2 + + +class NoWeightRegressor(BaseEstimator, RegressorMixin): + def fit(self, X, y): + self.reg = DummyRegressor() + return self.reg.fit(X, y) + + def predict(self, X): + return np.ones(X.shape[0]) + + +class NoWeightClassifier(BaseEstimator, ClassifierMixin): + def fit(self, X, y): + self.clf = DummyClassifier() + return self.clf.fit(X, y) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [(y_iris, + {'estimators': None}, + ValueError, "Invalid 'estimators' attribute,"), + (y_iris, + {'estimators': []}, + ValueError, "Invalid 'estimators' attribute,"), + (y_iris, + {'estimators': [('lr', LinearRegression()), + ('svm', LinearSVC(max_iter=5e4))]}, + ValueError, 'should be a classifier'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('svm', SVC(max_iter=5e4))], + 'stack_method': 'predict_proba'}, + ValueError, 'does not implement the method predict_proba'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('cor', NoWeightClassifier())]}, + TypeError, 'does not support sample weight'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('cor', LinearSVC(max_iter=5e4))], + 'final_estimator': NoWeightClassifier()}, + TypeError, 'does not support sample weight'), + (y_iris, + {'estimators': [('lr', 'drop'), ('svm', 'drop')]}, + ValueError, 'All estimators are dropped'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), ('svm', LinearSVC())], + 'final_estimator': RandomForestRegressor()}, + ValueError, 'parameter should be a classifier.')] +) +def test_stacking_classifier_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + clf = StackingClassifier(**params, cv=3) + clf.fit( + scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]) + ) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [(y_diabetes, + {'estimators': None}, + ValueError, "Invalid 'estimators' attribute,"), + (y_diabetes, + {'estimators': []}, + ValueError, "Invalid 'estimators' attribute,"), + (y_diabetes, + {'estimators': [('lr', LogisticRegression()), ('svm', LinearSVR())]}, + ValueError, 'should be a regressor'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), + ('cor', NoWeightRegressor())]}, + TypeError, 'does not support sample weight'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), + ('cor', LinearSVR())], + 'final_estimator': NoWeightRegressor()}, + TypeError, 'does not support sample weight'), + (y_diabetes, + {'estimators': [('lr', 'drop'), ('svm', 'drop')]}, + ValueError, 'All estimators are dropped'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), ('svm', LinearSVR())], + 'final_estimator': RandomForestClassifier()}, + ValueError, 'parameter should be a regressor.')] +) +def test_stacking_regressor_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + reg = StackingRegressor(**params, cv=3) + reg.fit( + scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0]) + ) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(max_iter=1e4))])] +) +def test_stacking_named_estimators(stacking_estimator): + stacking_estimator.fit(scale(X_iris), y_iris) + estimators = 
stacking_estimator.named_estimators_ + assert len(estimators) == 2 + assert sorted(list(estimators.keys())) == sorted(['lr', 'svm']) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('rf', RandomForestClassifier()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('rf', RandomForestRegressor()), + ('svm', LinearSVR(max_iter=1e4))])] +) +def test_stacking_named_estimators_dropped(stacking_estimator): + stacking_estimator.set_params(rf='drop') + stacking_estimator.fit(scale(X_iris), y_iris) + estimators = stacking_estimator.named_estimators_ + assert 'rf' not in estimators.keys() + assert len(estimators) == 2 + assert sorted(list(estimators.keys())) == sorted(['lr', 'svm']) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('svm', LinearSVR())])] +) +def test_stacking_set_get_params(stacking_estimator): + params = stacking_estimator.get_params() + assert 'lr' in list(params.keys()) + assert 'svm' in list(params.keys()) + + stacking_estimator.set_params(lr='drop') + params = stacking_estimator.get_params() + assert params['lr'] == 'drop' + + +@pytest.mark.parametrize( + "estimator, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('svm', LinearSVC(random_state=0))]), + X_iris[:100], y_iris[:100]), # keep only classes 0 and 1 + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=0))]), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_randomness(estimator, X, y): + # checking that fixing the random state of the CV will lead to the same + # results + estimator_full = clone(estimator) + estimator_full.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + estimator_drop = clone(estimator) + estimator_drop.set_params(lr='drop') + estimator_drop.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + assert_allclose( + estimator_full.fit(X, y).transform(X)[:, 1:], + estimator_drop.fit(X, y).transform(X) + ) + + +# These warnings are raised due to _BaseComposition +@pytest.mark.filterwarnings("ignore:TypeError occurred during set_params") +@pytest.mark.filterwarnings("ignore:Estimator's parameters changed after") +@pytest.mark.parametrize( + "estimator", + [StackingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))]), + StackingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))])], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_check_estimators_stacking_estimator(estimator): + check_estimator(estimator) + check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) + + +def test_stacking_classifier_stratify_default(): + # check that we stratify the classes for the default CV + clf = StackingClassifier( + estimators=[('lr', LogisticRegression(max_iter=1e4)), + ('svm', LinearSVC(max_iter=1e4))] + ) + # since iris is not shuffled, a simple k-fold would not contain the + # 3 classes during training + clf.fit(X_iris, y_iris) + + +@pytest.mark.parametrize( + "stacker, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC(random_state=42))], + 
final_estimator=LogisticRegression(), + cv=KFold(shuffle=True, random_state=42)), + *load_breast_cancer(return_X_y=True)), + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=42))], + final_estimator=LinearRegression(), + cv=KFold(shuffle=True, random_state=42)), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_with_sample_weight(stacker, X, y): + # check that sample weights has an influence on the fitting + # note: ConvergenceWarning are catch since we are not worrying about the + # convergence here + n_half_samples = len(y) // 2 + total_sample_weight = np.array( + [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples) + ) + X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split( + X, y, total_sample_weight, random_state=42 + ) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train) + y_pred_no_weight = stacker.predict(X_test) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape)) + y_pred_unit_weight = stacker.predict(X_test) + + assert_allclose(y_pred_no_weight, y_pred_unit_weight) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=sample_weight_train) + y_pred_biased = stacker.predict(X_test) + + assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize( + "stacker, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC(random_state=42))], + final_estimator=LogisticRegression()), + *load_breast_cancer(return_X_y=True)), + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=42))], + final_estimator=LinearRegression()), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_cv_influence(stacker, X, y): + # check that the stacking affects the fit of the final estimator but not + # the fit of the base estimators + # note: ConvergenceWarning are catch since we are not worrying about the + # convergence here + stacker_cv_3 = clone(stacker) + stacker_cv_5 = clone(stacker) + + stacker_cv_3.set_params(cv=3) + stacker_cv_5.set_params(cv=5) + + stacker_cv_3.fit(X, y) + stacker_cv_5.fit(X, y) + + # the base estimators should be identical + for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, + stacker_cv_5.estimators_): + assert_allclose(est_cv_3.coef_, est_cv_5.coef_) + + # the final estimator should be different + with pytest.raises(AssertionError, match='Not equal'): + assert_allclose(stacker_cv_3.final_estimator_.coef_, + stacker_cv_5.final_estimator_.coef_) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index bbfb91751726a..e2fce1eb2e918 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -328,7 +328,7 @@ def test_sample_weight(): voting='soft') msg = ('Underlying estimator KNeighborsClassifier does not support ' 'sample weights.') - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): eclf3.fit(X, y, sample_weight) # check that _parallel_fit_estimator will raise the right error @@ -524,7 +524,7 @@ def test_none_estimator_with_weights(X, y, voter, drop): ids=['VotingRegressor', 'VotingClassifier'] ) def test_check_estimators_voting_estimator(estimator): - # FIXME: to be 
removed when meta-estimators can be specified themselves + # FIXME: to be removed when meta-estimators can specified themselves # their testing parameters (for required parameters). check_estimator(estimator) check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index c7bdac82c7c62..dbc8a2b7bff93 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -23,6 +23,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from .base import _parallel_fit_estimator from ..preprocessing import LabelEncoder from ..utils import Bunch from ..utils.validation import check_is_fitted @@ -32,23 +33,6 @@ from ..exceptions import NotFittedError -def _parallel_fit_estimator(estimator, X, y, sample_weight=None): - """Private function used to fit an estimator within a job.""" - if sample_weight is not None: - try: - estimator.fit(X, y, sample_weight=sample_weight) - except TypeError as exc: - if "unexpected keyword argument 'sample_weight'" in str(exc): - raise ValueError( - "Underlying estimator {} does not support sample weights." - .format(estimator.__class__.__name__) - ) from exc - raise - else: - estimator.fit(X, y) - return estimator - - class _BaseVoting(TransformerMixin, _BaseComposition): """Base class for voting. diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index b3ee42eaef8a3..676d3676fb8c1 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -13,24 +13,21 @@ cdef float EPSILON_DBL = 1e-8 cdef float PERPLEXITY_TOLERANCE = 1e-5 cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( - np.ndarray[np.float32_t, ndim=2] affinities, - np.ndarray[np.int64_t, ndim=2] neighbors, + np.ndarray[np.float32_t, ndim=2] sqdistances, float desired_perplexity, int verbose): """Binary search for sigmas of conditional Gaussians. This approximation reduces the computational complexity from O(N^2) to - O(uN). See the exact method '_binary_search_perplexity' for more details. + O(uN). Parameters ---------- - affinities : array-like, shape (n_samples, k) - Distances between training samples and its k nearest neighbors. - - neighbors : array-like, shape (n_samples, k) or None - Each row contains the indices to the k nearest neigbors. If this - array is None, then the perplexity is estimated over all data - not just the nearest neighbors. + sqdistances : array-like, shape (n_samples, n_neighbors) + Distances between training samples and their k nearest neighbors. + When using the exact method, this is a square (n_samples, n_samples) + distance matrix. The TSNE default metric is "euclidean" which is + interpreted as squared euclidean distance. desired_perplexity : float Desired perplexity (2^entropy) of the conditional Gaussians. 
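The signature change above means callers of _binary_search_perplexity now pass a dense (n_samples, n_neighbors) block of squared distances instead of a separate neighbors array. Below is an illustrative sketch (not part of the patch) of that layout, mirroring what _joint_probabilities_nn does later in this series; the dataset, n_neighbors and the variable names are assumptions, and the final call is left commented out because _binary_search_perplexity is a private Cython helper.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).randn(50, 5).astype(np.float32)
n_neighbors = 10

# Build a sparse k-NN distance graph; every row stores exactly n_neighbors
# entries, so the CSR data can be viewed as a dense (n_samples, n_neighbors)
# block once the indices are sorted.
nn = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
graph = nn.kneighbors_graph(mode='distance')
graph.sort_indices()

# The default TSNE metric is euclidean, interpreted as *squared* euclidean.
sqdistances = graph.data.reshape(X.shape[0], n_neighbors) ** 2
sqdistances = sqdistances.astype(np.float32, copy=False)

# P = _binary_search_perplexity(sqdistances, desired_perplexity=25.0, verbose=0)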
@@ -46,7 +43,9 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( # Maximum number of binary search steps cdef long n_steps = 100 - cdef long n_samples = affinities.shape[0] + cdef long n_samples = sqdistances.shape[0] + cdef long n_neighbors = sqdistances.shape[1] + cdef int using_neighbors = n_neighbors < n_samples # Precisions of conditional Gaussian distributions cdef float beta cdef float beta_min @@ -61,11 +60,6 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( cdef float sum_Pi cdef float sum_disti_Pi cdef long i, j, k, l - cdef long n_neighbors = n_samples - cdef int using_neighbors = neighbors is not None - - if using_neighbors: - n_neighbors = neighbors.shape[1] # This array is later used as a 32bit array. It has multiple intermediate # floating point additions that benefit from the extra precision @@ -85,7 +79,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( sum_Pi = 0.0 for j in range(n_neighbors): if j != i or using_neighbors: - P[i, j] = math.exp(-affinities[i, j] * beta) + P[i, j] = math.exp(-sqdistances[i, j] * beta) sum_Pi += P[i, j] if sum_Pi == 0.0: @@ -94,7 +88,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( for j in range(n_neighbors): P[i, j] /= sum_Pi - sum_disti_Pi += affinities[i, j] * P[i, j] + sum_disti_Pi += sqdistances[i, j] * P[i, j] entropy = math.log(sum_Pi) + beta * sum_disti_Pi entropy_diff = entropy - desired_entropy diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index e512ce565553d..93d7a17eca9db 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -6,7 +6,8 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin from ..neighbors import NearestNeighbors, kneighbors_graph -from ..utils import check_array +from ..utils.deprecation import deprecated +from ..utils.validation import check_is_fitted from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -58,12 +59,35 @@ class Isomap(TransformerMixin, BaseEstimator): Algorithm to use for nearest neighbors search, passed to neighbors.NearestNeighbors instance. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. + metric : string, or callable, default="minkowski" + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`Glossary `. + + .. versionadded:: 0.22 + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + .. versionadded:: 0.22 + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.22 + Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -73,9 +97,6 @@ class Isomap(TransformerMixin, BaseEstimator): :class:`~sklearn.decomposition.KernelPCA` object used to implement the embedding. 
- training_data_ : array-like, shape (n_samples, n_features) - Stores the training data. - nbrs_ : sklearn.neighbors.NearestNeighbors instance Stores nearest neighbors instance, including BallTree or KDtree if applicable. @@ -104,7 +125,8 @@ class Isomap(TransformerMixin, BaseEstimator): def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', - neighbors_algorithm='auto', n_jobs=None): + neighbors_algorithm='auto', n_jobs=None, metric='minkowski', + p=2, metric_params=None): self.n_neighbors = n_neighbors self.n_components = n_components self.eigen_solver = eigen_solver @@ -113,14 +135,19 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.path_method = path_method self.neighbors_algorithm = neighbors_algorithm self.n_jobs = n_jobs + self.metric = metric + self.p = p + self.metric_params = metric_params def _fit_transform(self, X): - X = self._validate_X(X, accept_sparse='csr') self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, + metric=self.metric, p=self.p, + metric_params=self.metric_params, n_jobs=self.n_jobs) self.nbrs_.fit(X) - self.training_data_ = self.nbrs_._fit_X + self.n_features_in_ = self.nbrs_.n_features_in_ + self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel="precomputed", eigen_solver=self.eigen_solver, @@ -128,6 +155,8 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) kng = kneighbors_graph(self.nbrs_, self.n_neighbors, + metric=self.metric, p=self.p, + metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs) self.dist_matrix_ = graph_shortest_path(kng, @@ -138,6 +167,13 @@ def _fit_transform(self, X): self.embedding_ = self.kernel_pca_.fit_transform(G) + @property + @deprecated("Attribute training_data_ was deprecated in version 0.22 and " + "will be removed in 0.24.") + def training_data_(self): + check_is_fitted(self) + return self.nbrs_._fit_X + def reconstruction_error(self): """Compute the reconstruction error for the embedding. @@ -167,9 +203,9 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} + X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors} Sample data, shape = (n_samples, n_features), in the form of a - numpy array, precomputed tree, or NearestNeighbors + numpy array, sparse graph, precomputed tree, or NearestNeighbors object. y : Ignored @@ -186,7 +222,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree} + X : {array-like, sparse graph, BallTree, KDTree} Training vector, where n_samples in the number of samples and n_features is the number of features. @@ -212,21 +248,27 @@ def transform(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_queries, n_features) + If neighbors_algorithm='precomputed', X is assumed to be a + distance matrix or a sparse graph of shape + (n_queries, n_samples_fit). Returns ------- - X_new : array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_queries, n_components) """ - X = check_array(X) + check_is_fitted(self) distances, indices = self.nbrs_.kneighbors(X, return_distance=True) - # Create the graph of shortest distances from X to self.training_data_ - # via the nearest neighbors of X. + # Create the graph of shortest distances from X to + # training data via the nearest neighbors of X. 
# This can be done as a single array operation, but it potentially # takes a lot of memory. To avoid that, use a loop: - G_X = np.zeros((X.shape[0], self.training_data_.shape[0])) - for i in range(X.shape[0]): + + n_samples_fit = self.nbrs_.n_samples_fit_ + n_queries = distances.shape[0] + G_X = np.zeros((n_queries, n_samples_fit)) + for i in range(n_queries): G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 402153048eb14..dc77cff0f9da5 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -99,7 +99,7 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): """ knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X) X = knn._fit_X - n_samples = X.shape[0] + n_samples = knn.n_samples_fit_ ind = knn.kneighbors(X, return_distance=False)[:, 1:] data = barycenter_weights(X, X[ind], reg=reg) indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index e6a646d13ffd0..1052aeec9c955 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -19,7 +19,7 @@ from ..utils.extmath import _deterministic_vector_sign_flip from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel -from ..neighbors import kneighbors_graph +from ..neighbors import kneighbors_graph, NearestNeighbors def _graph_connected_component(graph, node_id): @@ -157,7 +157,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, Parameters ---------- - adjacency : array-like or sparse matrix, shape: (n_samples, n_samples) + adjacency : array-like or sparse graph, shape: (n_samples, n_samples) The adjacency matrix of the graph to embed. n_components : integer, optional, default 8 @@ -369,9 +369,14 @@ class SpectralEmbedding(BaseEstimator): affinity : string or callable, default : "nearest_neighbors" How to construct the affinity matrix. - - 'nearest_neighbors' : construct affinity matrix by knn graph - - 'rbf' : construct affinity matrix by rbf kernel - - 'precomputed' : interpret X as precomputed affinity matrix + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix by computing a radial basis + function (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. - callable : use passed in function as affinity the function takes in data matrix (n_samples, n_features) and return affinity matrix (n_samples, n_samples). 
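The new 'precomputed_nearest_neighbors' affinity documented above lets a sparse k-NN graph be computed once and reused by SpectralEmbedding. A minimal usage sketch, not part of the patch and modeled on the filtering test added further down; the dataset and parameter values are illustrative assumptions.

from sklearn.datasets import make_blobs
from sklearn.manifold import SpectralEmbedding
from sklearn.neighbors import NearestNeighbors

X, _ = make_blobs(n_samples=100, random_state=0)

# Precompute a sparse neighbors graph, possibly with more neighbors than
# the embedding needs; the estimator keeps only n_neighbors of them.
graph = NearestNeighbors(n_neighbors=10).fit(X).kneighbors_graph(
    X, mode='connectivity')

embedding = SpectralEmbedding(
    n_components=2, n_neighbors=5, random_state=0,
    affinity='precomputed_nearest_neighbors').fit_transform(graph)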
@@ -453,7 +458,8 @@ def __init__(self, n_components=2, affinity="nearest_neighbors", @property def _pairwise(self): - return self.affinity == "precomputed" + return self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] def _get_affinity_matrix(self, X, Y=None): """Calculate the affinity matrix from data @@ -477,6 +483,13 @@ def _get_affinity_matrix(self, X, Y=None): if self.affinity == 'precomputed': self.affinity_matrix_ = X return self.affinity_matrix_ + if self.affinity == 'precomputed_nearest_neighbors': + estimator = NearestNeighbors(n_neighbors=self.n_neighbors, + n_jobs=self.n_jobs, + metric="precomputed").fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode='connectivity') + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + return self.affinity_matrix_ if self.affinity == 'nearest_neighbors': if sparse.issparse(X): warnings.warn("Nearest neighbors affinity currently does " @@ -507,12 +520,12 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. If affinity is "precomputed" - X : array-like, shape (n_samples, n_samples), + X : {array-like, sparse matrix}, shape (n_samples, n_samples), Interpret X as precomputed adjacency graph computed from samples. @@ -522,12 +535,13 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = self._validate_X(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): - if self.affinity not in {"nearest_neighbors", "rbf", - "precomputed"}: + if self.affinity not in {"nearest_neighbors", "rbf", "precomputed", + "precomputed_nearest_neighbors"}: raise ValueError(("%s is not a valid affinity. Expected " "'precomputed', 'rbf', 'nearest_neighbors' " "or a callable.") % self.affinity) @@ -547,12 +561,12 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. If affinity is "precomputed" - X : array-like, shape (n_samples, n_samples), + X : {array-like, sparse matrix}, shape (n_samples, n_samples), Interpret X as precomputed adjacency graph computed from samples. diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 70732b8d6ac16..598b820263776 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -11,14 +11,14 @@ from time import time import numpy as np from scipy import linalg -import scipy.sparse as sp from scipy.spatial.distance import pdist from scipy.spatial.distance import squareform -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, issparse from ..neighbors import NearestNeighbors from ..base import BaseEstimator from ..utils import check_array from ..utils import check_random_state +from ..utils.validation import check_non_negative from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . 
import _utils @@ -53,14 +53,14 @@ def _joint_probabilities(distances, desired_perplexity, verbose): # the desired perplexity distances = distances.astype(np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( - distances, None, desired_perplexity, verbose) + distances, desired_perplexity, verbose) P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) return P -def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): +def _joint_probabilities_nn(distances, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. @@ -70,11 +70,9 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): Parameters ---------- - distances : array, shape (n_samples, k) - Distances of samples to its k nearest neighbors. - - neighbors : array, shape (n_samples, k) - Indices of the k nearest-neighbors for each samples. + distances : CSR sparse matrix, shape (n_samples, n_samples) + Distances of samples to its n_neighbors nearest neighbors. All other + distances are left to zero (and are not materialized in memory). desired_perplexity : float Desired perplexity of the joint probability distributions. @@ -90,17 +88,18 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): t0 = time() # Compute conditional probabilities such that they approximately match # the desired perplexity - n_samples, k = neighbors.shape - distances = distances.astype(np.float32, copy=False) - neighbors = neighbors.astype(np.int64, copy=False) + distances.sort_indices() + n_samples = distances.shape[0] + distances_data = distances.data.reshape(n_samples, -1) + distances_data = distances_data.astype(np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( - distances, neighbors, desired_perplexity, verbose) + distances_data, desired_perplexity, verbose) assert np.all(np.isfinite(conditional_P)), \ "All probabilities should be finite" # Symmetrize the joint probability distribution using sparse operations - P = csr_matrix((conditional_P.ravel(), neighbors.ravel(), - range(0, n_samples * k + 1, k)), + P = csr_matrix((conditional_P.ravel(), distances.indices, + distances.indptr), shape=(n_samples, n_samples)) P = P + P.T @@ -638,55 +637,35 @@ def __init__(self, n_components=2, perplexity=30.0, self.angle = angle def _fit(self, X, skip_num_points=0): - """Fit the model using X as training data. - - Note that sparse arrays can only be handled by method='exact'. - It is recommended that you convert your sparse array to dense - (e.g. `X.toarray()`) if it fits in memory, or otherwise using a - dimensionality reduction technique (e.g. TruncatedSVD). + """Private function to fit the model using X as training data.""" - Parameters - ---------- - X : array, shape (n_samples, n_features) or (n_samples, n_samples) - If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. Note that - when method='barnes_hut', X cannot be a sparse array and - will be converted to a 32 bit float array if need be. - Method='exact' allows sparse arrays and 64 bit floating point - inputs. - - skip_num_points : int (optional, default:0) - This does not compute the gradient for points with indices below - `skip_num_points`. This is useful when computing transforms of new - data where you'd like to keep the old data fixed. 
- """ if self.method not in ['barnes_hut', 'exact']: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") + if self.method == 'barnes_hut': + X = self._validate_X(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) + else: + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " "used with metric=\"precomputed\".") if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") - if np.any(X < 0): - raise ValueError("All distances should be positive, the " - "precomputed distances given as X is not " - "correct") - if self.method == 'barnes_hut' and sp.issparse(X): - raise TypeError('A sparse matrix was passed, but dense ' - 'data is required for method="barnes_hut". Use ' - 'X.toarray() to convert to a dense numpy array if ' - 'the array is small enough for it to fit in ' - 'memory. Otherwise consider dimensionality ' - 'reduction techniques (e.g. TruncatedSVD)') - if self.method == 'barnes_hut': - X = self._validate_X(X, ensure_min_samples=2, - dtype=[np.float32, np.float64]) - else: - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + + check_non_negative(X, "TSNE.fit(). With metric='precomputed', X " + "should contain positive distances.") + + if self.method == "exact" and issparse(X): + raise TypeError( + 'TSNE with method="exact" does not accept sparse ' + 'precomputed distance matrix. Use method="barnes_hut" ' + 'or provide the dense distance matrix.') + if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " @@ -730,17 +709,19 @@ def _fit(self, X, skip_num_points=0): "or then equal to one") else: - # Cpmpute the number of nearest neighbors to find. + # Compute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. - k = min(n_samples - 1, int(3. * self.perplexity + 1)) + n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1)) if self.verbose: - print("[t-SNE] Computing {} nearest neighbors...".format(k)) + print("[t-SNE] Computing {} nearest neighbors..." + .format(n_neighbors)) # Find the nearest neighbors for every point - knn = NearestNeighbors(algorithm='auto', n_neighbors=k, + knn = NearestNeighbors(algorithm='auto', + n_neighbors=n_neighbors, metric=self.metric) t0 = time() knn.fit(X) @@ -750,12 +731,11 @@ def _fit(self, X, skip_num_points=0): n_samples, duration)) t0 = time() - distances_nn, neighbors_nn = knn.kneighbors( - None, n_neighbors=k) + distances_nn = knn.kneighbors_graph(mode='distance') duration = time() - t0 if self.verbose: - print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..." - .format(n_samples, duration)) + print("[t-SNE] Computed neighbors for {} samples " + "in {:.3f}s...".format(n_samples, duration)) # Free the memory used by the ball_tree del knn @@ -766,11 +746,11 @@ def _fit(self, X, skip_num_points=0): # the method was derived using the euclidean method as in the # input space. Not sure of the implication of using a different # metric. 
- distances_nn **= 2 + distances_nn.data **= 2 # compute the joint probability distribution for the input space - P = _joint_probabilities_nn(distances_nn, neighbors_nn, - self.perplexity, self.verbose) + P = _joint_probabilities_nn(distances_nn, self.perplexity, + self.verbose) if isinstance(self.init, np.ndarray): X_embedded = self.init @@ -869,7 +849,10 @@ def fit_transform(self, X, y=None): ---------- X : array, shape (n_samples, n_features) or (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. y : Ignored @@ -891,7 +874,8 @@ def fit(self, X, y=None): If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' - or 'coo'. + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. y : Ignored """ diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 4502ffdd6c33b..6122840a5ef33 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -1,6 +1,7 @@ from itertools import product import numpy as np from numpy.testing import assert_almost_equal, assert_array_almost_equal +import pytest from sklearn import datasets from sklearn import manifold @@ -114,6 +115,57 @@ def test_pipeline(): assert .9 < clf.score(X, y) +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining NearestNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = 'auto' + n_neighbors = 10 + + X, _ = datasets.make_blobs(random_state=0) + X2, _ = datasets.make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = pipeline.make_pipeline( + neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode='distance'), + manifold.Isomap(n_neighbors=n_neighbors, metric='precomputed')) + est_compact = manifold.Isomap(n_neighbors=n_neighbors, + neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_different_metric(): + # Test that the metric parameters work correctly, and default to euclidean + def custom_metric(x1, x2): + return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) + + # metric, p, is_euclidean + metrics = [('euclidean', 2, True), + ('manhattan', 1, False), + ('minkowski', 1, False), + ('minkowski', 2, True), + (custom_metric, 2, False)] + + X, _ = datasets.make_blobs(random_state=0) + reference = manifold.Isomap().fit_transform(X) + + for metric, p, is_euclidean in metrics: + embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X) + + if is_euclidean: + assert_array_almost_equal(embedding, reference) + else: + with pytest.raises(AssertionError, match='not almost equal'): + assert_array_almost_equal(embedding, reference) + + def test_isomap_clone_bug(): # regression test for bug reported in #6062 model = manifold.Isomap() diff --git a/sklearn/manifold/tests/test_spectral_embedding.py 
b/sklearn/manifold/tests/test_spectral_embedding.py index d9c066c474b1c..a1d790c699a16 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -12,6 +12,7 @@ from sklearn.manifold import spectral_embedding from sklearn.metrics.pairwise import rbf_kernel from sklearn.metrics import normalized_mutual_info_score +from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans from sklearn.datasets.samples_generator import make_blobs from sklearn.utils.extmath import _deterministic_vector_sign_flip @@ -125,7 +126,9 @@ def test_spectral_embedding_two_components(seed=36): assert normalized_mutual_info_score(true_label, label_) == 1.0 -def test_spectral_embedding_precomputed_affinity(seed=36): +@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], + ids=["dense", "sparse"]) +def test_spectral_embedding_precomputed_affinity(X, seed=36): # Test spectral embedding with precomputed kernel gamma = 1.0 se_precomp = SpectralEmbedding(n_components=2, affinity="precomputed", @@ -133,14 +136,33 @@ def test_spectral_embedding_precomputed_affinity(seed=36): se_rbf = SpectralEmbedding(n_components=2, affinity="rbf", gamma=gamma, random_state=np.random.RandomState(seed)) - embed_precomp = se_precomp.fit_transform(rbf_kernel(S, gamma=gamma)) - embed_rbf = se_rbf.fit_transform(S) + embed_precomp = se_precomp.fit_transform(rbf_kernel(X, gamma=gamma)) + embed_rbf = se_rbf.fit_transform(X) assert_array_almost_equal( se_precomp.affinity_matrix_, se_rbf.affinity_matrix_) assert _check_with_col_sign_flipping(embed_precomp, embed_rbf, 0.05) -def test_spectral_embedding_callable_affinity(seed=36): +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors( + n_neighbors=n_neighbors + additional_neighbors).fit(S) + graph = nn.kneighbors_graph(S, mode='connectivity') + embedding = SpectralEmbedding(random_state=0, n_components=2, + affinity='precomputed_nearest_neighbors', + n_neighbors=n_neighbors + ).fit(graph).embedding_ + results.append(embedding) + + assert_array_equal(results[0], results[1]) + + +@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], + ids=["dense", "sparse"]) +def test_spectral_embedding_callable_affinity(X, seed=36): # Test spectral embedding with callable affinity gamma = 0.9 kern = rbf_kernel(S, gamma=gamma) @@ -152,8 +174,8 @@ def test_spectral_embedding_callable_affinity(seed=36): se_rbf = SpectralEmbedding(n_components=2, affinity="rbf", gamma=gamma, random_state=np.random.RandomState(seed)) - embed_rbf = se_rbf.fit_transform(S) - embed_callable = se_callable.fit_transform(S) + embed_rbf = se_rbf.fit_transform(X) + embed_callable = se_callable.fit_transform(X) assert_array_almost_equal( se_callable.affinity_matrix_, se_rbf.affinity_matrix_) assert_array_almost_equal(kern, se_rbf.affinity_matrix_) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 0b11e327256f6..34662604892af 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -3,11 +3,12 @@ import numpy as np from numpy.testing import assert_allclose import scipy.sparse as sp - import pytest -from sklearn.neighbors import BallTree from sklearn.neighbors import NearestNeighbors +from sklearn.neighbors import kneighbors_graph +from sklearn.exceptions import EfficiencyWarning +from sklearn.utils.testing import 
ignore_warnings from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -104,13 +105,10 @@ def flat_function(_, compute_error=True): def test_binary_search(): # Test if the binary search finds Gaussians with desired perplexity. random_state = check_random_state(0) - distances = random_state.randn(50, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(50, 5) + distances = pairwise_distances(data).astype(np.float32) desired_perplexity = 25.0 - P = _binary_search_perplexity(distances, None, desired_perplexity, - verbose=0) + P = _binary_search_perplexity(distances, desired_perplexity, verbose=0) P = np.maximum(P, np.finfo(np.double).eps) mean_perplexity = np.mean([np.exp(-np.sum(P[i] * np.log(P[i]))) for i in range(P.shape[0])]) @@ -124,34 +122,34 @@ def test_binary_search_neighbors(): n_samples = 200 desired_perplexity = 25.0 random_state = check_random_state(0) - distances = random_state.randn(n_samples, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) - P1 = _binary_search_perplexity(distances, None, desired_perplexity, - verbose=0) + data = random_state.randn(n_samples, 2).astype(np.float32, copy=False) + distances = pairwise_distances(data) + P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0) # Test that when we use all the neighbors the results are identical - k = n_samples - neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64, - copy=False) - distances_nn = np.array([distances[k, neighbors_nn[k]] - for k in range(n_samples)]) - P2 = _binary_search_perplexity(distances_nn, neighbors_nn, - desired_perplexity, verbose=0) - P_nn = np.array([P1[k, neighbors_nn[k]] for k in range(n_samples)]) - assert_array_almost_equal(P_nn, P2, decimal=4) - - # Test that the highest P_ij are the same when few neighbors are used - for k in np.linspace(80, n_samples, 5): + n_neighbors = n_samples - 1 + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, + mode='distance') + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, n_neighbors) + P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + + indptr = distance_graph.indptr + P1_nn = np.array([P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]] + for k in range(n_samples)]) + assert_array_almost_equal(P1_nn, P2, decimal=4) + + # Test that the highest P_ij are the same when fewer neighbors are used + for k in np.linspace(150, n_samples - 1, 5): k = int(k) - topn = k * 10 # check the top 10 *k entries out of k * k entries - neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64, - copy=False) - distances_nn = np.array([distances[k, neighbors_nn[k]] - for k in range(n_samples)]) - P2k = _binary_search_perplexity(distances_nn, neighbors_nn, - desired_perplexity, verbose=0) + topn = k * 10 # check the top 10 * k entries out of k * k entries + distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance') + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, k) + P2k = _binary_search_perplexity(distances_nn, desired_perplexity, + verbose=0) + assert_array_almost_equal(P1_nn, P2, decimal=2) 
idx = np.argsort(P1.ravel())[::-1] P1top = P1.ravel()[idx][:topn] idx = np.argsort(P2k.ravel())[::-1] @@ -163,20 +161,22 @@ def test_binary_perplexity_stability(): # Binary perplexity search should be stable. # The binary_search_perplexity had a bug wherein the P array # was uninitialized, leading to sporadically failing tests. - k = 10 + n_neighbors = 10 n_samples = 100 random_state = check_random_state(0) - distances = random_state.randn(n_samples, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(n_samples, 5) + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, + mode='distance') + distances = distance_graph.data.astype(np.float32, copy=False) + distances = distances.reshape(n_samples, n_neighbors) last_P = None - neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64, - copy=False) + desired_perplexity = 3 for _ in range(100): - P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(), - 3, verbose=0) - P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0) + P = _binary_search_perplexity(distances.copy(), desired_perplexity, + verbose=0) + P1 = _joint_probabilities_nn(distance_graph, desired_perplexity, + verbose=0) # Convert the sparse matrix to a dense one for testing P1 = P1.toarray() if last_P is None: @@ -262,14 +262,15 @@ def test_optimization_minimizes_kl_divergence(): assert kl_divergences[2] <= kl_divergences[1] -def test_fit_csr_matrix(): +@pytest.mark.parametrize('method', ['exact', 'barnes_hut']) +def test_fit_csr_matrix(method): # X can be a sparse matrix. random_state = check_random_state(0) X = random_state.randn(50, 2) X[(np.random.randint(0, 50, 25), np.random.randint(0, 2, 25))] = 0.0 X_csr = sp.csr_matrix(X) tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0, - random_state=0, method='exact', n_iter=750) + random_state=0, method=method, n_iter=750) X_embedded = tsne.fit_transform(X_csr) assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0, rtol=1.1e-1) @@ -295,8 +296,8 @@ def test_trustworthiness_not_euclidean_metric(): random_state = check_random_state(0) X = random_state.randn(100, 2) assert (trustworthiness(X, X, metric='cosine') == - trustworthiness(pairwise_distances(X, metric='cosine'), X, - metric='precomputed')) + trustworthiness(pairwise_distances(X, metric='cosine'), X, + metric='precomputed')) def test_early_exaggeration_too_small(): @@ -313,20 +314,55 @@ def test_too_few_iterations(): tsne.fit_transform(np.array([[0.0], [0.0]])) -def test_non_square_precomputed_distances(): - # Precomputed distance matrices must be square matrices. 
+@pytest.mark.parametrize('method, retype', [ + ('exact', np.asarray), + ('barnes_hut', np.asarray), + ('barnes_hut', sp.csr_matrix), +]) +@pytest.mark.parametrize('D, message_regex', [ + ([[0.0], [1.0]], ".* square distance matrix"), + ([[0., -1.], [1., 0.]], ".* positive.*"), +]) +def test_bad_precomputed_distances(method, D, retype, message_regex): + tsne = TSNE(metric="precomputed", method=method) + with pytest.raises(ValueError, match=message_regex): + tsne.fit_transform(retype(D)) + + +def test_exact_no_precomputed_sparse(): + tsne = TSNE(metric='precomputed', method='exact') + with pytest.raises(TypeError, match='sparse'): + tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]])) + + +def test_high_perplexity_precomputed_sparse_distances(): + # Perplexity should be less than 50 + dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]]) + bad_dist = sp.csr_matrix(dist) tsne = TSNE(metric="precomputed") - with pytest.raises(ValueError, match=".* square distance matrix"): - tsne.fit_transform(np.array([[0.0], [1.0]])) + msg = "3 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(bad_dist) + + +@ignore_warnings(category=EfficiencyWarning) +def test_sparse_precomputed_distance(): + """Make sure that TSNE works identically for sparse and dense matrix""" + random_state = check_random_state(0) + X = random_state.randn(100, 2) + D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance', + include_self=True) + D = pairwise_distances(X) + assert sp.issparse(D_sparse) + assert_almost_equal(D_sparse.A, D) -def test_non_positive_precomputed_distances(): - # Precomputed distance matrices must be positive. - bad_dist = np.array([[0., -1.], [1., 0.]]) - for method in ['barnes_hut', 'exact']: - tsne = TSNE(metric="precomputed", method=method) - with pytest.raises(ValueError, match="All distances .*precomputed.*"): - tsne.fit_transform(bad_dist) + tsne = TSNE(metric="precomputed", random_state=0) + Xt_dense = tsne.fit_transform(D) + + for fmt in ['csr', 'lil']: + Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt)) + assert_almost_equal(Xt_dense, Xt_sparse) def test_non_positive_computed_distances(): @@ -563,17 +599,6 @@ def test_reduction_to_one_component(): assert(np.all(np.isfinite(X_embedded))) -def test_no_sparse_on_barnes_hut(): - # No sparse matrices allowed on Barnes-Hut. 
- random_state = check_random_state(0) - X = random_state.randn(100, 2) - X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0 - X_csr = sp.csr_matrix(X) - tsne = TSNE(n_iter=199, method='barnes_hut') - with pytest.raises(TypeError, match="A sparse matrix was.*"): - tsne.fit_transform(X_csr) - - @pytest.mark.parametrize('method', ['barnes_hut', 'exact']) @pytest.mark.parametrize('dt', [np.float32, np.float64]) def test_64bit(method, dt): @@ -616,25 +641,17 @@ def test_barnes_hut_angle(): degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) - distances = random_state.randn(n_samples, n_features) - distances = distances.astype(np.float32) - distances = abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(n_samples, n_features) + distances = pairwise_distances(data) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) - k = n_samples - 1 - bt = BallTree(distances) - distances_nn, neighbors_nn = bt.query(distances, k=k + 1) - neighbors_nn = neighbors_nn[:, 1:] - distances_nn = np.array([distances[i, neighbors_nn[i]] - for i in range(n_samples)]) - assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\ - abs(distances[0, neighbors_nn[0]] - distances_nn[0]) - P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn, - perplexity, verbose=0) + n_neighbors = n_samples - 1 + distances_csr = NearestNeighbors().fit(data).kneighbors_graph( + n_neighbors=n_neighbors, mode='distance') + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 550cab3c01bca..85cc9c3e6a0ad 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -7,6 +7,7 @@ from .kd_tree import KDTree from .dist_metrics import DistanceMetric from .graph import kneighbors_graph, radius_neighbors_graph +from .graph import KNeighborsTransformer, RadiusNeighborsTransformer from .unsupervised import NearestNeighbors from .classification import KNeighborsClassifier, RadiusNeighborsClassifier from .regression import KNeighborsRegressor, RadiusNeighborsRegressor @@ -21,10 +22,12 @@ 'KDTree', 'KNeighborsClassifier', 'KNeighborsRegressor', + 'KNeighborsTransformer', 'NearestCentroid', 'NearestNeighbors', 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'kneighbors_graph', 'radius_neighbors_graph', 'KernelDensity', diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 37b77f16f7920..28d0483ac9b5b 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -26,7 +26,8 @@ from ..utils import check_X_y, check_array, gen_even_slices from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..exceptions import DataConversionWarning +from ..utils.validation import check_non_negative +from ..exceptions import DataConversionWarning, EfficiencyWarning VALID_METRICS = dict(ball_tree=BallTree.valid_metrics, kd_tree=KDTree.valid_metrics, @@ -103,6 +104,187 @@ def _get_weights(dist, weights): "'distance', or a callable function") +def _is_sorted_by_data(graph): + """Returns whether the graph's non-zero entries are sorted by data + + The non-zero 
entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices() + - sorted by data, as after _check_precomputed(graph) + - not sorted. + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + Returns + ------- + res : boolean + Whether input graph is sorted by data + """ + assert graph.format == 'csr' + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return (out_of_order.sum() == out_of_order[line_change].sum()) + + +def _check_precomputed(X): + """Check precomputed distance matrix + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X) + check_non_negative(X, whom="precomputed distance matrix.") + return X + else: + graph = X + + if graph.format not in ('csr', 'csc', 'coo', 'lil'): + raise TypeError('Sparse matrix in {!r} format is not supported due to ' + 'its handling of explicit zeros'.format(graph.format)) + copied = graph.format != 'csr' + graph = check_array(graph, accept_sparse='csr') + check_non_negative(graph, whom="precomputed distance matrix.") + + if not _is_sorted_by_data(graph): + warnings.warn('Precomputed sparse input was not sorted by data.', + EfficiencyWarning) + if not copied: + graph = graph.copy() + + # if each sample has the same number of provided neighbors + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind='mergesort') + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind='mergesort') + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : boolean + If False, distances will not be returned + + Returns + ------- + neigh_dist : array, shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if return_distance=True. + + neigh_ind : array, shape (n_samples, n_neighbors) + Indices of nearest neighbors. 
+ """ + n_samples = graph.shape[0] + assert graph.format == 'csr' + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + '%d neighbors per samples are required, but some samples have only' + ' %d neighbors in precomputed graph matrix. Decrease number of ' + 'neighbors used or recompute the graph with more neighbors.' + % (n_neighbors, row_nnz_min)) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode='clip').reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + radius : float > 0 + Radius of neighborhoods. + + return_distance : boolean + If False, distances will not be returned + + Returns + ------- + neigh_dist : array, shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if return_distance=True. + + neigh_ind :array, shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == 'csr' + + no_filter_needed = graph.data.max() <= radius + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = np.array(np.split(data, indptr[1:-1])) + neigh_ind = np.array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """Base class for nearest neighbors estimators.""" @@ -192,21 +374,28 @@ def _fit(self, X): self._fit_X = X._fit_X self._tree = X._tree self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ return self elif isinstance(X, BallTree): self._fit_X = X.data self._tree = X self._fit_method = 'ball_tree' + self.n_samples_fit_ = X.data.shape[0] return self elif isinstance(X, KDTree): self._fit_X = X.data self._tree = X self._fit_method = 'kd_tree' + self.n_samples_fit_ = X.data.shape[0] return self - X = self._validate_X(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + self.n_features_in_ = X.shape[1] + else: + X = self._validate_X(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: @@ -233,10 +422,12 @@ def _fit(self, X): self._fit_X = X.copy() self._tree = None self._fit_method = 'brute' + self.n_samples_fit_ = X.shape[0] return self self._fit_method = self.algorithm self._fit_X = X + self.n_samples_fit_ = X.shape[0] if self._fit_method == 'auto': # A tree approach is better for small number of neighbors, @@ -289,13 +480,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, 
n_neighbors, return_distance): +def _tree_query_parallel_helper(tree, *args, **kwargs): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance) + return tree.query(*args, **kwargs) class KNeighborsMixin: @@ -342,8 +533,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -357,11 +548,11 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Returns ------- - dist : array + neigh_dist : array, shape (n_queries, n_neighbors) Array representing the lengths to points, only present if return_distance=True - ind : array + neigh_ind : array, shape (n_queries, n_neighbors) Indices of the nearest points in the population matrix. Examples @@ -406,7 +597,10 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -414,28 +608,34 @@ class from an array representing our data set and ask who's # returned, which is removed later n_neighbors += 1 - train_size = self._fit_X.shape[0] - if n_neighbors > train_size: + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " " but n_samples = %d, n_neighbors = %d" % - (train_size, n_neighbors) + (n_samples_fit, n_neighbors) ) - n_samples, _ = X.shape - sample_range = np.arange(n_samples)[:, None] n_jobs = effective_n_jobs(self.n_jobs) - if self._fit_method == 'brute': + chunked_results = None + if (self._fit_method == 'brute' and + self.effective_metric_ == 'precomputed' and issparse(X)): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, + return_distance=return_distance) + elif self._fit_method == 'brute': reduce_func = partial(self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance) # for efficiency, use squared euclidean distances - kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' - else self.effective_metric_params_) + if self.effective_metric_ == 'euclidean': + kwds = {'squared': True} + else: + kwds = self.effective_metric_params_ - result = list(pairwise_distances_chunked( + chunked_results = list(pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **kwds)) @@ -456,7 +656,7 @@ class from an array representing our data set and ask who's else: delayed_query = delayed(_tree_query_parallel_helper) parallel_kwargs = {"prefer": "threads"} - result = Parallel(n_jobs, **parallel_kwargs)( + chunked_results = Parallel(n_jobs, **parallel_kwargs)( delayed_query( self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) @@ -464,23 +664,26 @@ class from an array representing our data set and ask who's else: raise ValueError("internal: _fit_method not recognized") - if return_distance: - dist, neigh_ind = zip(*result) 
- result = np.vstack(dist), np.vstack(neigh_ind) - else: - result = np.vstack(result) + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) if not query_is_train: - return result + return results else: # If the query data is the same as the indexed data, we would like # to ignore the first nearest neighbor of every sample, i.e # the sample itself. if return_distance: - dist, neigh_ind = result + neigh_dist, neigh_ind = results else: - neigh_ind = result + neigh_ind = results + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] sample_mask = neigh_ind != sample_range # Corner case: When the number of duplicates are more @@ -489,14 +692,13 @@ class from an array representing our data set and ask who's # In that case mask the first duplicate. dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) + neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) if return_distance: - dist = np.reshape( - dist[sample_mask], (n_samples, n_neighbors - 1)) - return dist, neigh_ind + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + return neigh_dist, neigh_ind return neigh_ind def kneighbors_graph(self, X=None, n_neighbors=None, @@ -505,8 +707,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -522,7 +724,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit] + A : sparse graph in CSR format, shape = [n_queries, n_samples_fit] n_samples_fit is the number of samples in the fitted data A[i, j] is assigned the weight of edge that connects i to j. @@ -547,21 +749,13 @@ def kneighbors_graph(self, X=None, n_neighbors=None, if n_neighbors is None: n_neighbors = self.n_neighbors - # kneighbors does the None handling. 
- if X is not None: - X = check_array(X, accept_sparse='csr') - n_samples1 = X.shape[0] - else: - n_samples1 = self._fit_X.shape[0] - - n_samples2 = self._fit_X.shape[0] - n_nonzero = n_samples1 * n_neighbors - A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + # check the input only in self.kneighbors # construct CSR matrix representation of the k-NN graph if mode == 'connectivity': - A_data = np.ones(n_samples1 * n_neighbors) A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) elif mode == 'distance': A_data, A_ind = self.kneighbors( @@ -573,19 +767,24 @@ def kneighbors_graph(self, X=None, n_neighbors=None, 'Unsupported mode, must be one of "connectivity" ' 'or "distance" but got "%s" instead' % mode) + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + kneighbors_graph = csr_matrix((A_data, A_ind.ravel(), A_indptr), - shape=(n_samples1, n_samples2)) + shape=(n_queries, n_samples_fit)) return kneighbors_graph -def _tree_query_radius_parallel_helper(tree, data, radius, return_distance): +def _tree_query_radius_parallel_helper(tree, *args, **kwargs): """Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors The Cython method tree.query_radius is not directly picklable by cloudpickle under PyPy. """ - return tree.query_radius(data, radius, return_distance) + return tree.query_radius(*args, **kwargs) class RadiusNeighborsMixin: @@ -625,7 +824,8 @@ def _radius_neighbors_reduce_func(self, dist, start, results = neigh_ind return results - def radius_neighbors(self, X=None, radius=None, return_distance=True): + def radius_neighbors(self, X=None, radius=None, return_distance=True, + sort_results=False): """Finds the neighbors within a given radius of a point or points. Return the indices and distances of each point from the dataset @@ -647,16 +847,24 @@ def radius_neighbors(self, X=None, radius=None, return_distance=True): (default is the value passed to the constructor). return_distance : boolean, optional. Defaults to True. - If False, distances will not be returned + If False, distances will not be returned. + + sort_results : boolean, optional. Defaults to False. + If True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. + + .. versionadded:: 0.22 Returns ------- - dist : array, shape (n_samples,) of arrays + neigh_dist : array, shape (n_samples,) of arrays Array representing the distances to each point, only present if return_distance=True. The distance values are computed according to the ``metric`` constructor parameter. - ind : array, shape (n_samples,) of arrays + neigh_ind : array, shape (n_samples,) of arrays An array of arrays of indices of the approximate nearest points from the population matrix that lie within a ball of size ``radius`` around the query points. 
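A minimal usage sketch for the ``sort_results`` flag documented above; the
sample data, radius, and shapes are assumptions made only for illustration.

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> rng = np.random.RandomState(0)
    >>> X = rng.rand(20, 3)
    >>> neigh = NearestNeighbors(radius=0.5).fit(X)
    >>> neigh_dist, neigh_ind = neigh.radius_neighbors(X, sort_results=True)
    >>> # each row of neigh_dist is now sorted in ascending order, and
    >>> # the matching row of neigh_ind is reordered accordingly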
@@ -695,7 +903,10 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -703,7 +914,12 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - if self._fit_method == 'brute': + if (self._fit_method == 'brute' and + self.effective_metric_ == 'precomputed' and issparse(X)): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance) + + elif self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': radius *= radius @@ -715,23 +931,23 @@ class from an array representing our data set and ask who's radius=radius, return_distance=return_distance) - results = pairwise_distances_chunked( + chunked_results = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, **kwds) if return_distance: - dist_chunks, neigh_ind_chunks = zip(*results) - dist_list = sum(dist_chunks, []) + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = sum(neigh_dist_chunks, []) neigh_ind_list = sum(neigh_ind_chunks, []) # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. - dist = np.empty(len(dist_list), dtype='object') - dist[:] = dist_list + # to understand why this is initialized this way. + neigh_dist = np.empty(len(neigh_dist_list), dtype='object') + neigh_dist[:] = neigh_dist_list neigh_ind = np.empty(len(neigh_ind_list), dtype='object') neigh_ind[:] = neigh_ind_list - results = dist, neigh_ind + results = neigh_dist, neigh_ind else: - neigh_ind_list = sum(results, []) + neigh_ind_list = sum(chunked_results, []) results = np.empty(len(neigh_ind_list), dtype='object') results[:] = neigh_ind_list @@ -750,15 +966,18 @@ class from an array representing our data set and ask who's else: delayed_query = delayed(_tree_query_radius_parallel_helper) parallel_kwargs = {"prefer": "threads"} - results = Parallel(n_jobs, **parallel_kwargs)( - delayed_query(self._tree, X[s], radius, return_distance) + + chunked_results = Parallel(n_jobs, **parallel_kwargs)( + delayed_query(self._tree, X[s], radius, return_distance, + sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) ) if return_distance: - neigh_ind, dist = tuple(zip(*results)) - results = np.hstack(dist), np.hstack(neigh_ind) + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) else: - results = np.hstack(results) + results = np.hstack(chunked_results) else: raise ValueError("internal: _fit_method not recognized") @@ -769,7 +988,7 @@ class from an array representing our data set and ask who's # to ignore the first nearest neighbor of every sample, i.e # the sample itself. 
if return_distance: - dist, neigh_ind = results + neigh_dist, neigh_ind = results else: neigh_ind = results @@ -778,13 +997,14 @@ class from an array representing our data set and ask who's neigh_ind[ind] = ind_neighbor[mask] if return_distance: - dist[ind] = dist[ind][mask] + neigh_dist[ind] = neigh_dist[ind][mask] if return_distance: - return dist, neigh_ind + return neigh_dist, neigh_ind return neigh_ind - def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): + def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity', + sort_results=False): """Computes the (weighted) graph of Neighbors for points in X Neighborhoods are restricted the points at a distance lower than @@ -792,7 +1012,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): Parameters ---------- - X : array-like, shape = [n_samples, n_features], optional + X : array-like, shape = [n_queries, n_features], optional The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -806,9 +1026,17 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): connectivity matrix with ones and zeros, in 'distance' the edges are Euclidean distance between points. + sort_results : boolean, optional. Defaults to False. + If True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. + Only used with mode='distance'. + + .. versionadded:: 0.22 + Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_queries, n_samples_fit] + n_samples_fit is the number of samples in the fitted data A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -829,10 +1057,9 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): kneighbors_graph """ check_is_fitted(self) - if X is not None: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - n_samples2 = self._fit_X.shape[0] + # check the input only in self.radius_neighbors + if radius is None: radius = self.radius @@ -843,14 +1070,16 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): A_data = None elif mode == 'distance': dist, A_ind = self.radius_neighbors(X, radius, - return_distance=True) + return_distance=True, + sort_results=sort_results) A_data = np.concatenate(list(dist)) else: raise ValueError( 'Unsupported mode, must be one of "connectivity", ' 'or "distance" but got %s instead' % mode) - n_samples1 = A_ind.shape[0] + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ n_neighbors = np.array([len(a) for a in A_ind]) A_ind = np.concatenate(list(A_ind)) if A_data is None: @@ -859,7 +1088,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): np.cumsum(n_neighbors))) return csr_matrix((A_data, A_ind, A_indptr), - shape=(n_samples1, n_samples2)) + shape=(n_queries, n_samples_fit)) class SupervisedFloatMixin: diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index a72f710ae57ea..209bd93537166 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -74,6 +74,9 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -157,13 +160,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y : array of shape [n_queries] or [n_queries, n_outputs] Class labels for each data sample. """ X = check_array(X, accept_sparse='csr') @@ -176,10 +179,10 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - n_samples = _num_samples(X) + n_queries = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) - y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, classes_k in enumerate(classes_): if weights is None: mode, _ = stats.mode(_y[neigh_ind, k], axis=1) @@ -199,13 +202,13 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_queries, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -220,7 +223,7 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - n_samples = _num_samples(X) + n_queries = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) if weights is None: @@ -230,7 +233,7 @@ def predict_proba(self, X): probabilities = [] for k, classes_k in enumerate(classes_): pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_samples, classes_k.size)) + proba_k = np.zeros((n_queries, classes_k.size)) # a simple ':' index doesn't work right for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) @@ -303,6 +306,9 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. outlier_label : {manual label, 'most_frequent'}, optional (default = None) label for outlier samples (samples with no neighbors in given radius). @@ -448,13 +454,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y : array of shape [n_queries] or [n_queries, n_outputs] Class labels for each data sample. 
""" @@ -466,9 +472,8 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - n_samples = probs[0].shape[0] - y_pred = np.empty((n_samples, n_outputs), - dtype=classes_[0].dtype) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, prob in enumerate(probs): # iterate over multi-output, assign labels based on probabilities @@ -491,23 +496,23 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_queries, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. """ X = check_array(X, accept_sparse='csr') - n_samples = _num_samples(X) + n_queries = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = np.zeros(n_samples, dtype=np.bool) + outlier_mask = np.zeros(n_queries, dtype=np.bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] outliers = np.flatnonzero(outlier_mask) inliers = np.flatnonzero(~outlier_mask) @@ -535,7 +540,7 @@ def predict_proba(self, X): pred_labels = np.zeros(len(neigh_ind), dtype=object) pred_labels[:] = [_y[ind, k] for ind in neigh_ind] - proba_k = np.zeros((n_samples, classes_k.size)) + proba_k = np.zeros((n_queries, classes_k.size)) proba_inl = np.zeros((len(inliers), classes_k.size)) # samples have different size of neighbors within the same radius diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py index 3999ff458e121..da3954ff909c7 100644 --- a/sklearn/neighbors/graph.py +++ b/sklearn/neighbors/graph.py @@ -1,11 +1,15 @@ """Nearest Neighbors graph functions""" # Author: Jake Vanderplas +# Tom Dupre la Tour # # License: BSD 3 clause (C) INRIA, University of Amsterdam - from .base import KNeighborsMixin, RadiusNeighborsMixin +from .base import NeighborsBase +from .base import UnsupervisedMixin from .unsupervised import NearestNeighbors +from ..base import TransformerMixin +from ..utils.validation import check_is_fitted def _check_params(X, metric, p, metric_params): @@ -21,14 +25,16 @@ def _check_params(X, metric, p, metric_params): func_param, param_name, est_params[param_name])) -def _query_include_self(X, include_self): +def _query_include_self(X, include_self, mode): """Return the query based on include_self param""" - if include_self: - query = X._fit_X - else: - query = None + if include_self == 'auto': + include_self = mode == 'connectivity' - return query + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', @@ -65,10 +71,10 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', metric_params : dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False. + include_self : bool or 'auto', default=False Whether or not to mark each sample as the first nearest neighbor to - itself. If `None`, then True is used for mode='connectivity' and False - for mode='distance' as this will preserve backwards compatibility. + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. 
n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. @@ -78,7 +84,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -101,7 +107,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', else: _check_params(X, metric, p, metric_params) - query = _query_include_self(X, include_self) + query = _query_include_self(X._fit_X, include_self, mode) return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) @@ -143,10 +149,10 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', metric_params : dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False + include_self : bool or 'auto', default=False Whether or not to mark each sample as the first nearest neighbor to - itself. If `None`, then True is used for mode='connectivity' and False - for mode='distance' as this will preserve backwards compatibility. + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. @@ -156,7 +162,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -180,5 +186,284 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', else: _check_params(X, metric, p, metric_params) - query = _query_include_self(X, include_self) + query = _query_include_self(X._fit_X, include_self, mode) return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, + UnsupervisedMixin, TransformerMixin): + """Transform X into a (weighted) graph of k nearest neighbors + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. 
This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default='minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=1 + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Examples + -------- + >>> from sklearn.manifold import Isomap + >>> from sklearn.neighbors import KNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> estimator = make_pipeline( + ... KNeighborsTransformer(n_neighbors=5, mode='distance'), + ... Isomap(neighbors_algorithm='precomputed')) + """ + def __init__(self, mode='distance', n_neighbors=5, algorithm='auto', + leaf_size=30, metric='minkowski', p=2, metric_params=None, + n_jobs=1): + super(KNeighborsTransformer, self).__init__( + n_neighbors=n_neighbors, radius=None, algorithm=algorithm, + leaf_size=leaf_size, metric=metric, p=p, + metric_params=metric_params, n_jobs=n_jobs) + self.mode = mode + + def transform(self, X): + """Computes the (weighted) graph of Neighbors for points in X + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + check_is_fitted(self) + add_one = self.mode == 'distance' + return self.kneighbors_graph(X, mode=self.mode, + n_neighbors=self.n_neighbors + add_one) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : ignored + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. 
+ """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, + UnsupervisedMixin, TransformerMixin): + """Transform X into a (weighted) graph of neighbors nearer than a radius + + The transformed data is a sparse graph as returned by + radius_neighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + radius : float, default=1. + Radius of neighborhood in the transformed sparse graph. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default='minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=1 + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Examples + -------- + >>> from sklearn.cluster import DBSCAN + >>> from sklearn.neighbors import RadiusNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> estimator = make_pipeline( + ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), + ... 
DBSCAN(min_samples=30, metric='precomputed')) + """ + def __init__(self, mode='distance', radius=1., algorithm='auto', + leaf_size=30, metric='minkowski', p=2, metric_params=None, + n_jobs=1): + super(RadiusNeighborsTransformer, self).__init__( + n_neighbors=None, radius=radius, algorithm=algorithm, + leaf_size=leaf_size, metric=metric, p=p, + metric_params=metric_params, n_jobs=n_jobs) + self.mode = mode + + def transform(self, X): + """Computes the (weighted) graph of Neighbors for points in X + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, + sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : ignored + + Returns + ------- + Xt : CSR sparse graph, shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + return self.fit(X).transform(X) diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index f4f697565cd3e..fa02bed235535 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -60,8 +60,9 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. - If 'precomputed', the training input X is expected to be a distance - matrix. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a sparse matrix, in which case only "nonzero" + elements may be considered neighbors. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable @@ -118,8 +119,6 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - Affects only :meth:`kneighbors` and :meth:`kneighbors_graph` methods. - Attributes ---------- @@ -239,7 +238,7 @@ def fit(self, X, y=None): super().fit(X) - n_samples = self._fit_X.shape[0] + n_samples = self.n_samples_fit_ if self.n_neighbors > n_samples: warnings.warn("n_neighbors (%s) is greater than the " "total number of samples (%s). 
n_neighbors " @@ -247,8 +246,8 @@ def fit(self, X, y=None): % (self.n_neighbors, n_samples)) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ = ( - self.kneighbors(None, n_neighbors=self.n_neighbors_)) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors( + n_neighbors=self.n_neighbors_) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_) @@ -320,7 +319,7 @@ def _predict(self, X=None): is_inlier = np.ones(X.shape[0], dtype=int) is_inlier[self.decision_function(X) < 0] = -1 else: - is_inlier = np.ones(self._fit_X.shape[0], dtype=int) + is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 return is_inlier @@ -475,17 +474,17 @@ def _local_reachability_density(self, distances_X, neighbors_indices): Parameters ---------- - distances_X : array, shape (n_query, self.n_neighbors) + distances_X : array, shape (n_queries, self.n_neighbors) Distances to the neighbors (in the training samples `self._fit_X`) of each query point to compute the LRD. - neighbors_indices : array, shape (n_query, self.n_neighbors) + neighbors_indices : array, shape (n_queries, self.n_neighbors) Neighbors indices (of each query point) among training samples self._fit_X. Returns ------- - local_reachability_density : array, shape (n_samples,) + local_reachability_density : array, shape (n_queries,) The local reachability density of each sample. """ dist_k = self._distances_fit_X_[neighbors_indices, diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index a8819b222c1bd..cbb033f0b8cae 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -13,7 +13,6 @@ import warnings import numpy as np -from scipy.sparse import issparse from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin from .base import RadiusNeighborsMixin, SupervisedFloatMixin @@ -78,6 +77,9 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -151,20 +153,15 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of int, shape = [n_samples] or [n_samples, n_outputs] + y : array of int, shape = [n_queries] or [n_queries, n_outputs] Target values """ - if issparse(X) and self.metric == 'precomputed': - raise ValueError( - "Sparse matrices not supported for prediction with " - "precomputed kernels. Densify your matrix." - ) X = check_array(X, accept_sparse='csr') neigh_dist, neigh_ind = self.kneighbors(X) @@ -249,6 +246,9 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -315,13 +315,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of float, shape = [n_samples] or [n_samples, n_outputs] + y : array of float, shape = [n_queries] or [n_queries, n_outputs] Target values """ X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/tests/test_graph.py b/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000..b4f6ddb42ed06 --- /dev/null +++ b/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,79 @@ +import numpy as np + +from sklearn.metrics import euclidean_distances +from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors.base import _is_sorted_by_data + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ['distance', 'connectivity']: + add_one = mode == 'distance' + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), ) + assert Xt.format == 'csr' + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one), ) + assert X2t.format == 'csr' + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ['distance', 'connectivity']: + add_one = mode == 'distance' + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), ) + assert Xt.format == 'csr' + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one), ) + assert X2t.format == 'csr' + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using 
transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 3da1c2579700f..0d7166da64fd8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1,19 +1,22 @@ from itertools import product +import pytest import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) -import pytest - from sklearn import metrics from sklearn import neighbors, datasets +from sklearn.base import clone from sklearn.exceptions import DataConversionWarning +from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors.base import VALID_METRICS_SPARSE, VALID_METRICS +from sklearn.neighbors.base import _is_sorted_by_data, _check_precomputed +from sklearn.pipeline import make_pipeline from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises @@ -133,15 +136,15 @@ def test_not_fitted_error_gets_raised(): assert_raises(NotFittedError, neighbors_.radius_neighbors_graph, X) -def test_precomputed(random_state=42): +@ignore_warnings(category=EfficiencyWarning) +def check_precomputed(make_train_test, estimators): """Tests unsupervised NearestNeighbors with a distance matrix.""" # Note: smaller samples may result in spurious test success - rng = np.random.RandomState(random_state) + rng = np.random.RandomState(42) X = rng.random_sample((10, 4)) Y = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X, metric='euclidean') - DYX = metrics.pairwise_distances(Y, X, metric='euclidean') - for method in ['kneighbors']: + DXX, DYX = make_train_test(X, Y) + for method in ['kneighbors', ]: # TODO: also test radius_neighbors, but requires different assertion # As a feature matrix (n_samples by n_features) @@ -175,11 +178,7 @@ def test_precomputed(random_state=42): assert_raises(ValueError, getattr(nbrs_D, method), X) target = np.arange(X.shape[0]) - for Est in (neighbors.KNeighborsClassifier, - neighbors.RadiusNeighborsClassifier, - neighbors.KNeighborsRegressor, - neighbors.RadiusNeighborsRegressor): - print(Est) + for Est in estimators: est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) @@ -188,6 +187,118 @@ def test_precomputed(random_state=42): assert_array_almost_equal(pred_X, pred_D) +def test_precomputed_dense(): + def make_train_test(X_train, X_test): + return (metrics.pairwise_distances(X_train), + metrics.pairwise_distances(X_test, X_train)) + + estimators = [ + neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize('fmt', ['csr', 'lil']) +def test_precomputed_sparse_knn(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) + return (nn.kneighbors_graph(X_train, mode='distance').asformat(fmt), + nn.kneighbors_graph(X_test, mode='distance').asformat(fmt)) + + # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor + # since the 
precomputed neighbors graph is built with k neighbors only. + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize('fmt', ['csr', 'lil']) +def test_precomputed_sparse_radius(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(radius=1).fit(X_train) + return (nn.radius_neighbors_graph(X_train, + mode='distance').asformat(fmt), + nn.radius_neighbors_graph(X_test, + mode='distance').asformat(fmt)) + + # We do not test KNeighborsClassifier and KNeighborsRegressor + # since the precomputed neighbors graph is built with a radius. + estimators = [ + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +def test_is_sorted_by_data(): + # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, + # entries in each row can be sorted by indices, by data, or unsorted. + # _is_sorted_by_data should return True when entries are sorted by data, + # and False in all other cases. + + # Test with sorted 1D array + X = csr_matrix(np.arange(10)) + assert _is_sorted_by_data(X) + # Test with unsorted 1D array + X[0, 2] = 5 + assert not _is_sorted_by_data(X) + + # Test when the data is sorted in each sample, but not necessarily + # between samples + X = csr_matrix([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) + assert _is_sorted_by_data(X) + + # Test with duplicates entries in X.indptr + data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] + X = csr_matrix((data, indices, indptr), shape=(3, 3)) + assert _is_sorted_by_data(X) + + +@ignore_warnings(category=EfficiencyWarning) +def test_check_precomputed(): + # Test that _check_precomputed returns a graph sorted by data + X = csr_matrix(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + Xt = _check_precomputed(X) + assert _is_sorted_by_data(Xt) + + # est with a different number of nonzero entries for each sample + mask = np.random.RandomState(42).randint(2, size=(10, 10)) + X = X.toarray() + X[mask == 1] = 0 + X = csr_matrix(X) + assert not _is_sorted_by_data(X) + Xt = _check_precomputed(X) + assert _is_sorted_by_data(Xt) + + +@ignore_warnings(category=EfficiencyWarning) +def test_precomputed_sparse_invalid(): + dist = np.array([[0., 2., 1.], [2., 0., 3.], [1., 3., 0.]]) + dist_csr = csr_matrix(dist) + neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") + neigh.fit(dist_csr) + neigh.kneighbors(None, n_neighbors=1) + neigh.kneighbors(np.array([[0., 0., 0.]]), n_neighbors=2) + + # Ensures enough number of nearest neighbors + dist = np.array([[0., 2., 0.], [2., 0., 3.], [0., 3., 0.]]) + dist_csr = csr_matrix(dist) + neigh.fit(dist_csr) + msg = "2 neighbors per samples are required, but some samples have only 1" + assert_raises_regex(ValueError, msg, neigh.kneighbors, None, n_neighbors=1) + + # Checks error with inconsistent distance matrix + dist = np.array([[5., 2., 1.], [-2., 0., 3.], [1., 3., 0.]]) + dist_csr = csr_matrix(dist) + msg = "Negative values in data passed to precomputed distance matrix." 
+ assert_raises_regex(ValueError, msg, neigh.kneighbors, dist_csr, + n_neighbors=1) + + def test_precomputed_cross_validation(): # Ensure array is split correctly rng = np.random.RandomState(0) @@ -821,6 +932,7 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40, assert np.all(np.abs(y_pred - y_target) < 0.3) +@ignore_warnings(category=EfficiencyWarning) def test_kneighbors_regressor_sparse(n_samples=40, n_features=5, n_test_pts=10, @@ -846,10 +958,7 @@ def test_kneighbors_regressor_sparse(n_samples=40, assert np.mean(knn.predict(X2).round() == y) > 0.95 X2_pre = sparsev(pairwise_distances(X, metric='euclidean')) - if issparse(sparsev(X2_pre)): - assert_raises(ValueError, knn_pre.predict, X2_pre) - else: - assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 + assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 def test_neighbors_iris(): @@ -1318,6 +1427,7 @@ def test_k_and_radius_neighbors_duplicates(): rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode='distance') + rng.sort_indices() assert_array_equal(rng.A, [[0, 1], [1, 0]]) assert_array_equal(rng.indices, [0, 1, 0, 1]) assert_array_equal(rng.data, [0, 1, 1, 0]) @@ -1498,3 +1608,45 @@ def test_radius_neighbors_predict_proba(): proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) assert_array_equal(pred, proba_label) + + +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. + factor = 2 + + k_trans = neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, mode='distance') + k_trans_factor = neighbors.KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode='distance') + + r_trans = neighbors.RadiusNeighborsTransformer( + radius=radius, mode='distance') + r_trans_factor = neighbors.RadiusNeighborsTransformer( + radius=int(radius * factor), mode='distance') + + k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) + + test_list = [(k_trans, k_reg), (k_trans_factor, r_reg), + (r_trans, r_reg), (r_trans_factor, k_reg), ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric='precomputed') + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/sklearn/neighbors/tests/test_neighbors_pipeline.py b/sklearn/neighbors/tests/test_neighbors_pipeline.py new file mode 100644 index 0000000000000..455cca6937dc1 --- /dev/null +++ b/sklearn/neighbors/tests/test_neighbors_pipeline.py @@ -0,0 +1,221 @@ +""" +This is testing the equivalence between some estimators with internal nearest +neighbors computations, and the corresponding pipeline versions with +KNeighborsTransformer or RadiusNeighborsTransformer to precompute the +neighbors. 
+""" + +import numpy as np + +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets.samples_generator import make_blobs +from sklearn.pipeline import make_pipeline +from sklearn.base import clone + +from sklearn.neighbors import KNeighborsTransformer +from sklearn.neighbors import RadiusNeighborsTransformer + +from sklearn.cluster import DBSCAN +from sklearn.cluster import SpectralClustering +from sklearn.neighbors import KNeighborsRegressor +from sklearn.neighbors import RadiusNeighborsRegressor +from sklearn.neighbors import LocalOutlierFactor +from sklearn.manifold import SpectralEmbedding +from sklearn.manifold import Isomap +from sklearn.manifold import TSNE + + +def test_spectral_clustering(): + # Test chaining KNeighborsTransformer and SpectralClustering + n_neighbors = 5 + X, _ = make_blobs(random_state=0) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'), + SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed', + random_state=42)) + est_compact = SpectralClustering( + n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42) + labels_compact = est_compact.fit_predict(X) + labels_chain = est_chain.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_spectral_embedding(): + # Test chaining KNeighborsTransformer and SpectralEmbedding + n_neighbors = 5 + + n_samples = 1000 + centers = np.array([ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ]) + S, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'), + SpectralEmbedding(n_neighbors=n_neighbors, affinity='precomputed', + random_state=42)) + est_compact = SpectralEmbedding( + n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42) + St_compact = est_compact.fit_transform(S) + St_chain = est_chain.fit_transform(S) + assert_array_almost_equal(St_chain, St_compact) + + +def test_dbscan(): + # Test chaining RadiusNeighborsTransformer and DBSCAN + radius = 0.3 + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + + # compare the chained version and the compact version + est_chain = make_pipeline( + RadiusNeighborsTransformer(radius=radius, mode='distance'), + DBSCAN(metric='precomputed', eps=radius)) + est_compact = DBSCAN(eps=radius) + + labels_chain = est_chain.fit_predict(X) + labels_compact = est_compact.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_isomap(): + # Test chaining KNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = 'auto' + n_neighbors = 10 + + X, _ = make_blobs(random_state=0) + X2, _ = make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, algorithm=algorithm, + mode='distance'), + Isomap(n_neighbors=n_neighbors, metric='precomputed')) + est_compact = Isomap(n_neighbors=n_neighbors, + neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = 
est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_tsne(): + # Test chaining KNeighborsTransformer and TSNE + n_iter = 250 + perplexity = 5 + n_neighbors = int(3. * perplexity + 1) + + rng = np.random.RandomState(0) + X = rng.randn(20, 2) + + for metric in ['minkowski', 'sqeuclidean']: + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric), + TSNE(metric='precomputed', perplexity=perplexity, + method="barnes_hut", random_state=42, n_iter=n_iter)) + est_compact = TSNE(metric=metric, perplexity=perplexity, n_iter=n_iter, + method="barnes_hut", random_state=42) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_lof_novelty_false(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'), + LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors, + novelty=False, contamination="auto")) + est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False, + contamination="auto") + + pred_chain = est_chain.fit_predict(X) + pred_compact = est_compact.fit_predict(X) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_lof_novelty_true(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X1 = rng.randn(40, 2) + X2 = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'), + LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors, + novelty=True, contamination="auto")) + est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True, + contamination="auto") + + pred_chain = est_chain.fit(X1).predict(X2) + pred_compact = est_compact.fit(X1).predict(X2) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_kneighbors_regressor(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. 
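+    # (A k-neighbors graph fed to a radius-neighbors estimator must already
+    # contain every neighbor within ``radius``, and a radius-neighbors graph
+    # fed to a k-neighbors estimator must contain at least ``n_neighbors``
+    # entries per row; a factor of 2 is assumed to be large enough for this
+    # random data.)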
+ factor = 2 + + k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance') + k_trans_factor = KNeighborsTransformer(n_neighbors=int( + n_neighbors * factor), mode='distance') + + r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance') + r_trans_factor = RadiusNeighborsTransformer(radius=int( + radius * factor), mode='distance') + + k_reg = KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric='precomputed') + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index 806b6f7736472..4bd02ed0dbfd0 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -1,5 +1,4 @@ """Unsupervised nearest neighbors learner""" - from .base import NeighborsBase from .base import KNeighborsMixin from .base import RadiusNeighborsMixin @@ -40,30 +39,13 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, nature of the problem. metric : string or callable, default 'minkowski' - metric to use for distance computation. Any metric from scikit-learn - or scipy.spatial.distance can be used. - - If metric is a callable function, it is called on each - pair of instances (rows) and the resulting value recorded. The callable - should take two arrays as input and return one value indicating the - distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - Distance matrices are not supported. - - Valid values for metric are: - - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] - - - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', - 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', - 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', - 'yule'] - - See the documentation for scipy.spatial.distance for details on these - metrics. + the distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. See the documentation of the DistanceMetric class for a + list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. 
p : integer, optional (default = 2) Parameter for the Minkowski metric from diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ce862255f02eb..9114a8e5f7631 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -19,21 +19,18 @@ from sklearn.utils.testing import all_estimators from sklearn.utils.testing import ignore_warnings -from sklearn.exceptions import ConvergenceWarning, SkipTestWarning +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.estimator_checks import check_estimator import sklearn -from sklearn.base import RegressorMixin from sklearn.cluster.bicluster import BiclusterMixin -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.linear_model.base import LinearClassifierMixin from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.utils import IS_PYPY from sklearn.utils.testing import SkipTest from sklearn.utils.estimator_checks import ( - _safe_tags, _construct_instance, set_checking_parameters, _set_check_estimator_ids, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5a10bf6db88fe..c3a498c6f42f8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -681,10 +681,11 @@ def check_sample_weights_pandas_series(name, estimator_orig): try: import pandas as pd X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], - [2, 1], [2, 2], [2, 3], [2, 4]]) + [2, 1], [2, 2], [2, 3], [2, 4], + [3, 1], [3, 2], [3, 3], [3, 4]]) X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig)) - y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2]) - weights = pd.Series([1] * 8) + y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) + weights = pd.Series([1] * 12) if _safe_tags(estimator, "multioutput_only"): y = pd.DataFrame(y) try: @@ -705,14 +706,15 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), + n_samples = 30 + X = pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), estimator_orig) if _safe_tags(estimator, 'binary_only'): - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 y = _enforce_estimator_tags_y(estimator, y) - sample_weight = [3] * 10 + sample_weight = [3] * n_samples # Test that estimators don't raise any exception estimator.fit(X, y, sample_weight=sample_weight) @@ -940,6 +942,7 @@ def _apply_on_subsets(func, X): n_features = X.shape[1] result_by_batch = [func(batch.reshape(1, n_features)) for batch in X] + # func can output tuple (e.g. 
score_samples) if type(result_full) == tuple: result_full = result_full[0] @@ -948,6 +951,7 @@ def _apply_on_subsets(func, X): if sparse.issparse(result_full): result_full = result_full.A result_by_batch = [x.A for x in result_by_batch] + return np.ravel(result_full), np.ravel(result_by_batch) @@ -1234,12 +1238,13 @@ def check_fit_score_takes_y(name, estimator_orig): # check that all estimators accept an optional y # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) - X = rnd.uniform(size=(10, 3)) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) X = pairwise_estimator_convert_X(X, estimator_orig) if _safe_tags(estimator_orig, 'binary_only'): - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) @@ -1801,7 +1806,7 @@ def check_estimators_fit_returns_self(name, estimator_orig, n_centers = 2 else: n_centers = 3 - X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers) + X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers) # some want non-negative input X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig) @@ -1839,11 +1844,14 @@ def check_supervised_y_2d(name, estimator_orig): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) + n_samples = 30 + X = pairwise_estimator_convert_X( + rnd.uniform(size=(n_samples, 3)), estimator_orig + ) if tags['binary_only']: - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 y = _enforce_estimator_tags_y(estimator_orig, y) estimator = clone(estimator_orig) set_random_state(estimator) @@ -2177,7 +2185,7 @@ def check_estimators_overwrite_params(name, estimator_orig): n_centers = 2 else: n_centers = 3 - X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers) + X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers) # some want non-negative input X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) @@ -2269,9 +2277,10 @@ def check_sparsify_coefficients(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_classifier_data_not_an_array(name, estimator_orig): - X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) + X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], + [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]) X = pairwise_estimator_convert_X(X, estimator_orig) - y = [1, 1, 1, 2, 2, 2] + y = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2] y = _enforce_estimator_tags_y(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) From fe052e6e6b8b3967de26cca3c5333211633013b6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 19 Sep 2019 09:13:00 -0400 Subject: [PATCH 34/53] set n_features_in_ for stacking estimators --- sklearn/ensemble/_stacking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index c2a09c54b4622..dbf51fdcfbcc7 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -182,6 +182,7 @@ def fit(self, X, y, sample_weight=None): delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop' ) + self.n_features_in_ = self.estimators_[0].n_features_in_ self.named_estimators_ = Bunch() 
est_fitted_idx = 0 From 9a205dd8faf09a953ed968c6774fb57bcf779a5e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 25 Sep 2019 14:04:41 -0400 Subject: [PATCH 35/53] dont hardcode attribute in init for sparsecoder --- sklearn/decomposition/dict_learning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 9ed705d680059..501b259422533 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1024,7 +1024,6 @@ def __init__(self, dictionary, transform_algorithm='omp', transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter) self.components_ = dictionary - self.n_features_in_ = dictionary.shape[1] def fit(self, X, y=None): """Do nothing and return the estimator unchanged @@ -1045,6 +1044,10 @@ def fit(self, X, y=None): """ return self + @property + def n_features_in_(self): + return self.components_.shape[1] + class DictionaryLearning(SparseCodingMixin, BaseEstimator): """Dictionary learning From 0e81156e556150c569cfe54637b9cb8c80b18bfb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Jan 2020 11:18:03 -0500 Subject: [PATCH 36/53] more merge --- sklearn/feature_selection/_rfe.py | 8 ++------ sklearn/impute/_base.py | 4 ---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 1417fd04f9eac..d91665e252686 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -492,13 +492,9 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ -<<<<<<< HEAD:sklearn/feature_selection/rfe.py X, y = self._validate_X_y(X, y, accept_sparse="csr", - ensure_min_features=2) -======= - X, y = check_X_y(X, y, "csr", ensure_min_features=2, - force_all_finite=False) ->>>>>>> 19479d7af1711f1bb403eca1c02eebf212999091:sklearn/feature_selection/_rfe.py + ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index d202af7f1e9ae..437d0e8ab6d0b 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -411,12 +411,8 @@ def transform(self, X): """ check_is_fitted(self) -<<<<<<< HEAD X = self._validate_input(X, in_fit=False) -======= - X = self._validate_input(X) X_indicator = super()._transform_indicator(X) ->>>>>>> 19479d7af1711f1bb403eca1c02eebf212999091 statistics = self.statistics_ From d4d92bc1c230b4d0344f88847a32277de98e66bc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Jan 2020 15:26:22 -0500 Subject: [PATCH 37/53] fixed some bugs --- sklearn/impute/_base.py | 1 - sklearn/naive_bayes.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 437d0e8ab6d0b..038780ef118a6 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -272,7 +272,6 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) X = self._validate_input(X, in_fit=True) super()._fit_indicator(X) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 38117b4182dfa..a73d786c2eb7c 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -473,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return 
check_X_y(X, y, accept_sparse='csr') + return self._validate_X_y(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -607,7 +607,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._check_X_y(X, y) _, n_features = X.shape self.n_features_ = n_features @@ -1154,8 +1154,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = check_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_X_y(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y From 40cd141ad8c43ebc9bbea6787be7415dc415c3b9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 10:04:40 -0500 Subject: [PATCH 38/53] fixed more bugs --- sklearn/base.py | 6 ++++-- sklearn/linear_model/_ridge.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 829b5b5cfac95..19f74c4b45477 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -355,12 +355,14 @@ def _validate_n_features(self, X, check_n_features): def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) - self._validate_n_features(X, check_n_features) + if check_array_params.get('ensure_2d', True): + self._validate_n_features(X, check_n_features) return X def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): X, y = check_X_y(X, y, **check_X_y_params) - self._validate_n_features(X, check_n_features) + if check_X_y_params.get('ensure_2d', True): + self._validate_n_features(X, check_n_features) return X, y class ClassifierMixin: diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index b1ee3d94b929a..577e171aa3cf3 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -918,8 +918,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - self._validate_X_y(X, y, accept_sparse=_accept_sparse, - multi_output=True, y_numeric=False) + X, y = self._validate_X_y(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1879,8 +1879,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) From b3251fef069b70a9216a6c382e38d3ffd15a394e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 11:07:17 -0500 Subject: [PATCH 39/53] fixed warnings --- sklearn/tests/test_dummy.py | 1 + sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index f814cc47474b2..38abb0b158fd3 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -756,6 +756,7 @@ def test_dtype_of_classifier_probas(strategy): assert probas.dtype == np.float64 +@pytest.mark.filterwarnings("ignore:The default value of strategy.*") # 0.24 
@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier)) def test_n_features_in_(Dummy): X = [[1, 2]] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 095328c82f219..d5775853205ab 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2856,7 +2856,7 @@ def check_n_features_in(name, estimator_orig): n_samples = 100 X = rng.normal(loc=100, size=(n_samples, 2)) - X = pairwise_estimator_convert_X(X, estimator) + X = _pairwise_estimator_convert_X(X, estimator) if is_regressor(estimator_orig): y = rng.normal(size=n_samples) else: From 7e73a244d1d8ca49f70c2ee012ee195c28f3a39c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 13:10:17 -0500 Subject: [PATCH 40/53] use _validate_data() method --- sklearn/base.py | 40 ++++++----- sklearn/calibration.py | 4 +- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_agglomerative.py | 6 +- sklearn/cluster/_bicluster.py | 2 +- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_dbscan.py | 2 +- sklearn/cluster/_kmeans.py | 10 +-- sklearn/cluster/_mean_shift.py | 2 +- sklearn/cluster/_optics.py | 2 +- sklearn/cluster/_spectral.py | 4 +- sklearn/compose/_column_transformer.py | 4 +- sklearn/covariance/_empirical_covariance.py | 2 +- sklearn/covariance/_graph_lasso.py | 6 +- sklearn/covariance/_robust_covariance.py | 2 +- sklearn/covariance/_shrunk_covariance.py | 6 +- sklearn/cross_decomposition/_pls.py | 8 +-- sklearn/decomposition/_dict_learning.py | 4 +- sklearn/decomposition/_factor_analysis.py | 2 +- sklearn/decomposition/_fastica.py | 4 +- sklearn/decomposition/_incremental_pca.py | 4 +- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/decomposition/_lda.py | 24 +++---- sklearn/decomposition/_nmf.py | 2 +- sklearn/decomposition/_pca.py | 4 +- sklearn/decomposition/_sparse_pca.py | 4 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/discriminant_analysis.py | 6 +- sklearn/ensemble/_bagging.py | 2 +- sklearn/ensemble/_forest.py | 2 +- sklearn/ensemble/_gb.py | 4 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_weight_boosting.py | 34 ++++----- sklearn/feature_selection/_rfe.py | 8 +-- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 5 +- sklearn/gaussian_process/_gpc.py | 8 +-- sklearn/gaussian_process/_gpr.py | 8 +-- sklearn/impute/_base.py | 16 ++--- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 6 +- sklearn/kernel_approximation.py | 8 +-- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 4 +- sklearn/linear_model/_bayes.py | 6 +- sklearn/linear_model/_coordinate_descent.py | 24 +++---- sklearn/linear_model/_huber.py | 2 +- sklearn/linear_model/_least_angle.py | 6 +- sklearn/linear_model/_logistic.py | 12 ++-- sklearn/linear_model/_omp.py | 6 +- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_ridge.py | 22 +++--- sklearn/linear_model/_stochastic_gradient.py | 12 ++-- sklearn/linear_model/_theil_sen.py | 2 +- sklearn/manifold/_locally_linear.py | 2 +- sklearn/manifold/_mds.py | 2 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 10 +-- sklearn/mixture/_base.py | 2 +- sklearn/multiclass.py | 4 +- sklearn/multioutput.py | 4 +- sklearn/naive_bayes.py | 8 +-- sklearn/neighbors/_base.py | 2 +- sklearn/neighbors/_kde.py | 2 +- sklearn/neighbors/_nca.py | 2 +- sklearn/neighbors/_nearest_centroid.py | 4 +- .../neural_network/_multilayer_perceptron.py | 8 +-- sklearn/neural_network/_rbm.py | 2 +- sklearn/preprocessing/_data.py | 70 
+++++++++---------- sklearn/preprocessing/_discretization.py | 2 +- .../preprocessing/_function_transformer.py | 2 +- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/_label_propagation.py | 2 +- sklearn/svm/_base.py | 6 +- sklearn/svm/_classes.py | 12 ++-- sklearn/tree/_classes.py | 2 +- sklearn/utils/tests/test_estimator_checks.py | 32 ++++----- 77 files changed, 281 insertions(+), 284 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 19f74c4b45477..c3bb97c5d7a61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -337,33 +337,37 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags - def _validate_n_features(self, X, check_n_features): - if check_n_features: + def _validate_n_features(self, X, reset): + n_features = X.shape[1] + + if reset: + self.n_features_in_ = n_features + else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "check_n_features is True but there is no n_features_in_ " + "reset parameter is False but there is no n_features_in_ " "attribute." ) - if X.shape[1] != self.n_features_in_: + if n_features != self.n_features_in_: raise ValueError( 'X has {} features, but this {} is expecting {} features ' - 'as input.'.format(X.shape[1], self.__class__.__name__, + 'as input.'.format(n_features, self.__class__.__name__, self.n_features_in_) ) + + def _validate_data(self, X, y=None, reset=True, **check_params): + if y is None: + X = check_array(X, **check_params) + out = X else: - self.n_features_in_ = X.shape[1] - - def _validate_X(self, X, check_n_features=False, **check_array_params): - X = check_array(X, **check_array_params) - if check_array_params.get('ensure_2d', True): - self._validate_n_features(X, check_n_features) - return X - - def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - X, y = check_X_y(X, y, **check_X_y_params) - if check_X_y_params.get('ensure_2d', True): - self._validate_n_features(X, check_n_features) - return X, y + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get('ensure_2d', True): + self._validate_n_features(X, reset=reset) + + return out + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index e06f4217122c9..6bb68122aaa57 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -131,8 +131,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 4360bef52c8db..aa06fb30b1669 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -374,7 +374,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = self._validate_X(X, accept_sparse=accept_sparse) + X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 9bb31f85a36c4..b29b1078333cc 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -805,7 +805,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, ensure_min_samples=2, estimator=self) + X = self._validate_data(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: @@ -1051,8 +1051,8 @@ def fit(self, X, y=None, **params): ------- self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) n_features_in_ = self.n_features_in_ AgglomerativeClustering.fit(self, X.T, **params) # Need to restore n_features_in_ attribute that was overridden in diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c7d04707d1571..37eca3c7ec4c2 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -115,7 +115,7 @@ def fit(self, X, y=None): y : Ignored """ - X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 153f793277e9c..d1cf13662d26a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -459,7 +459,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = self._validate_X(X, accept_sparse='csr', copy=self.copy) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index bddf63a889efd..c258ce3d5f406 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -307,7 +307,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') if not self.eps > 0.0: raise ValueError("eps must be positive.") diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 57b67c162a018..6d8decfe1fd05 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -854,9 +854,9 @@ def fit(self, X, y=None, sample_weight=None): # avoid forcing order when copy_x=False order = "C" if self.copy_x else None - X = self._validate_X(X, accept_sparse='csr', - dtype=[np.float64, np.float32], - order=order, copy=self.copy_x) + X = self._validate_data(X, 
accept_sparse='csr', + dtype=[np.float64, np.float32], + order=order, copy=self.copy_x) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( @@ -1505,8 +1505,8 @@ def fit(self, X, y=None, sample_weight=None): self """ random_state = check_random_state(self.random_state) - X = self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 96f83ed9ef086..3d0dc6304dd5a 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -367,7 +367,7 @@ def fit(self, X, y=None): y : Ignored """ - X = self._validate_X(X) + X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index e6d1f09b77ed9..d28941c7d8401 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -244,7 +244,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = self._validate_X(X, dtype=np.float) + X = self._validate_data(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 1d54a84b93d64..e76822d6b3732 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -474,8 +474,8 @@ def fit(self, X, y=None): self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) allow_squared = self.affinity in ["precomputed", "precomputed_nearest_neighbors"] if X.shape[0] == X.shape[1] and not allow_squared: diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 42e65c61623bb..8e87cc937ce7a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X, check_n_features=False) + self._validate_n_features(X) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -586,7 +586,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') - # TODO: also call _validate_n_features(check_n_features=True) in 0.24 + # TODO: also call _validate_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 02c48fc5824ae..9da2be98335de 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -191,7 +191,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_graph_lasso.py 
b/sklearn/covariance/_graph_lasso.py index 91d29e0bc43b7..26e5408048c65 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -377,8 +377,8 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = self._validate_X(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -644,7 +644,7 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = self._validate_X(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 46711d3eb2afc..2081874b03110 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -636,7 +636,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 9240b1d81716e..d86474b40ec89 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -143,7 +143,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -419,7 +419,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = self._validate_X(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -572,7 +572,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index af2a5218002d4..af81ece6baf58 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -277,8 +277,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = self._validate_X(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -886,8 +886,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = self._validate_X(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/_dict_learning.py 
b/sklearn/decomposition/_dict_learning.py index 3582dd23ac8dc..49b78a0916e7a 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1221,7 +1221,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1428,7 +1428,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 15ce4dc31af9b..7147fd452559c 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -168,7 +168,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, copy=self.copy, dtype=np.float64) + X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 1558827b3db06..ef9f376bba66d 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -427,8 +427,8 @@ def _fit(self, X, compute_sources=False): # This validates twice but there is not clean way to avoid validation # in fastica(). Please see issue 14897. - self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T + self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index e62dc0189d55f..2a0d19d373dbb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -194,8 +194,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 3a2ea007e0307..b1f83c8e0ff81 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -275,7 +275,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = self._validate_X(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 08fe12154e8bb..201b393374a08 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -467,7 +467,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, check_n_features, whom): + def _check_non_neg_array(self, X, reset_n_features, whom): """check X format check X format and make sure no negative value in X. 
@@ -477,8 +477,8 @@ def _check_non_neg_array(self, X, check_n_features, whom): X : array-like or sparse matrix """ - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csr') + X = self._validate_data(X, reset=reset_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -498,13 +498,13 @@ def partial_fit(self, X, y=None): """ self._check_params() first_time = not hasattr(self, 'components_') + # deactivating check for now (specific tests about error message would # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. - # check_n_features = not in_fit - check_n_features = False - X = self._check_non_neg_array(X, check_n_features, + # TODO: uncomment when addressing reset in predict/transform/etc. + # reset = first_time + reset_n_features = True + X = self._check_non_neg_array(X, reset_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size @@ -548,7 +548,7 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, check_n_features=False, + X = self._check_non_neg_array(X, reset_n_features=True, whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter @@ -619,7 +619,7 @@ def _unnormalized_transform(self, X): # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array( - X, check_n_features=False, + X, reset_n_features=True, whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: @@ -744,7 +744,7 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, check_n_features=False, + X = self._check_non_neg_array(X, reset_n_features=True, whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) @@ -775,7 +775,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, check_is_fitted(self) X = self._check_non_neg_array( - X, check_n_features=False, + X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index c4edfcb15040a..1ab996b8c8059 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1268,7 +1268,7 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data. """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=float) + X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=float) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 0fb7f41a58a67..178d83f4aeb0d 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -386,8 +386,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. 
See ' 'TruncatedSVD for a possible alternative.') - X = self._validate_X(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_2d=True, copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index f5395012f9e08..4cdbe21bae0c7 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -165,7 +165,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -363,7 +363,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 9b6c71e05d29b..940eab56feea8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -157,8 +157,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = self._validate_X(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 14610087c1d37..7e30233b0e131 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -423,8 +423,8 @@ def fit(self, X, y): y : array, shape (n_samples,) Target values. 
""" - X, y = self._validate_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_data(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -645,7 +645,7 @@ def fit(self, X, y): y : array, shape = [n_samples] Target values (integers) """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8dee9b8af33db..000e5a054627f 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -278,7 +278,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, multi_output=True ) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index da744ad4cea70..9d1cfee7e7266 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -293,7 +293,7 @@ def fit(self, X, y, sample_weight=None): self : object """ # Validate or convert input data - X = self._validate_X(X, accept_sparse="csc", dtype=DTYPE) + X = self._validate_data(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c9304ad398396..f6cc644c57d7b 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -411,8 +411,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=DTYPE) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d4665a1495881..5ab33e52df73d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -101,8 +101,8 @@ def fit(self, X, y): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
- X, y = self._validate_X_y(X, y, dtype=[X_DTYPE], - force_all_finite=False) + X, y = self._validate_data(X, y, dtype=[X_DTYPE], + force_all_finite=False) y = self._encode_y(y) rng = check_random_state(self.random_state) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 9e2aaf8fb7925..4716309280699 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -71,7 +71,7 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X): + def _check_X(self, X): return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None) @@ -100,12 +100,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_X_y(X, y, - accept_sparse=['csr', 'csc'], - ensure_2d=True, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) + X, y = self._validate_data(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) sample_weight = _check_sample_weight(sample_weight, X, np.float64) sample_weight /= sample_weight.sum() @@ -216,7 +216,7 @@ def staged_score(self, X, y, sample_weight=None): ------ z : float """ - X = self._validate_data(X) + X = self._check_X(X) for y_pred in self.staged_predict(X): if is_classifier(self): @@ -611,7 +611,7 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) pred = self.decision_function(X) @@ -641,7 +641,7 @@ def staged_predict(self, X): y : generator of ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_ @@ -675,7 +675,7 @@ def decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -718,7 +718,7 @@ def staged_decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -787,7 +787,7 @@ def predict_proba(self, X): outputs is the same of that of the :term:`classes_` attribute. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -821,7 +821,7 @@ def staged_predict_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -847,7 +847,7 @@ def predict_log_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) return np.log(self.predict_proba(X)) @@ -1115,7 +1115,7 @@ def predict(self, X): The predicted regression values. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) return self._get_median_predict(X, len(self.estimators_)) @@ -1140,7 +1140,7 @@ def staged_predict(self, X): The predicted regression values. 
""" check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) for i, _ in enumerate(self.estimators_, 1): yield self._get_median_predict(X, limit=i) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index d91665e252686..6e6800ee2a6a7 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -155,7 +155,7 @@ def _fit(self, X, y, step_score=None): # self.scores_ will not be calculated when calling _fit through fit tags = self._get_tags() - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse="csc", ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True) @@ -492,9 +492,9 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ - X, y = self._validate_X_y(X, y, accept_sparse="csr", - ensure_min_features=2, - force_all_finite=False) + X, y = self._validate_data(X, y, accept_sparse="csr", + ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 5acf44d8e0407..221e46f2a505e 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -338,8 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index a160ef634be7c..6438e6b80dc0a 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -65,8 +65,9 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=np.float64, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=np.float64, + force_all_finite='allow-nan') if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4c0c63923e5fc..d34aa0962472b 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -625,11 +625,11 @@ def fit(self, X, y): self : returns an instance of self. 
""" if self.kernel is None or self.kernel.requires_vector_input: - X, y = self._validate_X_y(X, y, multi_output=False, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=True, dtype="numeric") else: - X, y = self._validate_X_y(X, y, multi_output=False, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 522f233213015..cc3fbb2f08d56 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -187,11 +187,11 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) if self.kernel_.requires_vector_input: - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=True, dtype="numeric") else: - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=False, dtype=None) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 038780ef118a6..bc98778d5c5d8 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -235,11 +235,10 @@ def _validate_input(self, X, in_fit): force_all_finite = "allow-nan" try: - check_n_features = not in_fit - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, - copy=self.copy) + X = self._validate_data(X, reset=in_fit, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): new_ve = ValueError("Cannot use {} strategy with non-numeric " @@ -595,10 +594,9 @@ def _validate_input(self, X, in_fit): force_all_finite = True else: force_all_finite = "allow-nan" - check_n_features = not in_fit - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + X = self._validate_data(X, reset=in_fit, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 6463327836253..2e2fdc3f5e2de 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -503,8 +503,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = self._validate_X(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index b263416dd40aa..ea2fe35899304 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -178,9 +178,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. 
Got {}".format(self.n_neighbors)) - X = self._validate_X(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, - copy=self.copy) + X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) super()._fit_indicator(X) _check_weights(self.weights) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index ee572d8842e21..b29b56bbc38af 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -91,7 +91,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -197,7 +197,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = self._validate_X(X) + X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -324,7 +324,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') if self.sample_interval is None: # See reference, figure 2 c) if self.sample_steps == 1: @@ -542,7 +542,7 @@ def fit(self, X, y=None): X : array-like of shape (n_samples, n_features) Training data. """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index f647923867eab..c504b2cb31cb3 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -148,8 +148,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. """ # Convert data - X, y = self._validate_X_y(X, y, accept_sparse=("csr", "csc"), - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index be41ec6a4bb3a..8dd5d5bdbf983 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -488,8 +488,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 7b3cd5c3f3751..c67fc54f43157 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -190,7 +190,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -526,8 +526,8 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b2d2434fe68a1..43c027be1af69 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -695,11 +695,11 @@ def fit(self, X, y, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = self._validate_X_y(X, y, accept_sparse='csc', - order='F', - dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, - y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1111,8 +1111,8 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = self._validate_X(X, accept_sparse='csc', - dtype=[np.float64, np.float32], copy=False) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1123,9 +1123,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = self._validate_X(X, accept_sparse='csc', - dtype=[np.float64, np.float32], order='F', - copy=copy_X) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1743,8 +1743,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - X = self._validate_X(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 7ab9b14168af9..1d3a3fcc73421 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 1858e1dbd6675..9f0f62471376a 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -944,7 +944,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1367,7 +1367,7 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1748,7 +1748,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e215c28d2a615..9e84e56ee0284 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1339,9 +1339,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=_dtype, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) @@ -1813,9 +1813,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 7c16dcd243fdc..54b751423c933 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0a746d9e49e07..cd5e3db49842d 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -246,7 +246,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. 
""" - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 577e171aa3cf3..6c93d413752d1 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -536,10 +536,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -918,8 +918,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_X_y(X, y, accept_sparse=_accept_sparse, - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1447,9 +1447,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1879,8 +1879,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 69fa4c2f01052..d50a0997fbb56 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -521,9 +521,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1096,9 +1096,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = self._validate_X_y(X, y, accept_sparse="csr", copy=False, - order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git 
a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index bbe1b90e37af0..a29cc26cdc0a3 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -357,7 +357,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7b99fde631c97..6b371bd5821ac 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -656,7 +656,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = self._validate_X(X, dtype=float) + X = self._validate_data(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 0ddf8dda7f31c..674c8e1527602 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -414,7 +414,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = self._validate_X(X) + X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 1052aeec9c955..c40ea7e1689b1 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -535,8 +535,8 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = self._validate_X(X, accept_sparse='csr', ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index be0e2df599cbf..92cb154292327 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -662,12 +662,12 @@ def _fit(self, X, skip_num_points=0): if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.method == 'barnes_hut': - X = self._validate_X(X, accept_sparse=['csr'], - ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 56f3649f2b11c..e96978f9018f2 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -217,7 +217,7 @@ def fit_predict(self, X, y=None): Component labels. 
""" X = _check_X(X, self.n_components, ensure_min_samples=2) - self._validate_n_features(X, check_n_features=False) + self._validate_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 041f5a90c48c6..9eeb4248f83fd 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -535,7 +535,7 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -776,7 +776,7 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 33608db22213f..1c2eecfb76d5e 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -152,7 +152,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): raise ValueError("The base estimator should implement" " a fit method") - X, y = self._validate_X_y(X, y, multi_output=True, accept_sparse=True) + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -416,7 +416,7 @@ def fit(self, X, Y): ------- self : object """ - X, Y = self._validate_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index a73d786c2eb7c..8a2fefe7c08a0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -203,7 +203,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -473,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return self._validate_X_y(X, y, accept_sparse='csr') + return self._validate_data(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -1154,8 +1154,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = self._validate_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c5e6c08983348..76afc6e5dcb68 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -397,7 +397,7 @@ def _fit(self, X): X = _check_precomputed(X) self.n_features_in_ = X.shape[1] else: - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index ec0b238b1a4a7..6a48ee6e60d27 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -152,7 +152,7 @@ def fit(self, X, y=None, sample_weight=None): Returns instance of object. 
""" algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = self._validate_X(X, order='C', dtype=DTYPE) + X = self._validate_data(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DTYPE) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 9a0d25d332f81..278e36ffd13db 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -298,7 +298,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. - X, y = self._validate_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index c5b41db2c895f..48712c1fcfb44 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = self._validate_X_y(X, y, accept_sparse=['csc']) + X, y = self._validate_data(X, y, accept_sparse=['csc']) else: - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index cd6b4b44a6b82..038b9c31678a7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -942,8 +942,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1350,8 +1350,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index b6c27de17000e..711dee806c138 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -336,7 +336,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 5aecf5f879063..b1ebeadba3530 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -369,10 +369,9 @@ def partial_fit(self, X, y=None): "Consider using MaxAbsScaler instead.") first_pass = not hasattr(self, 'n_samples_seen_') - check_n_features = not first_pass - X = self._validate_X(X, check_n_features=check_n_features, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + X = self._validate_data(X, reset=first_pass, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) @@ -695,9 +694,9 @@ def partial_fit(self, X, y=None): self : object Transformer instance. """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -790,10 +789,10 @@ def transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy - X = self._validate_X(X, check_n_features=True, - accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, reset=False, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: @@ -967,10 +966,10 @@ def partial_fit(self, X, y=None): Transformer instance. 
""" first_pass = not hasattr(self, 'n_samples_seen_') - check_n_features = not first_pass - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse=('csr', 'csc'), estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, reset=first_pass, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -1197,8 +1196,9 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = self._validate_X(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1506,7 +1506,8 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = self._validate_X(X, accept_sparse=True).shape + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1812,7 +1813,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1946,7 +1947,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -2026,7 +2027,7 @@ def fit(self, K, y=None): self : returns an instance of self. """ - K = self._validate_X(K, dtype=FLOAT_DTYPES) + K = self._validate_data(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2444,14 +2445,14 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, """Check inputs before fit and transform""" # deactivating check for now (specific tests about error message would # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. - # check_n_features = not in_fit - check_n_features = False - - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + # TODO: uncomment when addressing reset in predict/transform/etc. + # reset = in_fit + reset = True + + X = self._validate_data(X, reset=reset, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -3004,15 +3005,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. """ - # deactivating check for now (specific tests about error message would - # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. 
- # check_n_features = not in_fit - check_n_features = False - X = self._validate_X(X, check_n_features=check_n_features, - ensure_2d=True, dtype=FLOAT_DTYPES, - copy=self.copy, force_all_finite='allow-nan') + X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a4dc703e9f3cb..67641601e06f5 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -137,7 +137,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, dtype='numeric') + X = self._validate_data(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 211bad665778c..85ce3a1f845c1 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -92,7 +92,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return self._validate_X(X, accept_sparse=self.accept_sparse) + return self._validate_data(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 508ad5f5a76fa..5ad26c2dd4c90 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -356,7 +356,7 @@ def fit(self, X, y=None): self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc']) + X = self._validate_data(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index e1bb3b3436896..a84a9950aa3ac 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -223,7 +223,7 @@ def fit(self, X, y): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index fb0ca341481b9..061923f68f6cd 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -143,9 +143,9 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = self._validate_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) sample_weight = np.asarray([] diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index c795f4cd6d099..b7e1881e9331f 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -218,9 +218,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -398,9 +398,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be 
positive; got (C=%r)" % self.C) - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) penalty = 'l2' # SVR only accepts l2 penalty self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( X, y, self.C, self.fit_intercept, self.intercept_scaling, diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 38ea318e3796b..6ca52e0901742 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -146,7 +146,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, raise ValueError("ccp_alpha must be greater than or equal to 0") if check_input: - X = self._validate_X(X, dtype=DTYPE, accept_sparse="csc") + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 06ed9b2b18f63..70c51c0069f7b 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -58,7 +58,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self def predict(self, X): @@ -73,7 +73,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -83,14 +83,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -107,7 +107,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -124,7 +124,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -143,19 +143,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -167,7 +167,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -180,7 +180,7 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, @@ -221,7 +221,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def 
fit(self, X, y=None): - X = self._validate_X(X) + X = self._validate_data(X) return self def transform(self, X): @@ -232,7 +232,7 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, @@ -249,7 +249,7 @@ def predict(self, X): class LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc", "coo"), accept_large_sparse=True, @@ -270,7 +270,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = self._validate_X(X).shape + self.X_shape_ = self._validate_data(X).shape return self def fit_transform(self, X, y=None): @@ -301,7 +301,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = self._validate_X_y(X, y, multi_output=True) + X, y = self._validate_data(X, y, multi_output=True) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y) From 2f448aa6b1229fc70bf8402dc9e35e9d30f58b93 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 13:49:55 -0500 Subject: [PATCH 41/53] fixed columntransformer issue --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 8e87cc937ce7a..ffddd316cbf7f 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X) + self._validate_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) From a6a344d42c5e28647b739f19213d86cbc1cd1b50 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 15:08:13 -0500 Subject: [PATCH 42/53] comments --- sklearn/base.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index c3bb97c5d7a61..51f0181979c63 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -338,6 +338,18 @@ def _get_tags(self): return collected_tags def _validate_n_features(self, X, reset): + """Set the n_features_in_ attribute, or check against it. + + Parameters + ---------- + + X : ndarray or sparse matrix + The input samples + reset : bool + If True, the n_features_in_ attribute is set to X.shape[1]. Else, + the attribute must already exist and the function checks that it is + equal to X.shape[1]. + """ n_features = X.shape[1] if reset: @@ -356,6 +368,28 @@ def _validate_n_features(self, X, reset): ) def _validate_data(self, X, y=None, reset=True, **check_params): + """Validate input data and set or check the n_features_in_ attribute. + + Parameters + ---------- + + X : array-like + The input samples. + y : array-like or None, default=None + The targets. If None, check_array is called on X and check_X_y is + called otherwise. + reset : bool, default=True + Whether to reset the n_features_in_ attribute. See + _validate_n_features(). + **check_params : kwargs + Parameters passed to check_array() or check_X_y(). + + Returns + ------- + out : {ndarray, sparse matrix} or tuple of these + The validated input. A tuple is returned if y is not None. 
+ """ + if y is None: X = check_array(X, **check_params) out = X From 9e0c3d7d4fd3d62c4a7357da4517c59f63bdfa60 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 15:15:35 -0500 Subject: [PATCH 43/53] minor renaming --- sklearn/base.py | 6 +++--- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/mixture/_base.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 51f0181979c63..2918fc75fa745 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -337,7 +337,7 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags - def _validate_n_features(self, X, reset): + def _check_n_features(self, X, reset): """Set the n_features_in_ attribute, or check against it. Parameters @@ -380,7 +380,7 @@ def _validate_data(self, X, y=None, reset=True, **check_params): called otherwise. reset : bool, default=True Whether to reset the n_features_in_ attribute. See - _validate_n_features(). + _check_n_features(). **check_params : kwargs Parameters passed to check_array() or check_X_y(). @@ -398,7 +398,7 @@ def _validate_data(self, X, y=None, reset=True, **check_params): out = X, y if check_params.get('ensure_2d', True): - self._validate_n_features(X, reset=reset) + self._check_n_features(X, reset=reset) return out diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index ffddd316cbf7f..5b360bb94aeae 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X, reset=True) + self._check_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -586,7 +586,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') - # TODO: also call _validate_n_features(reset=False) in 0.24 + # TODO: also call _check_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index e96978f9018f2..b8877da2a7c1c 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -217,7 +217,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) - self._validate_n_features(X, reset=True) + self._check_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation From d7963e7f8ed6372db01f7cd416491e70241cea74 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 Jan 2020 10:21:21 -0500 Subject: [PATCH 44/53] Apply suggestions from code review Co-Authored-By: Guillaume Lemaitre --- sklearn/base.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 2918fc75fa745..9b08b36ecd7c0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -338,17 +338,16 @@ def _get_tags(self): return collected_tags def _check_n_features(self, X, reset): - """Set the n_features_in_ attribute, or check against it. + """Set the `n_features_in_` attribute, or check against it. Parameters ---------- - - X : ndarray or sparse matrix - The input samples + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. 
reset : bool - If True, the n_features_in_ attribute is set to X.shape[1]. Else, + If True, the `n_features_in_` attribute is set to `X.shape[1]`. Else, the attribute must already exist and the function checks that it is - equal to X.shape[1]. + equal to `X.shape[1]`. """ n_features = X.shape[1] @@ -357,7 +356,7 @@ def _check_n_features(self, X, reset): else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "reset parameter is False but there is no n_features_in_ " + "The reset parameter is False but there is no n_features_in_ " "attribute." ) if n_features != self.n_features_in_: @@ -368,26 +367,25 @@ def _check_n_features(self, X, reset): ) def _validate_data(self, X, y=None, reset=True, **check_params): - """Validate input data and set or check the n_features_in_ attribute. + """Validate input data and set or check the `n_features_in_` attribute. Parameters ---------- - - X : array-like + X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features) The input samples. - y : array-like or None, default=None - The targets. If None, check_array is called on X and check_X_y is + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and `check_X_y` is called otherwise. reset : bool, default=True - Whether to reset the n_features_in_ attribute. See - _check_n_features(). + Whether to reset the `n_features_in_` attribute. See + :func:`_check_n_features`. **check_params : kwargs - Parameters passed to check_array() or check_X_y(). + Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. Returns ------- out : {ndarray, sparse matrix} or tuple of these - The validated input. A tuple is returned if y is not None. + The validated input. A tuple is returned if `y` is not None. """ if y is None: From d6f0451bd308306700f3960cfa654770a78e7418 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 Jan 2020 10:48:58 -0500 Subject: [PATCH 45/53] addressed most comments --- sklearn/base.py | 20 ++++++++++--------- sklearn/cluster/_agglomerative.py | 4 ++-- sklearn/decomposition/_fastica.py | 11 ++-------- sklearn/ensemble/_forest.py | 4 ++-- .../tests/test_dict_vectorizer.py | 3 +-- sklearn/pipeline.py | 1 + 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 5bbe0a215192d..9d97539687179 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -352,9 +352,9 @@ def _check_n_features(self, X, reset): X : {ndarray, sparse matrix} of shape (n_samples, n_features) The input samples. reset : bool - If True, the `n_features_in_` attribute is set to `X.shape[1]`. Else, - the attribute must already exist and the function checks that it is - equal to `X.shape[1]`. + If True, the `n_features_in_` attribute is set to `X.shape[1]`. + Else, the attribute must already exist and the function checks + that it is equal to `X.shape[1]`. """ n_features = X.shape[1] @@ -363,8 +363,8 @@ def _check_n_features(self, X, reset): else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "The reset parameter is False but there is no n_features_in_ " - "attribute." + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" 
) if n_features != self.n_features_in_: raise ValueError( @@ -378,16 +378,18 @@ def _validate_data(self, X, y=None, reset=True, **check_params): Parameters ---------- - X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features) + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) The input samples. y : array-like of shape (n_samples,), default=None - The targets. If None, `check_array` is called on `X` and `check_X_y` is - called otherwise. + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. reset : bool, default=True Whether to reset the `n_features_in_` attribute. See :func:`_check_n_features`. **check_params : kwargs - Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. Returns ------- diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index b29b1078333cc..8d21e69c32e7f 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -1053,10 +1053,10 @@ def fit(self, X, y=None, **params): """ X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], ensure_min_features=2, estimator=self) + # save n_features_in_ attribute here to reset it after, because it will + # be overridden in AgglomerativeClustering since we passed it X.T. n_features_in_ = self.n_features_in_ AgglomerativeClustering.fit(self, X.T, **params) - # Need to restore n_features_in_ attribute that was overridden in - # AgglomerativeClustering since we passed it X.T. self.n_features_in_ = n_features_in_ return self diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index ef9f376bba66d..f9e3a148f6860 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -425,18 +425,11 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ - # This validates twice but there is not clean way to avoid validation - # in fastica(). Please see issue 14897. 
-        self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                            ensure_min_samples=2).T
+        X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
+                                ensure_min_samples=2).T
         fun_args = {} if self.fun_args is None else self.fun_args
         random_state = check_random_state(self.random_state)
 
-        # make interface compatible with other decompositions
-        # a copy is required only for non whitened data
-        X = check_array(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                        ensure_min_samples=2).T
-
         alpha = fun_args.get('alpha', 1.0)
         if not 1 <= alpha <= 2:
             raise ValueError('alpha must be in [1,2]')
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index 9d1cfee7e7266..3bb6b14aaa7f1 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -293,8 +293,8 @@ def fit(self, X, y, sample_weight=None):
         self : object
         """
         # Validate or convert input data
-        X = self._validate_data(X, accept_sparse="csc", dtype=DTYPE)
-        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
+        X, y = self._validate_data(X, y, multi_output=True,
+                                   accept_sparse="csc", dtype=DTYPE)
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
         if issparse(X):
diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
index a65feb2d7590b..22a7402908cf1 100644
--- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py
+++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -113,8 +113,7 @@ def test_deterministic_vocabulary():
 
 
 def test_n_features_in():
-    # For vectorizers, n_features_in_ does not make sense and it is always
-    # None
+    # For vectorizers, n_features_in_ does not make sense and does not exist.
    dv = DictVectorizer()
     assert not hasattr(dv, 'n_features_in_')
     d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 1ad9dda276427..0a914c1cff9af 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -629,6 +629,7 @@ def _pairwise(self):
 
     @property
     def n_features_in_(self):
+        # delegate to first step (which will call _check_is_fitted)
         return self.steps[0][1].n_features_in_
 
 

From b917d72dd81d2002c0b19e3ff0e391440e5ada26 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 31 Jan 2020 10:57:06 -0500
Subject: [PATCH 46/53] Better comments

---
 sklearn/decomposition/_lda.py  | 11 +++++++----
 sklearn/preprocessing/_data.py | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py
index 201b393374a08..ba68e03a16191 100644
--- a/sklearn/decomposition/_lda.py
+++ b/sklearn/decomposition/_lda.py
@@ -499,10 +499,13 @@ def partial_fit(self, X, y=None):
         self._check_params()
         first_time = not hasattr(self, 'components_')
 
-        # deactivating check for now (specific tests about error message would
-        # break)
-        # TODO: uncomment when addressing reset in predict/transform/etc.
-        # reset = first_time
+        # In theory reset should be equal to `first_time`, but there are tests
+        # checking the number of input features and they expect a specific
+        # string, which is not the same one raised by check_n_features. So we
+        # don't check n_features_in_ here for now (it's done with ad hoc code
+        # in the estimator anyway).
+        # TODO: set reset=first_time when addressing reset in
+        # predict/transform/etc.
         reset_n_features = True
         X = self._check_non_neg_array(X, reset_n_features,
                                       "LatentDirichletAllocation.partial_fit")
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index b43f1b9c2203a..72ad6bacd43b4 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -2443,10 +2443,13 @@ def _transform_col(self, X_col, quantiles, inverse):
     def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
                       copy=False):
         """Check inputs before fit and transform"""
-        # deactivating check for now (specific tests about error message would
-        # break)
-        # TODO: uncomment when addressing reset in predict/transform/etc.
-        # reset = in_fit
+        # In theory reset should be equal to `in_fit`, but there are tests
+        # checking the number of input features and they expect a specific
+        # string, which is not the same one raised by check_n_features. So we
+        # don't check n_features_in_ here for now (it's done with ad hoc code
+        # in the estimator anyway).
+        # TODO: set reset=in_fit when addressing reset in
+        # predict/transform/etc.
         reset = True
 
         X = self._validate_data(X, reset=reset,

From 4fb756ea0d1a1622e0f11dc421de8ed6ea1a5399 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 5 Feb 2020 10:00:12 -0500
Subject: [PATCH 47/53] pep8

---
 sklearn/feature_selection/_rfe.py | 14 +++++---------
 sklearn/svm/_base.py              |  4 ++--
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index 0edc4f35ac679..69e3cc4de9e6c 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -493,16 +493,12 @@ def fit(self, X, y, groups=None):
             train/test set. Only used in conjunction with a "Group" :term:`cv`
             instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
         """
-<<<<<<< HEAD
-        X, y = self._validate_data(X, y, accept_sparse="csr",
-                                   ensure_min_features=2,
-                                   force_all_finite=False)
-=======
         tags = self._get_tags()
-        X, y = check_X_y(X, y, "csc", ensure_min_features=2,
-                         force_all_finite=not tags.get('allow_nan', True),
-                         multi_output=True)
->>>>>>> 54c3a1fbe7ef0f6814ae6406fbc0d52804303370
+        X, y = self._validate_data(
+            X, y, accept_sparse="csr", ensure_min_features=2,
+            force_all_finite=not tags.get('allow_nan', True),
+            multi_output=True
+        )
 
         # Initialization
         cv = check_cv(self.cv, y, is_classifier(self.estimator))
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py
index 116b6c2b6cfeb..ee64c3fb1692e 100644
--- a/sklearn/svm/_base.py
+++ b/sklearn/svm/_base.py
@@ -148,8 +148,8 @@ def fit(self, X, y, sample_weight=None):
             check_consistent_length(X, y)
         else:
             X, y = self._validate_data(X, y, dtype=np.float64,
-                                      order='C', accept_sparse='csr',
-                                      accept_large_sparse=False)
+                                       order='C', accept_sparse='csr',
+                                       accept_large_sparse=False)
 
         y = self._validate_targets(y)
 

From 511c395223dd37ffe8fc4e031080fdca1630425b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 10:42:37 -0500
Subject: [PATCH 48/53] Addressed comments and raise warning instead of error

---
 sklearn/base.py                   |  5 +++--
 sklearn/compose/_target.py        |  2 +-
 sklearn/utils/estimator_checks.py | 17 +++++++++++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 9d97539687179..58ef44eefc047 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -385,8 +385,9 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
             The targets. If None, `check_array` is called on `X` and
             `check_X_y` is called otherwise.
         reset : bool, default=True
-            Whether to reset the `n_features_in_` attribute. See
-            :func:`_check_n_features`.
+            Whether to reset the `n_features_in_` attribute.
+            If False, the input will be checked for consistency with data
+            provided when reset was last True.
         **check_params : kwargs
             Parameters passed to :func:`sklearn.utils.check_array` or
             :func:`sklearn.utils.check_X_y`.
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index 03727e82d7e90..a2bfb9a5d35d4 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -239,7 +239,7 @@ def _more_tags(self):
     @property
     def n_features_in_(self):
         # For consistency with other estimators we raise a AttributeError so
-        # that hasattr() fails if the estimator isn't fitted.
+        # that hasattr() returns False when the estimator isn't fitted.
         try:
             check_is_fitted(self)
         except NotFittedError as nfe:
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f3ad8c01a46df..3b2c518d08c2c 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2883,7 +2883,7 @@ def check_n_features_in(name, estimator_orig):
     estimator = clone(estimator_orig)
     set_random_state(estimator)
 
-    if 'warm_start' in estimator.get_params().keys():
+    if 'warm_start' in estimator.get_params():
         estimator.set_params(warm_start=False)
 
     n_samples = 100
@@ -2897,4 +2897,17 @@ def check_n_features_in(name, estimator_orig):
 
     assert not hasattr(estimator, 'n_features_in_')
     estimator.fit(X, y)
-    assert estimator.n_features_in_ == X.shape[1]
+    if hasattr(estimator, 'n_features_in_'):
+        assert estimator.n_features_in_ == X.shape[1]
+    else:
+        warnings.warn(
+            "As of scikit-learn 0.23, estimators should expose a "
+            "n_features_in_ attribute, unless the 'no_validation' tag is "
+            "True. This attribute should be equal to the number of features "
+            "passed to the fit method. "
+            "An error will be raised from version 0.25 when calling "
+            "check_estimator(). "
+            "See SLEP010: "
+            "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa
+            FutureWarning
+        )

From c2708841dce1e42e17e1406c29e289f7778ed7e0 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 10:47:52 -0500
Subject: [PATCH 49/53] Added whatsnew

---
 doc/whats_new/v0.23.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index a1830229b57ec..b4f378c7b591d 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -14,6 +14,14 @@ Version 0.23.0
 
 Put the changes in their relevant module.
 
+New `n_features_in_` attribute
+------------------------------
+
+Most estimators now expose a `n_features_in_` attribute. This attribute is
+equal to the number of features passed to the `fit` method. See
+`SLEP010 `_
+for details.
+
 Changed models
 --------------
 

From 39d83714a8631f28cc0dabb8c9017b5f79e8183b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 11:15:45 -0500
Subject: [PATCH 50/53] Updated estimator API

---
 doc/developers/develop.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
index 5e11f46eccdb8..7b5c3db36f526 100644
--- a/doc/developers/develop.rst
+++ b/doc/developers/develop.rst
@@ -226,6 +226,14 @@ the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically,
 the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split``
 to slice rows and columns.
+Universal attributes +^^^^^^^^^^^^^^^^^^^^ + +Estimators that expect rectangular input have a `n_features_in_` attribute +indicating the number of features that the estimator expects. See `SLEP010 +`_ +for details. + .. _rolling_your_own_estimator: Rolling your own estimator From 9effdbf962363784c970419952bf782d7911d2b8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Feb 2020 11:17:04 -0500 Subject: [PATCH 51/53] formulation --- doc/developers/develop.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 7b5c3db36f526..1b936684eafb1 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -229,8 +229,9 @@ to slice rows and columns. Universal attributes ^^^^^^^^^^^^^^^^^^^^ -Estimators that expect rectangular input have a `n_features_in_` attribute -indicating the number of features that the estimator expects. See `SLEP010 +Estimators that expect rectangular input should have a `n_features_in_` +attribute indicating the number of features that the estimator expects. See +`SLEP010 `_ for details. From 4f8ca86e86bf949847d58a561ed1fe2f07d44d46 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Feb 2020 10:32:02 -0500 Subject: [PATCH 52/53] Comment about estimator API --- doc/developers/develop.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 1b936684eafb1..db1ba3900e40a 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -229,8 +229,10 @@ to slice rows and columns. Universal attributes ^^^^^^^^^^^^^^^^^^^^ -Estimators that expect rectangular input should have a `n_features_in_` -attribute indicating the number of features that the estimator expects. See +Estimators that expect tabular input should set a `n_features_in_` +attribute at `fit` time to indicate the number of features that the estimator +expects for subsequent calls to `predict` or `transform`. +See `SLEP010 `_ for details. From a101d2d5ce4634f0373599dfa4b43f67b1ea2f7b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Feb 2020 10:16:15 -0500 Subject: [PATCH 53/53] Updated changelog --- doc/whats_new/v0.23.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b4f378c7b591d..2ed63ed2fdf22 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,15 +14,6 @@ Version 0.23.0 Put the changes in their relevant module. -New `n_features_in_` attribute ------------------------------- - -Most estimators now expose a `n_features_in_` attribute. This attribute is -equal to the number of features passed to the `fit` method. See -`SLEP010 `_ -for details. - - Changed models -------------- @@ -275,3 +266,13 @@ Changelog - |Enhancement| add warning in :func:`utils.validation.check_array` for pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. + +Miscellaneous +............. + +- |API| Most estimators now expose a `n_features_in_` attribute. This + attribute is equal to the number of features passed to the `fit` method. + See `SLEP010 + `_ + for details. :pr:`16112` by `Nicolas Hug`_. +
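
Usage illustration (a minimal sketch, not part of the patches above): the
snippet below shows how the `_validate_data` / `n_features_in_` machinery
introduced in this series is meant to be used by an estimator, and what
happens on a feature-count mismatch between `fit` and `transform`. It assumes
scikit-learn >= 0.23, where `BaseEstimator._validate_data` exists as added
above; the `IdentityTransformer` class and the toy data are hypothetical
examples, not part of scikit-learn.

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class IdentityTransformer(TransformerMixin, BaseEstimator):
        """Toy transformer that validates X and returns it unchanged."""

        def fit(self, X, y=None):
            # reset=True (the default) stores X.shape[1] in self.n_features_in_
            X = self._validate_data(X)
            return self

        def transform(self, X):
            # reset=False checks X.shape[1] against the fitted
            # self.n_features_in_ and raises a ValueError on mismatch
            X = self._validate_data(X, reset=False)
            return X

    X = np.array([[0., 1.], [2., 3.]])
    trans = IdentityTransformer().fit(X)
    print(trans.n_features_in_)   # 2
    trans.transform(X)            # OK: same number of features as at fit time

    try:
        trans.transform(np.array([[0., 1., 2.]]))   # 3 features: mismatch
    except ValueError as exc:
        print(exc)   # X has 3 features, but ... is expecting 2 features as input.

Estimators with non-rectangular input (such as the vectorizers touched
earlier in the series) simply do not expose the attribute.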