diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index f11f1062cf73c..1f37ff0929ff9 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -1419,22 +1419,18 @@ advised to maintain notes on the `GitHub wiki
 Specific models
 ---------------
 
-Classifiers should accept ``y`` (target) arguments to ``fit``
-that are sequences (lists, arrays) of either strings or integers.
-They should not assume that the class labels
-are a contiguous range of integers;
-instead, they should store a list of classes
-in a ``classes_`` attribute or property.
-The order of class labels in this attribute
-should match the order in which ``predict_proba``, ``predict_log_proba``
-and ``decision_function`` return their values.
-The easiest way to achieve this is to put::
+Classifiers should accept ``y`` (target) arguments to ``fit`` that are
+sequences (lists, arrays) of either strings or integers. They should not
+assume that the class labels are a contiguous range of integers; instead, they
+should store a list of classes in a ``classes_`` attribute or property. The
+order of class labels in this attribute should match the order in which
+``predict_proba``, ``predict_log_proba`` and ``decision_function`` return their
+values. The easiest way to achieve this is to put::
 
     self.classes_, y = np.unique(y, return_inverse=True)
 
-in ``fit``.
-This returns a new ``y`` that contains class indexes, rather than labels,
-in the range [0, ``n_classes``).
+in ``fit``. This returns a new ``y`` that contains class indexes, rather than
+labels, in the range [0, ``n_classes``).
 
 A classifier's ``predict`` method should return
 arrays containing class labels from ``classes_``.
@@ -1445,14 +1441,89 @@ this can be achieved with::
         D = self.decision_function(X)
         return self.classes_[np.argmax(D, axis=1)]
 
-In linear models, coefficients are stored in an array called ``coef_``,
-and the independent term is stored in ``intercept_``.
-``sklearn.linear_model.base`` contains a few base classes and mixins
-that implement common linear model patterns.
+In linear models, coefficients are stored in an array called ``coef_``, and the
+independent term is stored in ``intercept_``. ``sklearn.linear_model.base``
+contains a few base classes and mixins that implement common linear model
+patterns.
 
 The :mod:`sklearn.utils.multiclass` module contains useful functions
 for working with multiclass and multilabel problems.
 
+Estimator Tags
+--------------
+.. warning::
+
+    The estimator tags are experimental and the API is subject to change.
+
+Scikit-learn introduced estimator tags in version 0.21. These are annotations
+of estimators that allow programmatic inspection of their capabilities, such as
+sparse matrix support, supported output types and supported methods. The
+estimator tags are a dictionary returned by the method ``_get_tags()``. These
+tags are used by the common tests and the
+:func:`sklearn.utils.estimator_checks.check_estimator` function to decide what
+tests to run and what input data is appropriate. Tags can depend on estimator
+parameters or even system architecture and can in general only be determined
+at runtime.
+
+The default value of all tags except for ``X_types`` is ``False``.
+
+The current set of estimator tags is:
+
+non_deterministic
+    whether the estimator is not deterministic given a fixed ``random_state``.
+
+requires_positive_data - unused for now
+    whether the estimator requires positive X.
+
+no_validation
+    whether the estimator skips input-validation. This is only meant for
+    stateless and dummy transformers!
+
+multioutput - unused for now
+    whether a regressor supports multi-target outputs or a classifier supports
+    multi-class multi-output.
+
+multilabel
+    whether the estimator supports multilabel output.
+
+stateless
+    whether the estimator learns nothing from the data passed to ``fit``.
+    Even though a stateless estimator ignores the training data, it might
+    still need a call to ``fit`` for initialization.
+
+allow_nan
+    whether the estimator supports data with missing values encoded as
+    ``np.NaN``.
+
+poor_score
+    whether the estimator fails to provide a "reasonable" test-set score,
+    which currently for regression is an R2 of 0.5 on a subset of the boston
+    housing dataset, and for classification an accuracy of 0.83 on
+    ``make_blobs(n_samples=300, random_state=0)``. These datasets and values
+    are based on current estimators in sklearn and might be replaced by
+    something more systematic.
+
+multioutput_only
+    whether the estimator supports only multi-output classification or
+    regression.
+
+_skip_test
+    whether to skip common tests entirely. Don't use this unless you have a
+    *very good* reason.
+
+X_types
+    Supported input types for X as list of strings. Tests are currently only
+    run if ``'2darray'`` is contained in the list, signifying that the
+    estimator takes continuous 2d numpy arrays as input. The default value is
+    ``['2darray']``. Other possible types are ``'string'``, ``'sparse'``,
+    ``'categorical'``, ``'dict'``, ``'1dlabels'`` and ``'2dlabels'``. The goal
+    is that in the future the supported input type will determine the data
+    used during testing, in particular for ``'string'``, ``'sparse'`` and
+    ``'categorical'`` data. For now, the tests for sparse data do not make use
+    of the ``'sparse'`` tag.
+
+
+In addition to the tags, estimators also need to declare any non-optional
+parameters to ``__init__`` in the ``_required_parameters`` class attribute,
+which is a list or tuple. If ``_required_parameters`` is only
+``["estimator"]`` or ``["base_estimator"]``, then the estimator will be
+instantiated with an instance of ``LinearDiscriminantAnalysis`` (or
+``Ridge`` if the estimator is a regressor) in the tests. The choice of these
+two models is somewhat idiosyncratic but both should provide robust
+closed-form solutions.
+
 .. _reading-code:
 
 Reading the existing code base
diff --git a/sklearn/base.py b/sklearn/base.py
index c5b1436bac926..cdace05f9817c 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -6,12 +6,25 @@
 import copy
 import warnings
 from collections import defaultdict
-from inspect import signature
+import struct
+import inspect
 
 import numpy as np
 
 from . import __version__
 
+_DEFAULT_TAGS = {
+    'non_deterministic': False,
+    'requires_positive_data': False,
+    'X_types': ['2darray'],
+    'poor_score': False,
+    'no_validation': False,
+    'multioutput': False,
+    'allow_nan': False,
+    'stateless': False,
+    'multilabel': False,
+    '_skip_test': False,
+    'multioutput_only': False}
 
 
 def clone(estimator, safe=True):
@@ -61,7 +74,6 @@ def clone(estimator, safe=True):
     return new_object
 
 
-###############################################################################
 def _pprint(params, offset=0, printer=repr):
     """Pretty print the dictionary 'params'
 
@@ -112,7 +124,17 @@ def _pprint(params, offset=0, printer=repr):
     return lines
 
 
-###############################################################################
+def _update_if_consistent(dict1, dict2):
+    common_keys = set(dict1.keys()).intersection(dict2.keys())
+    for key in common_keys:
+        if dict1[key] != dict2[key]:
+            raise TypeError("Inconsistent values for tag {}: {} != {}".format(
+                key, dict1[key], dict2[key]
+            ))
+    dict1.update(dict2)
+    return dict1
+
+
 class BaseEstimator:
     """Base class for all estimators in scikit-learn
 
@@ -135,7 +157,7 @@ def _get_param_names(cls):
 
         # introspect the constructor arguments to find the model parameters
         # to represent
-        init_signature = signature(init)
+        init_signature = inspect.signature(init)
         # Consider the constructor parameters excluding 'self'
         parameters = [p for p in init_signature.parameters.values()
                       if p.name != 'self' and p.kind != p.VAR_KEYWORD]
@@ -255,8 +277,22 @@ def __setstate__(self, state):
         except AttributeError:
             self.__dict__.update(state)
 
+    def _get_tags(self):
+        collected_tags = {}
+        for base_class in inspect.getmro(self.__class__):
+            if (hasattr(base_class, '_more_tags')
+                    and base_class != self.__class__):
+                more_tags = base_class._more_tags(self)
+                collected_tags = _update_if_consistent(collected_tags,
+                                                       more_tags)
+        if hasattr(self, '_more_tags'):
+            more_tags = self._more_tags()
+            collected_tags = _update_if_consistent(collected_tags, more_tags)
+        tags = _DEFAULT_TAGS.copy()
+        tags.update(collected_tags)
+        return tags
+
 
-###############################################################################
 class ClassifierMixin:
     """Mixin class for all classifiers in scikit-learn."""
     _estimator_type = "classifier"
@@ -289,7 +325,6 @@ def score(self, X, y, sample_weight=None):
         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
 
 
-###############################################################################
 class RegressorMixin:
     """Mixin class for all regression estimators in scikit-learn."""
     _estimator_type = "regressor"
@@ -330,7 +365,6 @@ def score(self, X, y, sample_weight=None):
                         multioutput='variance_weighted')
 
 
-###############################################################################
 class ClusterMixin:
     """Mixin class for all cluster estimators in scikit-learn."""
     _estimator_type = "clusterer"
@@ -432,7 +466,6 @@ def get_submatrix(self, i, data):
         return data[row_ind[:, np.newaxis], col_ind]
 
 
-###############################################################################
 class TransformerMixin:
     """Mixin class for all transformers in scikit-learn."""
 
@@ -510,13 +543,27 @@ def fit_predict(self, X, y=None):
         return self.fit(X).predict(X)
 
 
-###############################################################################
 class MetaEstimatorMixin:
     """Mixin class for all meta estimators in scikit-learn."""
-    # this is just a tag for the moment
+    _required_parameters = ["estimator"]
 
 
-###############################################################################
+class MultiOutputMixin:
+    """Mixin to mark estimators that support multioutput."""
+    def _more_tags(self):
+        return {'multioutput': True}
+
+
+def _is_32bit():
+    """Detect if process is 32bit Python."""
+    return struct.calcsize('P') * 8 == 32
+
+
+class _UnstableOn32BitMixin:
+    """Mark estimators that are non-deterministic on 32bit."""
+    def _more_tags(self):
+        return {'non_deterministic': _is_32bit()}
+
 
 def is_classifier(estimator):
     """Returns True if the given estimator is (probably) a classifier.
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index d4b6a71b51269..1136c93721871 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -158,6 +158,7 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
         [0.5, 0.5, 0. , 1. ]])
 
     """
+    _required_parameters = ['transformers']
 
     def __init__(self, transformers, remainder='drop', sparse_threshold=0.3,
                  n_jobs=None, transformer_weights=None):
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index e6eff27460f03..5213605defd30 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -233,3 +233,6 @@ def predict(self, X):
             pred_trans = pred_trans.squeeze(axis=1)
 
         return pred_trans
+
+    def _more_tags(self):
+        return {'poor_score': True, 'no_validation': True}
diff --git a/sklearn/cross_decomposition/cca_.py b/sklearn/cross_decomposition/cca_.py
index 572853a771c5d..67c132592f306 100644
--- a/sklearn/cross_decomposition/cca_.py
+++ b/sklearn/cross_decomposition/cca_.py
@@ -1,9 +1,10 @@
 from .pls_ import _PLS
+from ..base import _UnstableOn32BitMixin
 
 __all__ = ['CCA']
 
 
-class CCA(_PLS):
+class CCA(_PLS, _UnstableOn32BitMixin):
     """CCA Canonical Correlation Analysis.
 
     CCA inherits from PLS with mode="B" and deflation_mode="canonical".
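A note on how the pieces in ``sklearn/base.py`` above compose: a class only
declares the tags it changes via ``_more_tags``, and ``_get_tags`` walks the
MRO, merges every contribution through ``_update_if_consistent`` (raising
``TypeError`` on conflicts), and fills the remaining keys from
``_DEFAULT_TAGS``. A minimal sketch against this branch; the transformer class
is hypothetical and exists only to illustrate the merge::

    from sklearn.base import BaseEstimator, TransformerMixin

    class NaNPassthrough(BaseEstimator, TransformerMixin):
        """Hypothetical transformer: learns nothing, lets NaNs through."""

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X

        def _more_tags(self):
            # Declare only the tags that differ from _DEFAULT_TAGS.
            return {'allow_nan': True, 'stateless': True}

    tags = NaNPassthrough()._get_tags()
    assert tags['allow_nan'] and tags['stateless']
    assert tags['multioutput'] is False  # untouched default

Two ``_more_tags`` in one hierarchy that disagree on a key trigger the
``TypeError`` from ``_update_if_consistent``, which the tests added to
``sklearn/tests/test_base.py`` below exercise.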
diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index 459f297e4be2b..d8b464483cf66 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -13,6 +13,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, RegressorMixin, TransformerMixin +from ..base import MultiOutputMixin from ..utils import check_array, check_consistent_length from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES @@ -116,7 +117,7 @@ def _center_scale_xy(X, Y, scale=True): return X, Y, x_mean, y_mean, x_std, y_std -class _PLS(BaseEstimator, TransformerMixin, RegressorMixin, +class _PLS(BaseEstimator, TransformerMixin, RegressorMixin, MultiOutputMixin, metaclass=ABCMeta): """Partial Least Squares (PLS) @@ -454,6 +455,9 @@ def fit_transform(self, X, y=None): """ return self.fit(X, y).transform(X, y) + def _more_tags(self): + return {'poor_score': True} + class PLSRegression(_PLS): """PLS regression diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 133717e13f677..bb91d0cbbad6c 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -10,12 +10,12 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _UnstableOn32BitMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -class KernelPCA(BaseEstimator, TransformerMixin): +class KernelPCA(BaseEstimator, TransformerMixin, _UnstableOn32BitMixin): """Kernel Principal component analysis (KPCA) Non-linear dimensionality reduction through the use of kernels (see diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index cbaa5e19008fd..0cb3ce192e24f 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -156,7 +156,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = check_array(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/dummy.py b/sklearn/dummy.py index b17d44670ba45..a777bbe4848e4 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -8,6 +8,7 @@ import scipy.sparse as sp from .base import BaseEstimator, ClassifierMixin, RegressorMixin +from .base import MultiOutputMixin from .utils import check_random_state from .utils.validation import _num_samples from .utils.validation import check_array @@ -18,7 +19,7 @@ from .utils.multiclass import class_distribution -class DummyClassifier(BaseEstimator, ClassifierMixin): +class DummyClassifier(BaseEstimator, ClassifierMixin, MultiOutputMixin): """ DummyClassifier is a classifier that makes predictions using simple rules. 
@@ -180,7 +181,7 @@ def predict(self, X): classes_ = self.classes_ class_prior_ = self.class_prior_ constant = self.constant - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: # Get same type even for self.n_outputs_ == 1 n_classes_ = [n_classes_] classes_ = [classes_] @@ -189,7 +190,7 @@ def predict(self, X): # Compute probability only once if self.strategy == "stratified": proba = self.predict_proba(X) - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: proba = [proba] if self.sparse_output_: @@ -315,6 +316,9 @@ def predict_log_proba(self, X): else: return [np.log(p) for p in proba] + def _more_tags(self): + return {'poor_score': True, 'no_validation': True} + def score(self, X, y, sample_weight=None): """Returns the mean accuracy on the given test data and labels. @@ -347,7 +351,7 @@ def score(self, X, y, sample_weight=None): return super().score(X, y, sample_weight) -class DummyRegressor(BaseEstimator, RegressorMixin): +class DummyRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin): """ DummyRegressor is a regressor that makes predictions using simple rules. @@ -504,6 +508,9 @@ def predict(self, X, return_std=False): return (y, y_std) if return_std else y + def _more_tags(self): + return {'poor_score': True, 'no_validation': True} + def score(self, X, y, sample_weight=None): """Returns the coefficient of determination R^2 of the prediction. diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py index 08252d392dfe2..7ac1dd4f72613 100644 --- a/sklearn/ensemble/base.py +++ b/sklearn/ensemble/base.py @@ -83,6 +83,8 @@ class BaseEnsemble(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta): estimators_ : list of estimators The collection of fitted base estimators. """ + # overwrite _required_parameters from MetaEstimatorMixin + _required_parameters = [] @abstractmethod def __init__(self, base_estimator, n_estimators=10, diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 4ff2cd6c44ac4..aae9dd8c72349 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -48,7 +48,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import ClassifierMixin, RegressorMixin +from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin from ..utils._joblib import Parallel, delayed from ..metrics import r2_score from ..preprocessing import OneHotEncoder @@ -62,6 +62,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted + __all__ = ["RandomForestClassifier", "RandomForestRegressor", "ExtraTreesClassifier", @@ -121,7 +122,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, return tree -class BaseForest(BaseEnsemble, metaclass=ABCMeta): +class BaseForest(BaseEnsemble, MultiOutputMixin, metaclass=ABCMeta): """Base class for forests of trees. Warning: This class should not be used directly. 
Use derived classes diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py index 811331bee3f6f..63e0ee94a97b5 100644 --- a/sklearn/ensemble/voting_classifier.py +++ b/sklearn/ensemble/voting_classifier.py @@ -121,6 +121,7 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): >>> print(eclf3.transform(X).shape) (6, 6) """ + _required_parameters = ['estimators'] def __init__(self, estimators, voting='hard', weights=None, n_jobs=None, flatten_transform=True): diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 29d4ae58e57c5..8273834acdb20 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -57,8 +57,8 @@ class DictVectorizer(BaseEstimator, TransformerMixin): Whether transform should produce scipy.sparse matrices. True by default. sort : boolean, optional. - Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting. - True by default. + Whether ``feature_names_`` and ``vocabulary_`` should be + sorted when fitting. True by default. Attributes ---------- @@ -362,3 +362,6 @@ def restrict(self, support, indices=False): key=itemgetter(1))] return self + + def _more_tags(self): + return {'X_types': ["dict"]} diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index 744a073090bad..3c381d130ae87 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -178,3 +178,6 @@ def transform(self, raw_X): if self.non_negative: np.abs(X.data, X.data) return X + + def _more_tags(self): + return {'X_types': [self.input_type]} diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 7bb9e6a14effc..657254fc23d00 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -534,3 +534,6 @@ def transform(self, X): patches[ii * n_patches:(ii + 1) * n_patches] = extract_patches_2d( image, patch_size, self.max_patches, self.random_state) return patches + + def _more_tags(self): + return {'X_types': ['3darray']} diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c630155f9bc72..ddad21a946df8 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -652,6 +652,9 @@ def _get_hasher(self): alternate_sign=self.alternate_sign, non_negative=self.non_negative) + def _more_tags(self): + return {'X_types': ['string']} + def _document_frequency(X): """Count the number of non-zero values for each feature in sparse X.""" @@ -1125,6 +1128,9 @@ def get_feature_names(self): return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))] + def _more_tags(self): + return {'X_types': ['string']} + def _make_int_array(): """Construct an array.array of a type suitable for scipy.sparse indices.""" @@ -1305,6 +1311,9 @@ def idf_(self, value): self._idf_diag = sp.spdiags(value, diags=0, m=n_features, n=n_features, format='csr') + def _more_tags(self): + return {'X_types': 'sparse'} + class TfidfVectorizer(CountVectorizer): """Convert a collection of raw documents to a matrix of TF-IDF features. 
@@ -1638,3 +1647,6 @@ def transform(self, raw_documents, copy=True): X = super().transform(raw_documents) return self._tfidf.transform(X, copy=False) + + def _more_tags(self): + return {'X_types': ['string'], '_skip_test': True} diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 55f6dba7dfa3e..89a23bbb6cd16 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -125,6 +125,10 @@ def __init__(self, estimator, n_features_to_select=None, step=1, def _estimator_type(self): return self.estimator._estimator_type + @property + def classes_(self): + return self.estimator_.classes_ + def fit(self, X, y): """Fit the RFE model and then the underlying estimator on the selected features. @@ -145,7 +149,7 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X, y = check_X_y(X, y, "csc") + X, y = check_X_y(X, y, "csc", ensure_min_features=2) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -320,6 +324,9 @@ def predict_log_proba(self, X): check_is_fitted(self, 'estimator_') return self.estimator_.predict_log_proba(self.transform(X)) + def _more_tags(self): + return {'poor_score': True} + class RFECV(RFE, MetaEstimatorMixin): """Feature ranking with recursive feature elimination and cross-validated @@ -471,7 +478,7 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. """ - X, y = check_X_y(X, y, "csr") + X, y = check_X_y(X, y, "csr", ensure_min_features=2) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index c5ff9674bc575..c2c1884a50d7b 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -12,13 +12,15 @@ from scipy.optimize import fmin_l_bfgs_b from sklearn.base import BaseEstimator, RegressorMixin, clone +from sklearn.base import MultiOutputMixin from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C from sklearn.utils import check_random_state from sklearn.utils.validation import check_X_y, check_array from sklearn.exceptions import ConvergenceWarning -class GaussianProcessRegressor(BaseEstimator, RegressorMixin): +class GaussianProcessRegressor(BaseEstimator, RegressorMixin, + MultiOutputMixin): """Gaussian process regression (GPR). The implementation is based on Algorithm 2.1 of Gaussian Processes diff --git a/sklearn/impute.py b/sklearn/impute.py index 3bb0bdd9eff15..d993d8b6ce34c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -422,6 +422,9 @@ def transform(self, X): return X + def _more_tags(self): + return {'allow_nan': True} + class IterativeImputer(BaseEstimator, TransformerMixin): """Multivariate imputer that estimates each feature from all the others. @@ -1034,6 +1037,9 @@ def fit(self, X, y=None): self.fit_transform(X) return self + def _more_tags(self): + return {'allow_nan': True} + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. 
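Stepping back from the imputer hunks for a moment: ``allow_nan`` is what lets
these estimators opt out of the NaN-rejection check declaratively instead of
by name (``check_estimators_nan_inf`` is only generated when the tag is false,
see ``_yield_checks`` later in this patch). Observed from the outside,
assuming this branch is installed::

    import numpy as np
    from sklearn.impute import SimpleImputer

    X = np.array([[1.0, np.nan],
                  [3.0, 4.0]])
    imputer = SimpleImputer().fit(X)         # NaN is legal input here
    assert imputer._get_tags()['allow_nan']  # so the NaN check is skipped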
@@ -1265,3 +1271,7 @@ def fit_transform(self, X, y=None): """ return self.fit(X, y).transform(X) + + def _more_tags(self): + return {'allow_nan': True, + 'X_types': ['2darray', 'str']} diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index aa6b4f3cc47d9..901ceec889aac 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -389,3 +389,6 @@ def __setstate__(self, state): super().__setstate__(state) if hasattr(self, '_necessary_X_') and hasattr(self, '_necessary_y_'): self._build_f(self._necessary_X_, self._necessary_y_) + + def _more_tags(self): + return {'X_types': ['1darray']} diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index e2341c94a9509..25b2ae5f3eb14 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -435,6 +435,9 @@ def _transform_sparse(self, X): return sp.hstack(X_new) + def _more_tags(self): + return {'stateless': True} + class Nystroem(BaseEstimator, TransformerMixin): """Approximate a kernel map using a subset of the training data. diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 91e693fae4f33..aeb5fd45f413f 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -6,14 +6,14 @@ import numpy as np -from .base import BaseEstimator, RegressorMixin +from .base import BaseEstimator, RegressorMixin, MultiOutputMixin from .metrics.pairwise import pairwise_kernels from .linear_model.ridge import _solve_cholesky_kernel from .utils import check_array, check_X_y from .utils.validation import check_is_fitted -class KernelRidge(BaseEstimator, RegressorMixin): +class KernelRidge(BaseEstimator, RegressorMixin, MultiOutputMixin): """Kernel ridge regression. Kernel ridge regression (KRR) combines ridge regression (linear least diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 93dd4c05a9783..eb474f8386189 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -24,7 +24,8 @@ from scipy.special import expit from ..utils._joblib import Parallel, delayed -from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, + MultiOutputMixin) from ..utils import check_array, check_X_y from ..utils.validation import FLOAT_DTYPES from ..utils import check_random_state @@ -355,7 +356,7 @@ def sparsify(self): return self -class LinearRegression(LinearModel, RegressorMixin): +class LinearRegression(LinearModel, RegressorMixin, MultiOutputMixin): """ Ordinary least squares Linear Regression. diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index f414c3ba4b32f..581c7022b46d3 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -13,7 +13,7 @@ from scipy import sparse from .base import LinearModel, _pre_fit -from ..base import RegressorMixin +from ..base import RegressorMixin, MultiOutputMixin from .base import _preprocess_data from ..utils import check_array, check_X_y from ..utils.validation import check_random_state @@ -506,7 +506,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, # ElasticNet model -class ElasticNet(LinearModel, RegressorMixin): +class ElasticNet(LinearModel, RegressorMixin, MultiOutputMixin): """Linear regression with combined L1 and L2 priors as regularizer. 
Minimizes the objective function:: @@ -1048,7 +1048,7 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, return this_mses -class LinearModelCV(LinearModel, metaclass=ABCMeta): +class LinearModelCV(LinearModel, MultiOutputMixin, metaclass=ABCMeta): """Base class for iterative model fitting along a regularization path""" @abstractmethod @@ -1820,6 +1820,9 @@ def fit(self, X, y): # return self for chaining fit and predict calls return self + def _more_tags(self): + return {'multioutput_only': True} + class MultiTaskLasso(MultiTaskElasticNet): """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. @@ -2130,6 +2133,9 @@ def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, self.random_state = random_state self.selection = selection + def _more_tags(self): + return {'multioutput_only': True} + class MultiTaskLassoCV(LinearModelCV, RegressorMixin): """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. @@ -2288,3 +2294,6 @@ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, random_state=random_state, selection=selection) + + def _more_tags(self): + return {'multioutput_only': True} diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 0aa9a9c14ea94..a55bc4ab6f98c 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -17,7 +17,7 @@ from scipy.linalg.lapack import get_lapack_funcs from .base import LinearModel -from ..base import RegressorMixin +from ..base import RegressorMixin, MultiOutputMixin from ..utils import arrayfuncs, as_float_array, check_X_y from ..model_selection import check_cv from ..exceptions import ConvergenceWarning @@ -487,7 +487,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, ############################################################################### # Estimator classes -class Lars(LinearModel, RegressorMixin): +class Lars(LinearModel, RegressorMixin, MultiOutputMixin): """Least Angle Regression model a.k.a. LAR Read more in the :ref:`User Guide `. diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 95d80fb423e14..d9ee49cd37698 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -13,7 +13,7 @@ from scipy.linalg.lapack import get_lapack_funcs from .base import LinearModel, _pre_fit -from ..base import RegressorMixin +from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array, check_X_y from ..model_selection import check_cv from ..utils._joblib import Parallel, delayed @@ -539,7 +539,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, return np.squeeze(coef) -class OrthogonalMatchingPursuit(LinearModel, RegressorMixin): +class OrthogonalMatchingPursuit(LinearModel, RegressorMixin, MultiOutputMixin): """Orthogonal Matching Pursuit model (OMP) Read more in the :ref:`User Guide `. 
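The linear-model changes in this stretch all follow one pattern: mixing in
``MultiOutputMixin`` amounts to declaring ``{'multioutput': True}``, and the
multi-task variants additionally pin ``multioutput_only`` because they accept
only 2d ``y``. Assuming this branch, the effect is directly observable::

    from sklearn.linear_model import LinearRegression, MultiTaskLasso

    assert LinearRegression()._get_tags()['multioutput']
    # Multi-task models require 2d y, and advertise it via the tag:
    assert MultiTaskLasso()._get_tags()['multioutput_only']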
diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index f929533e871a8..0205b75df55cf 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -8,6 +8,7 @@ import warnings from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone +from ..base import MultiOutputMixin from ..utils import check_random_state, check_array, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted @@ -52,7 +53,8 @@ def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability): return abs(float(np.ceil(np.log(nom) / np.log(denom)))) -class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin): +class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin, + MultiOutputMixin): """RANSAC (RANdom SAmple Consensus) algorithm. RANSAC is an iterative algorithm for the robust estimation of parameters diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index f71bebf8420c6..eed636622dcdc 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -19,7 +19,7 @@ from .base import LinearClassifierMixin, LinearModel, _rescale_data from .sag import sag_solver -from ..base import RegressorMixin +from ..base import RegressorMixin, MultiOutputMixin from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_X_y @@ -463,8 +463,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', return coef -class _BaseRidge(LinearModel, metaclass=ABCMeta): - +class _BaseRidge(LinearModel, MultiOutputMixin, metaclass=ABCMeta): @abstractmethod def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", @@ -1107,7 +1106,7 @@ def identity_estimator(): return self -class _BaseRidgeCV(LinearModel): +class _BaseRidgeCV(LinearModel, MultiOutputMixin): def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 3c706d005a033..17130d7335d81 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -9,7 +9,7 @@ from scipy.sparse import eye, csr_matrix from scipy.sparse.linalg import eigsh -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _UnstableOn32BitMixin from ..utils import check_random_state, check_array from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted @@ -518,7 +518,8 @@ def locally_linear_embedding( tol=tol, max_iter=max_iter, random_state=random_state) -class LocallyLinearEmbedding(BaseEstimator, TransformerMixin): +class LocallyLinearEmbedding(BaseEstimator, TransformerMixin, + _UnstableOn32BitMixin): """Locally Linear Embedding Read more in the :ref:`User Guide `. 
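``_UnstableOn32BitMixin``, applied to ``LocallyLinearEmbedding`` just above
(and to ``CCA`` and ``KernelPCA`` earlier), is the clearest example of a tag
that can only be determined at runtime: ``_is_32bit()`` probes the
interpreter's pointer width whenever ``_more_tags`` runs. A sketch of what
that yields, assuming this branch::

    import struct
    from sklearn.decomposition import KernelPCA

    # A C pointer ('P') is 4 bytes on a 32-bit build, 8 on a 64-bit one.
    on_32bit = struct.calcsize('P') * 8 == 32
    assert KernelPCA()._get_tags()['non_deterministic'] == on_32bit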
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 70492c9dc7b16..49ac0e0f860ce 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -711,8 +711,8 @@ def test_classification_inf_nan_input(metric): # Classification metrics all raise a mixed input exception for y_true, y_score in invalids: assert_raise_message(ValueError, - "Classification metrics can't handle a mix " - "of binary and continuous targets", + "Input contains NaN, infinity or a " + "value too large", metric, y_true, y_score) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1f27ab7bf1f74..8b43a65ccc5aa 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1129,6 +1129,7 @@ class GridSearchCV(BaseSearchCV): Make a scorer from a performance metric or loss function. """ + _required_parameters = ["estimator", "param_grid"] def __init__(self, estimator, param_grid, scoring=None, n_jobs=None, iid='warn', refit=True, cv='warn', verbose=0, @@ -1444,6 +1445,7 @@ class RandomizedSearchCV(BaseSearchCV): param_distributions. """ + _required_parameters = ["estimator", "param_distributions"] def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, n_jobs=None, iid='warn', refit=True, diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index f7076cd117921..1cf80f8142f29 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -40,6 +40,7 @@ import itertools from .base import BaseEstimator, ClassifierMixin, clone, is_classifier +from .base import MultiOutputMixin from .base import MetaEstimatorMixin, is_regressor from .preprocessing import LabelBinarizer from .metrics.pairwise import euclidean_distances @@ -129,7 +130,8 @@ def predict_proba(self, X): X.shape[0], axis=0) -class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): +class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin, + MultiOutputMixin): """One-vs-the-rest (OvR) multiclass/multilabel strategy Also known as one-vs-all, this strategy consists in fitting one classifier @@ -628,6 +630,10 @@ def _pairwise(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" return getattr(self.estimator, "_pairwise", False) + def _more_tags(self): + # FIXME Remove once #10440 is merged + return {'_skip_test': True} + class OutputCodeClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): """(Error-Correcting) Output-Code multiclass strategy diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index cabecf3689b26..9329da2368ad0 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -144,7 +144,8 @@ def fit(self, X, y, sample_weight=None): """ if not hasattr(self.estimator, "fit"): - raise ValueError("The base estimator should implement a fit method") + raise ValueError("The base estimator should implement" + " a fit method") X, y = check_X_y(X, y, multi_output=True, @@ -195,6 +196,9 @@ def predict(self, X): return np.asarray(y).T + def _more_tags(self): + return {'multioutput_only': True} + class MultiOutputRegressor(MultiOutputEstimator, RegressorMixin): """Multi target regression @@ -366,6 +370,10 @@ def score(self, X, y): y_pred = self.predict(X) return np.mean(np.all(y == y_pred, axis=1)) + def _more_tags(self): + # FIXME + return {'_skip_test': True} + class _BaseChain(BaseEstimator, metaclass=ABCMeta): def __init__(self, base_estimator, order=None, cv=None, random_state=None): @@ -627,6 +635,9 @@ def 
decision_function(self, X): return Y_decision + def _more_tags(self): + return {'_skip_test': True} + class RegressorChain(_BaseChain, RegressorMixin, MetaEstimatorMixin): """A multi-label model that arranges regressions into a chain. @@ -709,3 +720,7 @@ def fit(self, X, Y): """ super().fit(X, Y) return self + + def _more_tags(self): + # FIXME + return {'_skip_test': True} diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6554c465f58b5..5427c00027f1d 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -623,6 +623,9 @@ def _get_intercept(self): coef_ = property(_get_coef) intercept_ = property(_get_intercept) + def _more_tags(self): + return {'poor_score': True} + class MultinomialNB(BaseDiscreteNB): """ diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 77cea86560177..f3d2e352a4a09 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,7 +17,7 @@ from .ball_tree import BallTree from .kd_tree import KDTree -from ..base import BaseEstimator +from ..base import BaseEstimator, MultiOutputMixin from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices @@ -101,7 +101,7 @@ def _get_weights(dist, weights): "'distance', or a callable function") -class NeighborsBase(BaseEstimator, metaclass=ABCMeta): +class NeighborsBase(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta): """Base class for nearest neighbors estimators.""" @abstractmethod diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 4eb1d6076c966..eeba9857205af 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -113,6 +113,7 @@ class Pipeline(_BaseComposition): """ # BaseEstimator interface + _required_parameters = ['steps'] def __init__(self, steps, memory=None): self.steps = steps @@ -676,6 +677,8 @@ class FeatureUnion(_BaseComposition, TransformerMixin): array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) """ + _required_parameters = ["transformer_list"] + def __init__(self, transformer_list, n_jobs=None, transformer_weights=None): self.transformer_list = transformer_list diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a03da8c653d06..be3e8a9967cfe 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -847,3 +847,6 @@ def inverse_transform(self, X): X_tr[:, i] = self.categories_[i][labels] return X_tr + + def _more_tags(self): + return {'X_types': ['categorical']} diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 66034f6740a8e..d157b2f9bb0f4 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -143,6 +143,8 @@ def transform(self, X): X : array-like, shape (n_samples, n_features) Input array. + + Returns ------- X_out : array-like, shape (n_samples, n_features) @@ -158,6 +160,8 @@ def inverse_transform(self, X): X : array-like, shape (n_samples, n_features) Input array. 
+ + Returns ------- X_out : array-like, shape (n_samples, n_features) @@ -173,3 +177,7 @@ def _transform(self, X, func=None, kw_args=None): func = _identity return func(X, **(kw_args if kw_args else {})) + + def _more_tags(self): + return {'no_validation': True, + 'stateless': True} diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index af7010f927cd0..696c6cb7afacd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -407,6 +407,9 @@ def inverse_transform(self, X): X /= self.scale_ return X + def _more_tags(self): + return {'allow_nan': True} + def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): """Transforms features by scaling each feature to a given range. @@ -808,6 +811,9 @@ def inverse_transform(self, X, copy=None): X += self.mean_ return X + def _more_tags(self): + return {'allow_nan': True} + class MaxAbsScaler(BaseEstimator, TransformerMixin): """Scale each feature by its maximum absolute value. @@ -975,6 +981,9 @@ def inverse_transform(self, X): X *= self.scale_ return X + def _more_tags(self): + return {'allow_nan': True} + def maxabs_scale(X, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. @@ -1230,6 +1239,9 @@ def inverse_transform(self, X): X += self.center_ return X + def _more_tags(self): + return {'allow_nan': True} + def robust_scale(X, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): @@ -1722,6 +1734,9 @@ def transform(self, X, copy=None): X = check_array(X, accept_sparse='csr') return normalize(X, norm=self.norm, axis=1, copy=copy) + def _more_tags(self): + return {'stateless': True} + def binarize(X, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix @@ -1854,6 +1869,9 @@ def transform(self, X, copy=None): copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy) + def _more_tags(self): + return {'stateless': True} + class KernelCenterer(BaseEstimator, TransformerMixin): """Center a kernel matrix @@ -2380,6 +2398,9 @@ def inverse_transform(self, X): return self._transform(X, inverse=True) + def _more_tags(self): + return {'allow_nan': True} + def quantile_transform(X, axis=0, n_quantiles=1000, output_distribution='uniform', @@ -2841,6 +2862,9 @@ def _check_input(self, X, check_positive=False, check_shape=False, return X + def _more_tags(self): + return {'allow_nan': True} + def power_transform(X, method='warn', standardize=True, copy=True): """ diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 380af58cc1d40..80337b1e14dcc 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -367,3 +367,6 @@ def transform(self, X): X[coordinates] = values return X + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 38b44d71359d2..f7cffa1e663b5 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -101,7 +101,11 @@ def _encode(values, uniques=None, encode=False): """ if values.dtype == object: - return _encode_python(values, uniques, encode) + try: + res = _encode_python(values, uniques, encode) + except TypeError: + raise TypeError("argument must be a string or number") + return res else: return _encode_numpy(values, uniques, encode) @@ -278,6 +282,9 @@ def inverse_transform(self, y): y = np.asarray(y) return self.classes_[y] + def _more_tags(self): + return 
{'X_types': ['1dlabels']} + class LabelBinarizer(BaseEstimator, TransformerMixin): """Binarize labels in a one-vs-all fashion @@ -511,6 +518,9 @@ def inverse_transform(self, Y, threshold=None): return y_inv + def _more_tags(self): + return {'X_types': ['1dlabels']} + def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): """Binarize labels in a one-vs-all fashion @@ -977,3 +987,6 @@ def inverse_transform(self, yt): 'Also got {0}'.format(unexpected)) return [tuple(self.classes_.compress(indicators)) for indicators in yt] + + def _more_tags(self): + return {'X_types': ['2dlabels']} diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index ec1c1356c3367..cf1f9739d6384 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -3,6 +3,7 @@ import numpy as np import scipy.sparse as sp +import pytest import sklearn from sklearn.utils.testing import assert_array_equal @@ -49,6 +50,25 @@ def __init__(self, a=None, b=None): self.b = b +class NaNTag(BaseEstimator): + def _more_tags(self): + return {'allow_nan': True} + + +class NoNaNTag(BaseEstimator): + def _more_tags(self): + return {'allow_nan': False} + + +class OverrideTag(NaNTag): + def _more_tags(self): + return {'allow_nan': False} + + +class DiamondOverwriteTag(NaNTag, NoNaNTag): + pass + + class ModifyInitParams(BaseEstimator): """Deprecated behavior. Equal parameters but with a type cast. @@ -449,3 +469,20 @@ def test_pickling_works_when_getstate_is_overwritten_in_the_child_class(): estimator_restored = pickle.loads(serialized) assert_equal(estimator_restored.attribute_pickled, 5) assert_equal(estimator_restored._attribute_not_pickled, None) + + +def test_tag_inheritance(): + # test that changing tags by inheritance is not allowed + + nan_tag_est = NaNTag() + no_nan_tag_est = NoNaNTag() + assert nan_tag_est._get_tags()['allow_nan'] + assert not no_nan_tag_est._get_tags()['allow_nan'] + + invalid_tags_est = OverrideTag() + with pytest.raises(TypeError, match="Inconsistent values for tag"): + invalid_tags_est._get_tags() + + diamond_tag_est = DiamondOverwriteTag() + with pytest.raises(TypeError, match="Inconsistent values for tag"): + diamond_tag_est._get_tags() diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 95b142e3deb81..4f9eb34527e05 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -18,18 +18,21 @@ from sklearn.utils.testing import clean_warning_registry from sklearn.utils.testing import all_estimators from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_in from sklearn.utils.testing import ignore_warnings -from sklearn.exceptions import ConvergenceWarning +from sklearn.exceptions import ConvergenceWarning, SkipTestWarning import sklearn +from sklearn.base import RegressorMixin from sklearn.cluster.bicluster import BiclusterMixin from sklearn.linear_model.base import LinearClassifierMixin +from sklearn.linear_model import Ridge +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.utils import IS_PYPY from sklearn.utils.estimator_checks import ( _yield_all_checks, + _safe_tags, set_checking_parameters, check_parameters_default_constructible, check_no_attributes_set_in_init, @@ -44,38 +47,46 @@ def test_all_estimator_no_base_class(): assert not name.lower().startswith('base'), msg -def test_all_estimators(): - estimators = all_estimators(include_meta_estimators=True) - - # Meta sanity-check to make sure that 
the estimator introspection runs - # properly - assert_greater(len(estimators), 0) - - @pytest.mark.parametrize( 'name, Estimator', - all_estimators(include_meta_estimators=True) + all_estimators() ) def test_parameters_default_constructible(name, Estimator): # Test that estimators are default-constructible check_parameters_default_constructible(name, Estimator) -def _tested_non_meta_estimators(): +def _tested_estimators(): for name, Estimator in all_estimators(): if issubclass(Estimator, BiclusterMixin): continue if name.startswith("_"): continue - yield name, Estimator + # FIXME _skip_test should be used here (if we could) + + required_parameters = getattr(Estimator, "_required_parameters", []) + if len(required_parameters): + if required_parameters in (["estimator"], ["base_estimator"]): + if issubclass(Estimator, RegressorMixin): + estimator = Estimator(Ridge()) + else: + estimator = Estimator(LinearDiscriminantAnalysis()) + else: + warnings.warn("Can't instantiate estimator {} which requires " + "parameters {}".format(name, + required_parameters), + SkipTestWarning) + continue + else: + estimator = Estimator() + yield name, estimator def _generate_checks_per_estimator(check_generator, estimators): with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - for name, Estimator in estimators: - estimator = Estimator() + for name, estimator in estimators: for check in check_generator(name, estimator): - yield name, Estimator, check + yield estimator, check def _rename_partial(val): @@ -83,30 +94,38 @@ def _rename_partial(val): kwstring = "".join(["{}={}".format(k, v) for k, v in val.keywords.items()]) return "{}({})".format(val.func.__name__, kwstring) + # FIXME once we have short reprs we can use them here! + if hasattr(val, "get_params") and not isinstance(val, type): + return type(val).__name__ @pytest.mark.parametrize( - "name, Estimator, check", + "estimator, check", _generate_checks_per_estimator(_yield_all_checks, - _tested_non_meta_estimators()), + _tested_estimators()), ids=_rename_partial ) -def test_non_meta_estimators(name, Estimator, check): - # Common tests for non-meta estimators +def test_estimators(estimator, check): + # Common tests for estimator instances with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning, UserWarning, FutureWarning)): - estimator = Estimator() set_checking_parameters(estimator) + name = estimator.__class__.__name__ check(name, estimator) -@pytest.mark.parametrize("name, Estimator", - _tested_non_meta_estimators()) -def test_no_attributes_set_in_init(name, Estimator): - # input validation etc for non-meta estimators +@pytest.mark.parametrize("name, estimator", + _tested_estimators()) +def test_no_attributes_set_in_init(name, estimator): + # input validation etc for all estimators with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning, UserWarning, FutureWarning)): - estimator = Estimator() + tags = _safe_tags(estimator) + if tags['_skip_test']: + warnings.warn("Explicit SKIP via _skip_test tag for " + "{}.".format(name), + SkipTestWarning) + return # check this on class check_no_attributes_set_in_init(name, estimator) @@ -143,6 +162,11 @@ def _tested_linear_classifiers(): clean_warning_registry() with warnings.catch_warnings(record=True): for name, clazz in classifiers: + required_parameters = getattr(clazz, "_required_parameters", []) + if len(required_parameters): + # FIXME + continue + if ('class_weight' in clazz().get_params().keys() and issubclass(clazz, LinearClassifierMixin)): yield name, clazz diff 
--git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f377df78c6952..a07e6a0ca5d9a 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -29,6 +29,7 @@ from ..base import ClassifierMixin from ..base import RegressorMixin from ..base import is_classifier +from ..base import MultiOutputMixin from ..utils import check_array from ..utils import check_random_state from ..utils import compute_sample_weight @@ -70,7 +71,7 @@ # ============================================================================= -class BaseDecisionTree(BaseEstimator, metaclass=ABCMeta): +class BaseDecisionTree(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta): """Base class for decision trees. Warning: This class should not be used directly. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6c4196d919aa0..6b411fca2a2a4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -11,9 +11,8 @@ from scipy import sparse from scipy.stats import rankdata -from sklearn.utils import IS_PYPY, _IS_32BIT +from sklearn.utils import IS_PYPY from sklearn.utils import _joblib -from sklearn.utils._joblib import Memory from sklearn.utils.testing import assert_raises, _get_args from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_raise_message @@ -22,10 +21,10 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_in from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_allclose_dense_sparse from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import META_ESTIMATORS from sklearn.utils.testing import set_random_state from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_greater_equal @@ -35,16 +34,16 @@ from sklearn.utils.testing import create_memmap_backed_data from sklearn.utils import is_scalar_nan from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.linear_model import Ridge -from sklearn.base import (clone, ClusterMixin, - is_classifier, is_regressor, is_outlier_detector) +from sklearn.base import (clone, ClusterMixin, is_classifier, is_regressor, + _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest -from sklearn.svm.base import BaseLibSVM from sklearn.linear_model.stochastic_gradient import BaseSGD from sklearn.pipeline import make_pipeline from sklearn.exceptions import DataConversionWarning @@ -63,50 +62,48 @@ BOSTON = None CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] -MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet', - 'ExtraTreeRegressor', 'ExtraTreesRegressor', - 'GaussianProcessRegressor', 'TransformedTargetRegressor', - 'KNeighborsRegressor', 'KernelRidge', 'Lars', 'Lasso', - 'LassoLars', 'LinearRegression', 'MultiTaskElasticNet', - 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', - 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', - 'RANSACRegressor', 'RadiusNeighborsRegressor', - 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', - 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 
'PowerTransformer', 'QuantileTransformer', 'IterativeImputer'] -SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator'] +def _safe_tags(estimator, key=None): + # if estimator doesn't have _get_tags, use _DEFAULT_TAGS + # if estimator has tags but not key, use _DEFAULT_TAGS[key] + if hasattr(estimator, "_get_tags"): + if key is not None: + return estimator._get_tags().get(key, _DEFAULT_TAGS[key]) + tags = estimator._get_tags() + return {key: tags.get(key, _DEFAULT_TAGS[key]) + for key in _DEFAULT_TAGS.keys()} + if key is not None: + return _DEFAULT_TAGS[key] + return _DEFAULT_TAGS -def _yield_non_meta_checks(name, estimator): +def _yield_checks(name, estimator): + tags = _safe_tags(estimator) yield check_estimators_dtypes yield check_fit_score_takes_y - yield check_dtype_object yield check_sample_weights_pandas_series yield check_sample_weights_list yield check_sample_weights_invariance yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) - yield check_complex_data # Check that all estimator yield informative messages when # trained on empty datasets - yield check_estimators_empty_data_messages + if not tags["no_validation"]: + yield check_complex_data + yield check_dtype_object + yield check_estimators_empty_data_messages - if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']: - # SpectralEmbedding is non-deterministic, - # see issue #4236 + if name not in CROSS_DECOMPOSITION: # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if name not in ALLOW_NAN: + if not tags["allow_nan"] and not tags["no_validation"]: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf yield check_estimators_overwrite_params - if hasattr(estimator, 'sparsify'): yield check_sparsify_coefficients @@ -118,6 +115,8 @@ def _yield_non_meta_checks(name, estimator): def _yield_classifier_checks(name, classifier): + tags = _safe_tags(classifier) + # test classifiers can handle non-array data yield check_classifier_data_not_an_array # test classifiers trained on a single label always return this label @@ -128,15 +127,9 @@ def _yield_classifier_checks(name, classifier): yield check_classifiers_train yield partial(check_classifiers_train, readonly_memmap=True) yield check_classifiers_regression_target - if (name not in ["MultinomialNB", "ComplementNB", "LabelPropagation", - "LabelSpreading"] and - # TODO some complication with -1 label - name not in ["DecisionTreeClassifier", "ExtraTreeClassifier"]): - # We don't raise a warning in these classifiers, as - # the column y interface is used by the forests. 
- + if not tags["no_validation"]: + yield check_supervised_y_no_nan yield check_supervised_y_2d - yield check_supervised_y_no_nan yield check_estimators_unfitted if 'class_weight' in classifier.get_params().keys(): yield check_class_weight_classifiers @@ -170,6 +163,7 @@ def check_supervised_y_no_nan(name, estimator_orig): def _yield_regressor_checks(name, regressor): + tags = _safe_tags(regressor) # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -178,29 +172,25 @@ def _yield_regressor_checks(name, regressor): yield check_regressor_data_not_an_array yield check_estimators_partial_fit_n_features yield check_regressors_no_decision_function - yield check_supervised_y_2d + if not tags["no_validation"]: + yield check_supervised_y_2d yield check_supervised_y_no_nan if name != 'CCA': # check that the regressor handles int input yield check_regressors_int - if name != "GaussianProcessRegressor": - # test if NotFittedError is raised - yield check_estimators_unfitted + yield check_estimators_unfitted yield check_non_transformer_estimators_n_iter def _yield_transformer_checks(name, transformer): # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message - if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer', - 'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']: - yield check_transformer_data_not_an_array + yield check_transformer_data_not_an_array # these don't actually fit the data, so don't raise errors - if name not in ['AdditiveChi2Sampler', 'Binarizer', - 'FunctionTransformer', 'Normalizer']: - # basic tests - yield check_transformer_general - yield partial(check_transformer_general, readonly_memmap=True) + yield check_transformer_general + yield partial(check_transformer_general, readonly_memmap=True) + + if not _safe_tags(transformer, "stateless"): yield check_transformers_unfitted # Dependent on external solvers and hence accessing the iter # param is non-trivial. 
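All of the ``_yield_*`` generators above funnel through ``_safe_tags``, so its fallback behaviour is worth spelling out. A minimal sketch of the semantics under the defaults in ``_DEFAULT_TAGS`` (``MinimalEstimator`` is a hypothetical class, used only to illustrate an object without ``_get_tags``)::

    from sklearn.base import BaseEstimator
    from sklearn.utils.estimator_checks import _safe_tags

    class MinimalEstimator:
        # Does not inherit from BaseEstimator, so no _get_tags method.
        def fit(self, X, y=None):
            return self

    # No _get_tags: _safe_tags falls back to _DEFAULT_TAGS entirely.
    assert _safe_tags(MinimalEstimator(), key="allow_nan") is False

    # BaseEstimator subclasses report their merged tag dictionary.
    assert _safe_tags(BaseEstimator(), key="X_types") == ["2darray"]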
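Conversely, an estimator now opts out of (or into) individual checks by declaring tags, instead of being hard-coded in string lists such as the removed ``ALLOW_NAN``. A sketch of a transformer advertising NaN support (``NanPassthrough`` is hypothetical)::

    from sklearn.base import BaseEstimator, TransformerMixin

    class NanPassthrough(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X

        def _more_tags(self):
            # Merged over the defaults by BaseEstimator._get_tags();
            # with allow_nan=True, check_estimators_nan_inf is not
            # yielded for this estimator.
            return {'allow_nan': True}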
@@ -238,7 +228,19 @@ def _yield_outliers_checks(name, estimator): def _yield_all_checks(name, estimator): - for check in _yield_non_meta_checks(name, estimator): + tags = _safe_tags(estimator) + if "2darray" not in tags["X_types"]: + warnings.warn("Can't test estimator {} which requires input " + " of type {}".format(name, tags["X_types"]), + SkipTestWarning) + return + if tags["_skip_test"]: + warnings.warn("Explicit SKIP via _skip_test tag for estimator " + "{}.".format(name), + SkipTestWarning) + return + + for check in _yield_checks(name, estimator): yield check if is_classifier(estimator): for check in _yield_classifier_checks(name, estimator): @@ -322,8 +324,8 @@ def set_checking_parameters(estimator): # set parameters to speed up some estimators and # avoid deprecated behaviour params = estimator.get_params() - if ("n_iter" in params and estimator.__class__.__name__ != "TSNE" - and not isinstance(estimator, BaseSGD)): + name = estimator.__class__.__name__ + if ("n_iter" in params and name != "TSNE"): estimator.set_params(n_iter=5) if "max_iter" in params: if estimator.max_iter is not None: @@ -355,15 +357,26 @@ def set_checking_parameters(estimator): if "n_init" in params: # K-Means estimator.set_params(n_init=2) - if "decision_function_shape" in params: - # SVC - estimator.set_params(decision_function_shape='ovo') - if estimator.__class__.__name__ == "SelectFdr": + if hasattr(estimator, "n_components"): + estimator.n_components = 2 + + if name == 'TruncatedSVD': + # TruncatedSVD doesn't run with n_components = n_features + # This is ugly :-/ + estimator.n_components = 1 + + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = min(estimator.n_clusters, 2) + + if hasattr(estimator, "n_best"): + estimator.n_best = 1 + + if name == "SelectFdr": # be tolerant of noisy datasets (not actually speed) estimator.set_params(alpha=.5) - if estimator.__class__.__name__ == "TheilSenRegressor": + if name == "TheilSenRegressor": estimator.max_subpopulation = 100 if estimator.__class__.__name__ == "IsolationForest": @@ -494,17 +507,19 @@ def check_estimator_sparse_data(name, estimator_orig): for matrix_format, X in _generate_sparse_matrix(X_csr): # catch deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + estimator = clone(estimator_orig) if name in ['Scaler', 'StandardScaler']: - estimator = clone(estimator).set_params(with_mean=False) - else: - estimator = clone(estimator) + estimator.set_params(with_mean=False) # fit and predict try: with ignore_warnings(category=(DeprecationWarning, FutureWarning)): estimator.fit(X, y) if hasattr(estimator, "predict"): pred = estimator.predict(X) - assert_equal(pred.shape, (X.shape[0],)) + if _safe_tags(estimator, "multioutput_only"): + assert_equal(pred.shape, (X.shape[0], 1)) + else: + assert_equal(pred.shape, (X.shape[0],)) if hasattr(estimator, 'predict_proba'): probs = estimator.predict_proba(X) assert_equal(probs.shape, (X.shape[0], 4)) @@ -541,6 +556,8 @@ def check_sample_weights_pandas_series(name, estimator_orig): X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2]) weights = pd.Series([1] * 8) + if _safe_tags(estimator, "multioutput_only"): + y = pd.DataFrame(y) try: estimator.fit(X, y, sample_weight=weights) except ValueError: @@ -588,6 +605,7 @@ def check_sample_weights_invariance(name, estimator_orig): [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + y = 
multioutput_estimator_convert_y_2d(estimator1, y) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) @@ -596,6 +614,9 @@ def check_sample_weights_invariance(name, estimator_orig): if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) + if sparse.issparse(X_pred1): + X_pred1 = X_pred1.toarray() + X_pred2 = X_pred2.toarray() assert_allclose(X_pred1, X_pred2, err_msg="For %s sample_weight=None is not" " equivalent to sample_weight=ones" @@ -625,9 +646,10 @@ def check_dtype_object(name, estimator_orig): if "Unknown label type" not in str(e): raise - if name not in SUPPORT_STRING: + tags = _safe_tags(estimator) + if 'str' not in tags['X_types']: X[0, 0] = {'foo': 'bar'} - msg = "argument must be a string or a number" + msg = "argument must be a string.* number" assert_raises_regex(TypeError, msg, estimator.fit, X, y) else: # Estimators supporting string will not call np.asarray to convert the @@ -761,6 +783,10 @@ def check_fit2d_predict1d(name, estimator_orig): set_random_state(estimator, 1) estimator.fit(X, y) + tags = _safe_tags(estimator) + if tags["no_validation"]: + # FIXME this is a bit loose + return for method in ["predict", "transform", "decision_function", "predict_proba"]: @@ -812,8 +838,10 @@ def check_methods_subset_invariance(name, estimator_orig): "to a subset.").format(method=method, name=name) # TODO remove cases when corrected if (name, method) in [('SVC', 'decision_function'), + ('NuSVC', 'decision_function'), ('SparsePCA', 'transform'), ('MiniBatchSparsePCA', 'transform'), + ('DummyClassifier', 'predict'), ('BernoulliRBM', 'score_samples')]: raise SkipTest(msg) @@ -893,6 +921,10 @@ def check_fit1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20)) y = X.astype(np.int) estimator = clone(estimator_orig) + tags = _safe_tags(estimator) + if tags["no_validation"]: + # FIXME this is a bit loose + return y = multioutput_estimator_convert_y_2d(estimator, y) if hasattr(estimator, "n_components"): @@ -944,14 +976,6 @@ def check_transformers_unfitted(name, transformer): def _check_transformer(name, transformer_orig, X, y): - if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _IS_32BIT: - # Those transformers yield non-deterministic output when executed on - # a 32bit Python. The same transformers are stable on 64bit Python. - # FIXME: try to isolate a minimalistic reproduction case only depending - # on numpy & scipy and/or maybe generate a test dataset that does not - # cause such unstable behaviors. 
- msg = name + ' is non deterministic on 32bit Python' - raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape transformer = clone(transformer_orig) set_random_state(transformer) @@ -983,6 +1007,10 @@ def _check_transformer(name, transformer_orig, X, y): else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) + + if _safe_tags(transformer_orig, 'non_deterministic'): + msg = name + ' is non deterministic' + raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_allclose_dense_sparse( @@ -1010,7 +1038,7 @@ def _check_transformer(name, transformer_orig, X, y): assert_equal(_num_samples(X_pred3), n_samples) # raises error on malformed input for transform - if hasattr(X, 'T'): + if hasattr(X, 'T') and not _safe_tags(transformer, "stateless"): # If it's not an array, it does not have a 'T' property with assert_raises(ValueError, msg="The transformer {} does " "not raise an error when the number of " @@ -1022,13 +1050,8 @@ def _check_transformer(name, transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _IS_32BIT: - # Those transformers yield non-deterministic output when executed on - # a 32bit Python. The same transformers are stable on 64bit Python. - # FIXME: try to isolate a minimalistic reproduction case only depending - # scipy and/or maybe generate a test dataset that does not - # cause such unstable behaviors. - msg = name + ' is non deterministic on 32bit Python' + if _safe_tags(estimator_orig, 'non_deterministic'): + msg = name + ' is non deterministic' raise SkipTest(msg) # check that make_pipeline(est) gives same score as est @@ -1212,8 +1235,9 @@ def check_estimators_pickle(name, estimator_orig): X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) + tags = _safe_tags(estimator_orig) # include NaN values when the estimator should deal with them - if name in ALLOW_NAN: + if tags['allow_nan']: # set randomly 10 elements to np.nan rng = np.random.RandomState(42) mask = rng.choice(X.size, 10, replace=False) @@ -1303,9 +1327,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert_equal(pred.shape, (n_samples,)) assert_greater(adjusted_rand_score(pred, y), 0.4) - # fit another time with ``fit_predict`` and compare results - if name == 'SpectralClustering': - # there is no way to make Spectral clustering deterministic :( + if _safe_tags(clusterer, 'non_deterministic'): return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -1340,12 +1362,10 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig): """Check that predict is invariant of compute_labels""" X, y = make_blobs(n_samples=20, random_state=0) clusterer = clone(clusterer_orig) + set_random_state(clusterer) if hasattr(clusterer, "compute_labels"): # MiniBatchKMeans - if hasattr(clusterer, "random_state"): - clusterer.set_params(random_state=0) - X_pred1 = clusterer.fit(X).predict(X) clusterer.set_params(compute_labels=False) X_pred2 = clusterer.fit(X).predict(X) @@ -1394,6 +1414,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False): # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] + tags = _safe_tags(classifier_orig) if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB']: X_m -= X_m.min() @@ -1407,15 +1428,20 @@ def 
check_classifiers_train(name, classifier_orig, readonly_memmap=False): n_classes = len(classes) n_samples, n_features = X.shape classifier = clone(classifier_orig) - X = pairwise_estimator_convert_X(X, classifier_orig) + X = pairwise_estimator_convert_X(X, classifier) + y = multioutput_estimator_convert_y_2d(classifier, y) + set_random_state(classifier) # raises error on malformed input for fit - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when incorrect/malformed input " - "data for fit is passed. The number of training " - "examples is not the same as the number of labels." - " Perhaps use check_X_y in fit.".format(name)): - classifier.fit(X, y[:-1]) + if not tags["no_validation"]: + with assert_raises( + ValueError, + msg="The classifier {} does not " + "raise an error when incorrect/malformed input " + "data for fit is passed. The number of training " + "examples is not the same as the number of labels. " + "Perhaps use check_X_y in fit.".format(name)): + classifier.fit(X, y[:-1]) # fit classifier.fit(X, y) @@ -1423,78 +1449,75 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False): classifier.fit(X.tolist(), y.tolist()) assert hasattr(classifier, "classes_") y_pred = classifier.predict(X) + assert_equal(y_pred.shape, (n_samples,)) # training set performance - if name not in ['BernoulliNB', 'MultinomialNB', 'ComplementNB']: + if not tags['poor_score']: assert_greater(accuracy_score(y, y_pred), 0.83) # raises error on malformed input for predict - if _is_pairwise(classifier): - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when shape of X" - "in predict is not equal to (n_test_samples," - "n_training_samples)".format(name)): - classifier.predict(X.reshape(-1, 1)) - else: - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of features " - "in predict is different from the number of" - " features in fit.".format(name)): - classifier.predict(X.T) + msg_pairwise = ( + "The classifier {} does not raise an error when shape of X in " + " {} is not equal to (n_test_samples, n_training_samples)") + msg = ("The classifier {} does not raise an error when the number of " + "features in {} is different from the number of features in " + "fit.") + + if not tags["no_validation"]: + if _is_pairwise(classifier): + with assert_raises(ValueError, + msg=msg_pairwise.format(name, "predict")): + classifier.predict(X.reshape(-1, 1)) + else: + with assert_raises(ValueError, + msg=msg.format(name, "predict")): + classifier.predict(X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict decision = classifier.decision_function(X) if n_classes == 2: - assert_equal(decision.shape, (n_samples,)) + if not tags["multioutput_only"]: + assert_equal(decision.shape, (n_samples,)) + else: + assert_equal(decision.shape, (n_samples, 1)) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) - if (n_classes == 3 and - # 1on1 of LibSVM works differently - not isinstance(classifier, BaseLibSVM)): + else: assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function - if _is_pairwise(classifier): - with assert_raises(ValueError, msg="The classifier {} does" - " not raise an error when the " - "shape of X in decision_function is " - "not equal to (n_test_samples, " - "n_training_samples) in fit." 
- .format(name)): - classifier.decision_function(X.reshape(-1, 1)) - else: - with assert_raises(ValueError, msg="The classifier {} does" - " not raise an error when the number " - "of features in decision_function is " - "different from the number of features" - " in fit.".format(name)): - classifier.decision_function(X.T) + if not tags["no_validation"]: + if _is_pairwise(classifier): + with assert_raises(ValueError, msg=msg_pairwise.format( + name, "decision_function")): + classifier.decision_function(X.reshape(-1, 1)) + else: + with assert_raises(ValueError, msg=msg.format( + name, "decision_function")): + classifier.decision_function(X.T) except NotImplementedError: pass + if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one - assert_allclose(np.sum(y_prob, axis=1), np.ones(n_samples)) - # raises error on malformed input for predict_proba - if _is_pairwise(classifier_orig): - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the shape of X" - "in predict_proba is not equal to " - "(n_test_samples, n_training_samples)." - .format(name)): - classifier.predict_proba(X.reshape(-1, 1)) - else: - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of " - "features in predict_proba is different " - "from the number of features in fit." - .format(name)): - classifier.predict_proba(X.T) + assert_array_almost_equal(np.sum(y_prob, axis=1), + np.ones(n_samples)) + if not tags["no_validation"]: + # raises error on malformed input for predict_proba + if _is_pairwise(classifier_orig): + with assert_raises(ValueError, msg=msg_pairwise.format( + name, "predict_proba")): + classifier.predict_proba(X.reshape(-1, 1)) + else: + with assert_raises(ValueError, msg=msg.format( + name, "predict_proba")): + classifier.predict_proba(X.T) if hasattr(classifier, "predict_log_proba"): # predict_log_proba is a transformation of predict_proba y_log_prob = classifier.predict_log_proba(X) @@ -1604,8 +1627,22 @@ def check_estimators_unfitted(name, estimator_orig): estimator = clone(estimator_orig) msg = "fit" - if hasattr(estimator, 'predict'): + can_predict = False + try: + # some models can predict without fitting + # like GaussianProcess regressors + # in this case, we skip this test + pred = estimator.predict(X) + assert pred.shape[0] == X.shape[0] + can_predict = True + except ValueError: + pass + if can_predict: + raise SkipTest( + "{} can predict without fitting, skipping " + "check_estimator_unfitted.".format(name)) + assert_raise_message((AttributeError, ValueError), msg, estimator.predict, X) @@ -1624,7 +1661,7 @@ def check_estimators_unfitted(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_supervised_y_2d(name, estimator_orig): - if "MultiTask" in name: + if _safe_tags(estimator_orig, "multioutput_only"): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) @@ -1646,7 +1683,7 @@ def check_supervised_y_2d(name, estimator_orig): y_pred_2d = estimator.predict(X) msg = "expected 1 DataConversionWarning, got: %s" % ( ", ".join([str(w_x) for w_x in w])) - if name not in MULTI_OUTPUT: + if not _safe_tags(estimator, "multioutput"): # check that we warned if we don't support multi-output assert_greater(len(w), 0, msg) assert 
"DataConversionWarning('A column-vector y" \ @@ -1805,7 +1842,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False): # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'): + if not _safe_tags(regressor, "poor_score"): assert_greater(regressor.score(X, y_), 0.5) @@ -1967,7 +2004,6 @@ def check_estimators_overwrite_params(name, estimator_orig): % (name, param_name, original_value, new_value)) -@ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_no_attributes_set_in_init(name, estimator): """Check setting during init. """ @@ -2066,12 +2102,20 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y): def check_parameters_default_constructible(name, Estimator): # this check works on classes, not instances - classifier = LinearDiscriminantAnalysis() # test default-constructibility # get rid of deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - if name in META_ESTIMATORS: - estimator = Estimator(classifier) + required_parameters = getattr(Estimator, "_required_parameters", []) + if required_parameters: + if required_parameters in (["base_estimator"], ["estimator"]): + if issubclass(Estimator, RegressorMixin): + estimator = Estimator(Ridge()) + else: + estimator = Estimator(LinearDiscriminantAnalysis()) + else: + raise SkipTest("Can't instantiate estimator {} which" + " requires parameters {}".format( + name, required_parameters)) else: estimator = Estimator() # test cloning @@ -2105,8 +2149,7 @@ def param_filter(p): # true for mixins return params = estimator.get_params() - - if name in META_ESTIMATORS: + if required_parameters == ["estimator"]: # they can need a non-default argument init_params = init_params[1:] @@ -2114,9 +2157,12 @@ def param_filter(p): assert_not_equal(init_param.default, init_param.empty, "parameter %s for %s has no default value" % (init_param.name, type(estimator).__name__)) - assert_in(type(init_param.default), - [str, int, float, bool, tuple, type(None), - np.float64, types.FunctionType, Memory]) + if type(init_param.default) is type: + assert_in(init_param.default, [np.float64, np.int64]) + else: + assert_in(type(init_param.default), + [str, int, float, bool, tuple, type(None), + np.float64, types.FunctionType, _joblib.Memory]) if init_param.name not in params.keys(): # deprecated parameter, not in get_params assert init_param.default is None @@ -2141,7 +2187,7 @@ def param_filter(p): def multioutput_estimator_convert_y_2d(estimator, y): # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. - if "MultiTask" in estimator.__class__.__name__: + if _safe_tags(estimator, "multioutput_only"): return np.reshape(y, (-1, 1)) return y @@ -2283,7 +2329,8 @@ def check_classifiers_regression_target(name, estimator_orig): X, y = boston.data, boston.target e = clone(estimator_orig) msg = 'Unknown label type: ' - assert_raises_regex(ValueError, msg, e.fit, X, y) + if not _safe_tags(e, "no_validation"): + assert_raises_regex(ValueError, msg, e.fit, X, y) @ignore_warnings(category=(DeprecationWarning, FutureWarning)) diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index 53c7960786d23..9c059f2ed2ed9 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -132,3 +132,6 @@ def score(self, X=None, Y=None): else: score = 0. 
return score + + def _more_tags(self): + return {'_skip_test': True, 'X_types': ['1dlabels']} diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index a9582ab28fd0c..62bd22f66d918 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -16,7 +16,7 @@ import numpy as np -from .validation import check_array +from .validation import check_array, _assert_all_finite def _unique_multiclass(y): @@ -281,6 +281,7 @@ def type_of_target(y): # check float and contains non-integer float values if y.dtype.kind == 'f' and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + _assert_all_finite(y) return 'continuous' + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 66e0299f1c378..3e12559067411 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -577,38 +577,9 @@ def uninstall_mldata_mock(): datasets.mldata.urlopen = urlopen -# Meta estimators need another estimator to be instantiated. -META_ESTIMATORS = ["OneVsOneClassifier", "MultiOutputEstimator", - "MultiOutputRegressor", "MultiOutputClassifier", - "OutputCodeClassifier", "OneVsRestClassifier", - "RFE", "RFECV", "BaseEnsemble", "ClassifierChain", - "RegressorChain"] -# estimators that there is no way to default-construct sensibly -OTHER = ["Pipeline", "FeatureUnion", - "GridSearchCV", "RandomizedSearchCV", - "SelectFromModel", "ColumnTransformer"] - -# some strange ones -DONT_TEST = ['SparseCoder', 'DictVectorizer', - 'LabelBinarizer', 'LabelEncoder', - 'MultiLabelBinarizer', 'TfidfTransformer', - 'TfidfVectorizer', 'IsotonicRegression', - 'OneHotEncoder', 'RandomTreesEmbedding', 'OrdinalEncoder', - 'FeatureHasher', 'DummyClassifier', 'DummyRegressor', - 'TruncatedSVD', 'PolynomialFeatures', - 'GaussianRandomProjectionHash', 'HashingVectorizer', - 'CheckingClassifier', 'PatchExtractor', 'CountVectorizer', - # GradientBoosting base estimators, maybe should - # exclude them in another way - 'ZeroEstimator', 'ScaledLogOddsEstimator', - 'QuantileEstimator', 'MeanEstimator', - 'LogOddsEstimator', 'PriorProbabilityEstimator', - '_SigmoidCalibration', 'VotingClassifier'] - - -def all_estimators(include_meta_estimators=False, - include_other=False, type_filter=None, - include_dont_test=False): +def all_estimators(include_meta_estimators=None, + include_other=None, type_filter=None, + include_dont_test=None): """Get a list of all estimators from sklearn. This function crawls the module and gets all classes that inherit @@ -619,15 +590,16 @@ def all_estimators(include_meta_estimators=False, Parameters ---------- include_meta_estimators : boolean, default=False - Whether to include meta-estimators that can be constructed using - an estimator as their first argument. These are currently - BaseEnsemble, OneVsOneClassifier, OutputCodeClassifier, - OneVsRestClassifier, RFE, RFECV. + Deprecated, ignored. + .. deprecated:: 0.21 + ``include_meta_estimators`` has been deprecated and has no effect in + 0.21 and will be removed in 0.23. include_other : boolean, default=False - Wether to include meta-estimators that are somehow special and can - not be default-constructed sensibly. These are currently - Pipeline, FeatureUnion and GridSearchCV + Deprecated, ignored. + .. deprecated:: 0.21 + ``include_other`` has been deprecated and has no effect in 0.21 and + will be removed in 0.23.
type_filter : string, list of string, or None, default=None Which kind of estimators should be returned. If None, no filter is @@ -637,7 +609,10 @@ def all_estimators(include_meta_estimators=False, get the estimators that fit at least one of the types. include_dont_test : boolean, default=False - Whether to include "special" label estimator or test processors. + Deprecated, ignored. + .. deprecated:: 0.21 + ``include_dont_test`` has been deprecated and has no effect in 0.21 + and will be removed in 0.23. Returns ------- @@ -652,6 +627,21 @@ def is_abstract(c): return False return True + if include_other is not None: + warnings.warn("include_other was deprecated in version 0.21," + " has no effect and will be removed in 0.23", + DeprecationWarning) + + if include_dont_test is not None: + warnings.warn("include_dont_test was deprecated in version 0.21," + " has no effect and will be removed in 0.23", + DeprecationWarning) + + if include_meta_estimators is not None: + warnings.warn("include_meta_estimators was deprecated in version 0.21," + " has no effect and will be removed in 0.23", + DeprecationWarning) + all_classes = [] # get parent folder path = sklearn.__path__ @@ -674,14 +664,6 @@ def is_abstract(c): # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] - if not include_dont_test: - estimators = [c for c in estimators if not c[0] in DONT_TEST] - - if not include_other: - estimators = [c for c in estimators if not c[0] in OTHER] - # possibly get rid of meta estimators - if not include_meta_estimators: - estimators = [c for c in estimators if not c[0] in META_ESTIMATORS] if type_filter is not None: if not isinstance(type_filter, list): type_filter = [type_filter] diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 8492f7f09cc7d..fa0e0abf77ce5 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -283,7 +283,7 @@ def test_check_estimator(): assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator) assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator()) # check that fit does input validation - msg = "TypeError not raised" + msg = "ValueError not raised" assert_raises_regex(AssertionError, msg, check_estimator, BaseBadClassifier) assert_raises_regex(AssertionError, msg, check_estimator,
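With the gating in place, the public entry point is unchanged for well-behaved estimators; the tags only steer which of the yielded checks actually run. For instance (``Ridge`` is just an arbitrary built-in regressor)::

    from sklearn.linear_model import Ridge
    from sklearn.utils.estimator_checks import check_estimator

    # _yield_all_checks consults _safe_tags(estimator) and skips or
    # adapts checks according to the declared capabilities.
    check_estimator(Ridge)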
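Likewise, meta-estimators formerly enumerated in ``META_ESTIMATORS`` now state their constructor needs via ``_required_parameters``, which ``check_parameters_default_constructible`` uses to instantiate them. A sketch (``HypotheticalChain`` is not a real estimator)::

    from sklearn.base import BaseEstimator, RegressorMixin

    class HypotheticalChain(BaseEstimator, RegressorMixin):
        # The common tests construct HypotheticalChain(Ridge()) here:
        # a regressor whose only required parameter is named
        # 'base_estimator' receives a Ridge instance.
        _required_parameters = ["base_estimator"]

        def __init__(self, base_estimator):
            self.base_estimator = base_estimator

        def fit(self, X, y):
            return self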