[MRG] Estimator tags #8022
@@ -1419,22 +1419,18 @@ advised to maintain notes on the `GitHub wiki

Specific models
---------------
Classifiers should accept ``y`` (target) arguments to ``fit`` that are
sequences (lists, arrays) of either strings or integers. They should not
assume that the class labels are a contiguous range of integers; instead, they
should store a list of classes in a ``classes_`` attribute or property. The
order of class labels in this attribute should match the order in which
``predict_proba``, ``predict_log_proba`` and ``decision_function`` return their
values. The easiest way to achieve this is to put::
    self.classes_, y = np.unique(y, return_inverse=True)

in ``fit``. This returns a new ``y`` that contains class indexes, rather than
labels, in the range [0, ``n_classes``).
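As a quick illustration of the ``np.unique`` idiom above (the label values
here are made up for the example):

```python
import numpy as np

# String labels as they might be passed to ``fit``
y = np.array(["spam", "ham", "spam", "eggs"])

# ``classes_`` holds the sorted unique labels; ``y_encoded`` holds integer
# indexes into ``classes_`` in the range [0, n_classes)
classes_, y_encoded = np.unique(y, return_inverse=True)

print(classes_)   # ['eggs' 'ham' 'spam']
print(y_encoded)  # [2 1 2 0]
```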
A classifier's ``predict`` method should return
arrays containing class labels from ``classes_``.
@@ -1445,14 +1441,89 @@ this can be achieved with::

    D = self.decision_function(X)
    return self.classes_[np.argmax(D, axis=1)]
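A minimal sketch of that mapping from decision values back to labels (the
decision values here are invented for the example):

```python
import numpy as np

classes_ = np.array(["eggs", "ham", "spam"])

# One row of decision values per sample, one column per class
D = np.array([[0.1, 2.0, -1.0],
              [3.0, 0.0, 0.5]])

# argmax picks the column index of the highest score per row; indexing
# with ``classes_`` converts those indexes back to the original labels
pred = classes_[np.argmax(D, axis=1)]
print(pred)  # ['ham' 'eggs']
```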
In linear models, coefficients are stored in an array called ``coef_``, and the
independent term is stored in ``intercept_``. ``sklearn.linear_model.base``
contains a few base classes and mixins that implement common linear model
patterns.

The :mod:`sklearn.utils.multiclass` module contains useful functions
for working with multiclass and multilabel problems.
Estimator Tags
--------------

.. warning::

    The estimator tags are experimental and the API is subject to change.
Scikit-learn introduced estimator tags in version 0.21. These are annotations
of estimators that allow programmatic inspection of their capabilities, such as
sparse matrix support, supported output types and supported methods. The
estimator tags are a dictionary returned by the method ``_get_tags()``. These
tags are used by the common tests and the
:func:`sklearn.utils.estimator_checks.check_estimator` function to decide what
tests to run and what input data is appropriate. Tags can depend on estimator
parameters or even system architecture and can in general only be determined at
runtime.

Review comment: You might want to note that these tags may be dependent on
estimator parameters and even system architecture, and hence are a method on
an instance, rather than a property of the class. You should probably also
define the default implementation and
The default value of all tags except for ``X_types`` is ``False``.

The current set of estimator tags are:
non_deterministic
    whether the estimator is not deterministic given a fixed ``random_state``

requires_positive_data - unused for now
    whether the estimator requires positive X.

no_validation
    whether the estimator skips input-validation. This is only meant for
    stateless and dummy transformers!

multioutput - unused for now
    whether a regressor supports multi-target outputs or a classifier supports
    multi-class multi-output.
multilabel
    whether the estimator supports multilabel output

stateless
    whether the estimator needs access to data for fitting. Even though an
    estimator is stateless, it might still need a call to ``fit`` for
    initialization.

Review thread:

- (should we deprecate the need for a call to fit for initialisation in
  stateless estimators?)
- I don't think we can, because "stateless" can still mean it depends on
- I'm not sure "stateless" is the right word then? We mean "data independent"?
- do we want to have separate tags for data independent and no state at all?
- Dunno. What's the use case? If so, we could consider a ternary tag...
- I guess the main use-case here was that some estimators didn't complain if
  the number of features was different in fit and transform, possibly only
  AdditiveChi2Sampler.
- Hm AdditiveChi2Sampler requires calling
- Opened #12616 to follow up. I don't think there's a good reason for a
  ternary tag. Right now this is used for testing two things: checking that
  calling
allow_nan
    whether the estimator supports data with missing values encoded as np.NaN

Review thread:

- There may be some subtlety to this. What if it supports NaN at transform but
  not at fit (with some parameters)?
- yes, or the other way around?
- I don't think the other way around would be a case of interest. In #11635 we
  identified that you might have a feature selector that could not train on
  missing data (if only because the parameters weren't right) but there's no
  reason it shouldn't transform with missing data.
poor_score
    whether the estimator fails to provide a "reasonable" test-set score,
    which currently for regression is an R2 of 0.5 on a subset of the Boston
    housing dataset, and for classification an accuracy of 0.83 on
    ``make_blobs(n_samples=300, random_state=0)``. These datasets and values
    are based on current estimators in sklearn and might be replaced by
    something more systematic.

multioutput_only
    whether the estimator supports only multi-output classification or
    regression.
_skip_test
    whether to skip common tests entirely. Don't use this unless you have a
    *very good* reason.

Review thread:

- if the _ doesn't mean private, perhaps we can use something like !
- it kinda means private in the sense that no-one should ever use it ;)
Review thread:

- X_types is undocumented at present, and is mysterious... should it not be a
  series of boolean tags instead of a list?
- That would require us to define a list of possible input types now and it
  would be harder to change in the future though, right?
- Why is a set of boolean tags harder than a list?
- I felt it might be more natural to add new things to a set/list than add
  another boolean variable to a set/list of boolean variables.
- Dunno. A list is fine... and could have benefits if the objects in the list
  are not merely strings

X_types
    Supported input types for X as a list of strings. Tests are currently only
    run if '2darray' is contained in the list, signifying that the estimator
    takes continuous 2d numpy arrays as input. The default value is
    ['2darray']. Other possible types are ``'string'``, ``'sparse'``,
    ``'categorical'``, ``dict``, ``'1dlabels'`` and ``'2dlabels'``. The goal
    is that in the future the supported input type will determine the data
    used during testing, in particular for ``'string'``, ``'sparse'`` and
    ``'categorical'`` data. For now, the tests for sparse data do not make use
    of the ``'sparse'`` tag.
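To make the override mechanics concrete, here is a small self-contained
sketch, not the scikit-learn implementation itself; ``ToyEstimator`` and the
trimmed-down ``_DEFAULT_TAGS`` are invented for illustration:

```python
# A reduced default-tags dictionary (the real one has more entries)
_DEFAULT_TAGS = {'allow_nan': False,
                 'multioutput': False,
                 'X_types': ['2darray']}

class ToyEstimator:
    # Hypothetical estimator that accepts NaN values in X
    def _more_tags(self):
        return {'allow_nan': True}

    def _get_tags(self):
        # Simplified: start from the defaults, apply this class's overrides
        tags = _DEFAULT_TAGS.copy()
        tags.update(self._more_tags())
        return tags

tags = ToyEstimator()._get_tags()
print(tags['allow_nan'])  # True
print(tags['X_types'])    # ['2darray']
```

A tool consuming the tags only ever does dictionary lookups like these, which
is what lets the common tests choose appropriate input data per estimator.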
In addition to the tags, estimators also need to declare any non-optional
parameters to ``__init__`` in the ``_required_parameters`` class attribute,
which is a list or tuple. If ``_required_parameters`` is only
``["estimator"]`` or ``["base_estimator"]``, then the estimator will be
instantiated with an instance of ``LinearDiscriminantAnalysis`` (or
``RidgeRegression`` if the estimator is a regressor) in the tests. The choice
of these two models is somewhat idiosyncratic but both should provide robust
closed-form solutions.

Review thread:

- Can we not determine this automatically by inspecting
- @jnothman asked the same, so maybe my intentions are indeed unclear.
- Would the following be an appropriate substitute, shooting two birds with
  one stone::

      class BaseEstimator:
          ...

          @classmethod
          def _get_instances_for_checking(cls):
              yield cls()

- This also has the potential to make most of
- +1 for possibly separate PR. I kinda don't want to mess with the default
  construction test too much...
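As an illustration of the ``_required_parameters`` declaration, a
meta-estimator whose first ``__init__`` argument has no default might look
like this (``ToyBagging`` is a made-up class, not part of scikit-learn):

```python
class ToyBagging:
    # Tells check_estimator-style tooling that ``estimator`` must be
    # supplied when instantiating this class in the common tests
    _required_parameters = ["estimator"]

    def __init__(self, estimator, n_copies=3):
        self.estimator = estimator
        self.n_copies = n_copies

bagger = ToyBagging(estimator="some_base_estimator")
print(bagger._required_parameters)  # ['estimator']
```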
.. _reading-code:

Reading the existing code base
@@ -6,12 +6,25 @@

import copy
import warnings
from collections import defaultdict
import struct
import inspect

import numpy as np

from . import __version__

_DEFAULT_TAGS = {
    'non_deterministic': False,
    'requires_positive_data': False,
    'X_types': ['2darray'],
    'poor_score': False,
    'no_validation': False,
    'multioutput': False,
    'allow_nan': False,
    'stateless': False,
    'multilabel': False,
    '_skip_test': False,
    'multioutput_only': False}

Review thread:

- binary only would be an important tag for external libraries (and came up in
  the context of the GP here).
- Make sure you're clear that it's binary targets, not features
- Binary only is also relevant for calibration methods.
def clone(estimator, safe=True):

@@ -61,7 +74,6 @@ def clone(estimator, safe=True):

    return new_object


###############################################################################
def _pprint(params, offset=0, printer=repr):
    """Pretty print the dictionary 'params'

@@ -112,7 +124,17 @@ def _pprint(params, offset=0, printer=repr):

    return lines
###############################################################################
def _update_if_consistent(dict1, dict2):
    common_keys = set(dict1.keys()).intersection(dict2.keys())
    for key in common_keys:
        if dict1[key] != dict2[key]:
            raise TypeError("Inconsistent values for tag {}: {} != {}".format(
                key, dict1[key], dict2[key]
            ))
    dict1.update(dict2)
    return dict1

Review thread:

- Right, but then this would error if the
- yes, which I solved by having the
- (I think)
class BaseEstimator:
    """Base class for all estimators in scikit-learn

@@ -135,7 +157,7 @@ def _get_param_names(cls):

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]

@@ -255,8 +277,22 @@ def __setstate__(self, state):

        except AttributeError:
            self.__dict__.update(state)
    def _get_tags(self):
        collected_tags = {}
        for base_class in inspect.getmro(self.__class__):
            if (hasattr(base_class, '_more_tags')
                    and base_class != self.__class__):
                more_tags = base_class._more_tags(self)
                collected_tags = _update_if_consistent(collected_tags,
                                                       more_tags)
        if hasattr(self, '_more_tags'):
            more_tags = self._more_tags()
            collected_tags = _update_if_consistent(collected_tags, more_tags)
        tags = _DEFAULT_TAGS.copy()
        tags.update(collected_tags)
        return tags

Review thread:

- Don't we need to reverse this list to give precedence to tags set earlier in
  the MRO? The precedence should be tested either way. (I think the official
  idiom might be
- hm after thinking about this again, this looks like we're running into the
  same MRO issue that I was having earlier. I don't think @rth's solution
  actually works.
- Yes, you are right, it is method resolution order, e.g.::

      <class '__main__.LinearRegression'>
      <class '__main__.BaseEstimator'>
      <class '__main__.ClassifierMixin'>

  So first, if a tag is defined in the first estimator, we don't want to
  overwrite it, i.e.::

      for key, val in base_class._more_tags(self).items():
          if key not in tags:
              tags[key] = val

  (or something similar), instead of
  ``tags.update(base_class._more_tags(self))``. Then you are right that we
  want the tags from within the mixin to apply before the base estimators.
  Maybe we want to sort::

      def _mro_class_compare(args):
          """Adjust the ordering for some estimator classes,
          while preserving the MRO ordering for the rest"""
          position_init, cls = args
          offset = 0
          if cls.__name__ == 'BaseEstimator':
              # put the BaseEstimator last
              offset = 2000
          elif cls.__name__.startswith('Base'):
              # put any "Base.*" classes just before
              offset = 1000
          return position_init + offset

      # [...]
      for _, base_class in sorted(enumerate(inspect.getmro(type(self))),
                                  key=_mro_class_compare):
          # setting tags here

  it's a bit hackish, but might work. Here the output would be,
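The per-class collection walk can be seen with a toy hierarchy; this is a
simplified re-implementation (using a plain ``dict.update`` instead of the
consistency check), and ``ToyRegressor`` is invented for the example:

```python
import inspect

def get_tags(obj):
    # Walk the MRO and merge each base class's ``_more_tags`` contribution;
    # the instance's own ``_more_tags`` is applied last
    collected = {}
    for base_class in inspect.getmro(type(obj)):
        if hasattr(base_class, '_more_tags') and base_class is not type(obj):
            collected.update(base_class._more_tags(obj))
    if hasattr(obj, '_more_tags'):
        collected.update(obj._more_tags())
    return collected

class MultiOutputMixin:
    def _more_tags(self):
        return {'multioutput': True}

class ToyRegressor(MultiOutputMixin):
    def _more_tags(self):
        return {'allow_nan': True}

print(get_tags(ToyRegressor()))
# {'multioutput': True, 'allow_nan': True}
```

Both the mixin's tag and the concrete class's tag end up in the result, which
is the behavior the review thread above is debating the precedence rules for.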
###############################################################################
class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn."""
    _estimator_type = "classifier"

@@ -289,7 +325,6 @@ def score(self, X, y, sample_weight=None):

        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)


###############################################################################
class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn."""
    _estimator_type = "regressor"
@@ -330,7 +365,6 @@ def score(self, X, y, sample_weight=None):

        multioutput='variance_weighted')


###############################################################################
class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn."""
    _estimator_type = "clusterer"

@@ -432,7 +466,6 @@ def get_submatrix(self, i, data):

        return data[row_ind[:, np.newaxis], col_ind]


###############################################################################
class TransformerMixin:
    """Mixin class for all transformers in scikit-learn."""
@@ -510,13 +543,27 @@ def fit_predict(self, X, y=None):

        return self.fit(X).predict(X)


###############################################################################
class MetaEstimatorMixin:
    _required_parameters = ["estimator"]
    """Mixin class for all meta estimators in scikit-learn."""
    # this is just a tag for the moment


###############################################################################
class MultiOutputMixin(object):
    """Mixin to mark estimators that support multioutput."""
    def _more_tags(self):
        return {'multioutput': True}

Review comment: maybe this should set 'multilabel' if
def _is_32bit():
    """Detect if process is 32bit Python."""
    return struct.calcsize('P') * 8 == 32


class _UnstableOn32BitMixin(object):
    """Mark estimators that are non-deterministic on 32bit."""
    def _more_tags(self):
        return {'non_deterministic': _is_32bit()}
def is_classifier(estimator):
    """Returns True if the given estimator is (probably) a classifier.