From 7d9dcc4f03a13eb5116f6c0299e2822a9b0011a9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 9 Apr 2019 14:10:47 -0400 Subject: [PATCH 01/53] Basic validate_X and validate_X_y methods for _n_features_in attribute --- sklearn/base.py | 26 ++++++++++++++++++++++++++ sklearn/preprocessing/data.py | 14 ++++++++------ sklearn/tests/test_base.py | 15 +++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 167baaf2b7ebd..ba380f2a70902 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -293,6 +293,32 @@ def _get_tags(self): tags.update(collected_tags) return tags + def _validate_n_features(self, X, check_n_features): + if check_n_features: + if not hasattr(self, '_n_features_in'): + raise RuntimeError( + "check_n_features is True but there is no _n_features_in " + "attribute." + ) + if X.shape[1] != self._n_features_in: + raise ValueError( + 'X has {} features, but this {} is expecting {} features ' + 'as input.'.format(X.shape[1], self.__class__.__name__, + self._n_features_in) + ) + self._n_features_in = X.shape[1] + + def validate_X(self, X, check_n_features=False, **check_array_params): + from .utils.validation import check_array + X = check_array(X, **check_array_params) + self._validate_n_features(X, check_n_features) + return X + + def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): + from .utils.validation import check_X_y + X, y = check_X_y(X, **check_X_y_params) + self._validate_n_features(X, check_n_features) + return X, y class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bab41f3bdd492..a6ad1ddd930ce 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,9 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, + warn_on_dtype=False, estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -753,9 +753,11 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=False, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self.validate_X(X, check_n_features=True, + accept_sparse='csr', copy=copy, + warn_on_dtype=False, estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 316b01ff33415..b808b2190f238 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -19,6 +19,7 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -509,3 +510,17 @@ def test_regressormixin_score_multioutput(): "built-in scorer 'r2' uses " "multioutput='uniform_average').") assert_warns_message(FutureWarning, msg, reg.score, X, y) + + +def test_validate_X(): + # Make sure ValueError is raised when there is a 
n_features mismatch + # between fit and predict/transform + + X = [[0, 1], [2, 3]] + + ss = StandardScaler().fit(X) + ss.transform(X) # All good + + with pytest.raises(ValueError, match="X has 3 features, but"): + X_more_features = [[0, 1, 4], [2, 3, 5]] + ss.transform(X_more_features) From f117745a01415fefd678cfc48cf2ce9694d036d2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 15:36:24 -0400 Subject: [PATCH 02/53] created NonRectangularInputMixin --- sklearn/base.py | 23 +++++++++++++------ sklearn/feature_extraction/dict_vectorizer.py | 5 ++-- sklearn/feature_extraction/text.py | 4 ++-- sklearn/tests/test_base.py | 22 ++++++++++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index ba380f2a70902..94ba53ffa7789 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -13,6 +13,8 @@ from . import __version__ from sklearn.utils import _IS_32BIT +from .utils.validation import check_X_y +from .utils.validation import check_array _DEFAULT_TAGS = { 'non_deterministic': False, @@ -295,27 +297,25 @@ def _get_tags(self): def _validate_n_features(self, X, check_n_features): if check_n_features: - if not hasattr(self, '_n_features_in'): + if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "check_n_features is True but there is no _n_features_in " + "check_n_features is True but there is no n_features_in_ " "attribute." ) - if X.shape[1] != self._n_features_in: + if X.shape[1] != self.n_features_in_: raise ValueError( 'X has {} features, but this {} is expecting {} features ' 'as input.'.format(X.shape[1], self.__class__.__name__, - self._n_features_in) + self.n_features_in_) ) - self._n_features_in = X.shape[1] + self.n_features_in_ = X.shape[1] def validate_X(self, X, check_n_features=False, **check_array_params): - from .utils.validation import check_array X = check_array(X, **check_array_params) self._validate_n_features(X, check_n_features) return X def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - from .utils.validation import check_X_y X, y = check_X_y(X, **check_X_y_params) self._validate_n_features(X, check_n_features) return X, y @@ -551,6 +551,15 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) +class NonRectangularInputMixin: + """Mixin class for all estimators with non-rectangular input. + + For now only vectorizers are relevant for this mixin. + """ + + n_features_in_ = None + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" _estimator_type = "DensityEstimator" diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 8273834acdb20..f21b3ede1f931 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -9,7 +9,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin from ..utils import check_array, tosequence @@ -21,7 +21,8 @@ def _tosequence(X): return tosequence(X) -class DictVectorizer(BaseEstimator, TransformerMixin): +class DictVectorizer(BaseEstimator, TransformerMixin, + NonRectangularInputMixin): """Transforms lists of feature-value mappings to vectors. 
This transformer turns lists of mappings (dict-like objects) of feature diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 9cdbace6224aa..1e0db090cf456 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -24,7 +24,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS @@ -113,7 +113,7 @@ def _check_stop_list(stop): return frozenset(stop) -class VectorizerMixin: +class VectorizerMixin(NonRectangularInputMixin): """Provides common code for text vectorizers (tokenization logic).""" _white_spaces = re.compile(r"\s\s+") diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index b808b2190f238..32b9aab3da42d 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -20,6 +20,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler +from sklearn.feature_extraction import DictVectorizer from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -524,3 +525,24 @@ def test_validate_X(): with pytest.raises(ValueError, match="X has 3 features, but"): X_more_features = [[0, 1, 4], [2, 3, 5]] ss.transform(X_more_features) + + +def test_n_features_in_attribute(): + # Make sure n_features_in_ is correctly set. + # Note that n_features_in_ is always None for vectorizers, while for other + # estimators the attribute doesn't exist until fit() is called. + X_2 = [[0, 1], [2, 3]] + X_3 = [[0, 1, 4], [2, 3, 5]] + + ss = StandardScaler() + assert not hasattr(ss, 'n_features_in_') + ss.fit(X_2) + assert ss.n_features_in_ == 2 + ss = ss.fit(X_3) + assert ss.n_features_in_ == 3 + + dv = DictVectorizer() + assert dv.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert dv.n_features_in_ is None From e56592b7f3bc3103eb7326e83605e8858f0d5b6f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 15:38:20 -0400 Subject: [PATCH 03/53] resolved conflicts --- sklearn/preprocessing/data.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index fdd04087e40cd..225f7c2794cf8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,15 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ -<<<<<<< HEAD X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') -======= - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') ->>>>>>> upstream/master + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -759,17 +753,10 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy -<<<<<<< HEAD X = self.validate_X(X, check_n_features=True, accept_sparse='csr', copy=copy, - warn_on_dtype=False, estimator=self, - dtype=FLOAT_DTYPES, + estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') 
-======= - X = check_array(X, accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') ->>>>>>> upstream/master if sparse.issparse(X): if self.with_mean: From 8ecc690d47da03e08ca2b4b1326940cecd4296c7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 15:26:10 -0400 Subject: [PATCH 04/53] _validate** is not private --- sklearn/base.py | 6 +++--- sklearn/preprocessing/data.py | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 9cc73d3c7998a..3aa28b210fdc4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -342,13 +342,13 @@ def _validate_n_features(self, X, check_n_features): ) self.n_features_in_ = X.shape[1] - def validate_X(self, X, check_n_features=False, **check_array_params): + def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) self._validate_n_features(X, check_n_features) return X - def validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - X, y = check_X_y(X, **check_X_y_params) + def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): + X, y = check_X_y(X, y, **check_X_y_params) self._validate_n_features(X, check_n_features) return X, y diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 45b7aed4ae605..1b924aa8ecf4d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -658,9 +658,9 @@ def partial_fit(self, X, y=None): y Ignored """ - X = self.validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_X(X, accept_sparse=('csr', 'csc'), copy=self.copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -753,10 +753,10 @@ def transform(self, X, copy=None): check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy - X = self.validate_X(X, check_n_features=True, - accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_X(X, check_n_features=True, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: From 60e4cea50ae74ccc8e91659eb51fcf2ff5c4dfe1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 17:02:05 -0400 Subject: [PATCH 05/53] Added support for pipeline and grid search --- .../gradient_boosting.py | 2 +- sklearn/model_selection/_search.py | 5 +++ sklearn/pipeline.py | 4 +++ sklearn/tests/test_base.py | 31 +++++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 466181f445ee8..6eb692d743af0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -95,7 +95,7 @@ def fit(self, X, y): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
- X, y = check_X_y(X, y, dtype=[X_DTYPE]) + X, y = self._validate_X_y(X, y, dtype=[X_DTYPE]) y = self._encode_y(y) rng = check_random_state(self.random_state) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0447f4857fa9d..cc1f9fc2a6f1c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -540,6 +540,11 @@ def inverse_transform(self, Xt): self._check_is_fitted('inverse_transform') return self.best_estimator_.inverse_transform(Xt) + @property + def n_features_in_(self): + check_is_fitted(self, 'best_estimator_') + return self.best_estimator_.n_features_in_ + @property def classes_(self): self._check_is_fitted("classes_") diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 55df0de701db4..9eddfd9e51d30 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -609,6 +609,10 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + @property + def n_features_in_(self): + return self.steps[0][1].n_features_in_ + def _name_estimators(estimators): """Generate names for estimators.""" diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 32b9aab3da42d..ae09dfd501bd2 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -21,6 +21,11 @@ from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import DictVectorizer +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.datasets import make_classification +from sklearn.pipeline import make_pipeline +from sklearn.exceptions import NotFittedError from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -546,3 +551,29 @@ def test_n_features_in_attribute(): d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] dv.fit(d) assert dv.n_features_in_ is None + + # meta estimator need specific ways of dealing with the attribute: + # grid search delegates this to # the best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {'max_iter': [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + assert hasattr(ss, 'n_features_in_') # that might be a bit unintuitive + with pytest.raises(NotFittedError): + gs.n_features_in_ + gs.fit(X, y) + assert gs.n_features_in_ == n_features + + # pipelines delegate to the first step + pipe = make_pipeline(gbdt) + assert not hasattr(pipe, 'n_features_in_') + pipe.fit(X, y) + assert pipe.n_features_in_ == n_features + + dv = DictVectorizer() + pipe = make_pipeline(dv) + assert pipe.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert pipe.n_features_in_ is None From ff19f2226d150b127f698c0c2f90e09caae8a8f3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 May 2019 08:53:30 -0400 Subject: [PATCH 06/53] pep8 --- sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index ae09dfd501bd2..036cf69a3d24d 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -21,7 +21,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import DictVectorizer -from sklearn.experimental import enable_hist_gradient_boosting +from 
sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.pipeline import make_pipeline From a44318b3b6b8b0abef81cfbd3c583763dab9c22d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 May 2019 11:35:05 -0400 Subject: [PATCH 07/53] Trigger CI?? From abdc94e3b8e92851ed63c0467cf7b6c4218071e5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 26 Jun 2019 13:23:52 -0400 Subject: [PATCH 08/53] Added to decision tree for gridsearch tests to pass --- sklearn/tree/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 77b0ad6f6592f..367359805fce6 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -125,7 +125,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, random_state = check_random_state(self.random_state) if check_input: - X = check_array(X, dtype=DTYPE, accept_sparse="csc") + X = self._validate_X(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() From 62fc42e2065edfa4e2ae9cc7c4feb754a6e665a8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Aug 2019 17:34:20 -0400 Subject: [PATCH 09/53] Added support for ColumnTransformer and FeatureUnion --- sklearn/compose/_column_transformer.py | 2 + .../compose/tests/test_column_transformer.py | 12 +++++ .../tests/test_dict_vectorizer.py | 10 +++++ sklearn/model_selection/tests/test_search.py | 15 +++++++ sklearn/pipeline.py | 5 +++ sklearn/tests/test_base.py | 45 +++++-------------- sklearn/tests/test_pipeline.py | 45 +++++++++++++++++++ 7 files changed, 99 insertions(+), 35 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c0f537776cb6a..25e473dbc51ac 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -471,6 +471,7 @@ def fit_transform(self, X, y=None): """ X = _check_X(X) + self._validate_n_features(X, check_n_features=False) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -518,6 +519,7 @@ def transform(self, X): """ check_is_fitted(self, 'transformers_') X = _check_X(X) + self._validate_n_features(X, check_n_features=True) if self._n_features > X.shape[1]: raise ValueError('Number of features of the input must be equal ' diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae7ef31d6c7f1..9d133fbf0a4d7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1108,3 +1108,15 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): err_msg = 'Specifying the columns' with pytest.raises(ValueError, match=err_msg): tf.transform(X_array) + + +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. 
+ + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([('a', DoubleTrans(), [0]), + ('b', DoubleTrans(), [1])]) + assert not hasattr(ct, 'n_features_in_') + ct.fit(X) + assert ct.n_features_in_ == 2 diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 7e7481a369646..32a14fe82be5b 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -110,3 +110,13 @@ def test_deterministic_vocabulary(): v_2 = DictVectorizer().fit([d_shuffled]) assert v_1.vocabulary_ == v_2.vocabulary_ + + +def test_n_features_in(): + # For vectorizers, n_features_in_ does not make sense and it is always + # None + dv = DictVectorizer() + assert dv.n_features_in_ is None + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert dv.n_features_in_ is None diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 90a837e7f49f1..42841dcb248a8 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -63,6 +63,8 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter @@ -1762,3 +1764,16 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_n_features_in(): + # make sure grid search delegates n_features_in to the best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {'max_iter': [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + with pytest.raises(NotFittedError): + gs.n_features_in_ + gs.fit(X, y) + assert gs.n_features_in_ == n_features diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 344a0d11210ad..7649c9225a390 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -985,6 +985,11 @@ def _update_transformer_list(self, transformers): else next(transformers)) for name, old in self.transformer_list] + @property + def n_features_in_(self): + # X is passed to all transformers so we just delegate to the first one + return self.transformer_list[0][1].n_features_in_ + def make_union(*transformers, **kwargs): """Construct a FeatureUnion from the given transformers. diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 6d2bf83b8b6e1..198570c588cc5 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -514,7 +514,7 @@ def test_regressormixin_score_multioutput(): assert_warns_message(FutureWarning, msg, reg.score, X, y) -def test_validate_X(): +def test_validate_X_n_feature_mismatch(): # Make sure ValueError is raised when there is a n_features mismatch # between fit and predict/transform @@ -528,10 +528,17 @@ def test_validate_X(): ss.transform(X_more_features) +def test_validate_X_bad_kwargs(): + + est = BaseEstimator() + with pytest.raises(TypeError, + match="check_array\(\) got an unexpected keyword"): + est._validate_X([1], bad_param=4) + + def test_n_features_in_attribute(): # Make sure n_features_in_ is correctly set. 
- # Note that n_features_in_ is always None for vectorizers, while for other - # estimators the attribute doesn't exist until fit() is called. + # TODO: eventually move this in estimator_checks X_2 = [[0, 1], [2, 3]] X_3 = [[0, 1, 4], [2, 3, 5]] @@ -542,38 +549,6 @@ def test_n_features_in_attribute(): ss = ss.fit(X_3) assert ss.n_features_in_ == 3 - dv = DictVectorizer() - assert dv.n_features_in_ is None - d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] - dv.fit(d) - assert dv.n_features_in_ is None - - # meta estimator need specific ways of dealing with the attribute: - # grid search delegates this to # the best estimator - n_features = 4 - X, y = make_classification(n_features=n_features) - gbdt = HistGradientBoostingClassifier() - param_grid = {'max_iter': [3, 4]} - gs = GridSearchCV(gbdt, param_grid) - assert hasattr(ss, 'n_features_in_') # that might be a bit unintuitive - with pytest.raises(NotFittedError): - gs.n_features_in_ - gs.fit(X, y) - assert gs.n_features_in_ == n_features - - # pipelines delegate to the first step - pipe = make_pipeline(gbdt) - assert not hasattr(pipe, 'n_features_in_') - pipe.fit(X, y) - assert pipe.n_features_in_ == n_features - - dv = DictVectorizer() - pipe = make_pipeline(dv) - assert pipe.n_features_in_ is None - d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] - dv.fit(d) - assert pipe.n_features_in_ is None - def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index e02b5ef96b7b0..bfdac25f50e32 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -34,6 +34,8 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier JUNK_FOOD_DOCS = ( @@ -1161,3 +1163,46 @@ def test_verbose(est, method, pattern, capsys): est.set_params(verbose=True) func(X, y) assert re.match(pattern, capsys.readouterr().out) + + +def test_n_features_in_pipeline(): + # make sure pipelines delegate n_features_in to the first step + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + assert not hasattr(pipe, 'n_features_in_') + pipe.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the pipeline also + # has it, even though it isn't fitted. + ss = StandardScaler() + gbdt = HistGradientBoostingClassifier() + pipe = make_pipeline(ss, gbdt) + ss.fit(X, y) + assert pipe.n_features_in_ == ss.n_features_in_ == 2 + assert not hasattr(gbdt, 'n_features_in_') + + +def test_n_features_in_feature_union(): + # make sure FeatureUnion delegates n_features_in to the first transformer + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + ss = StandardScaler() + fu = make_union(ss) + assert not hasattr(fu, 'n_features_in_') + fu.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 + + # if the first step has the n_features_in attribute then the feature_union + # also has it, even though it isn't fitted. 
+ ss = StandardScaler() + fu = make_union(ss) + ss.fit(X, y) + assert fu.n_features_in_ == ss.n_features_in_ == 2 From 6845788aa8c665ea444517d8856177cb404b6c9d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Aug 2019 17:39:35 -0400 Subject: [PATCH 10/53] pep8 --- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/tests/test_base.py | 8 +------- sklearn/tests/test_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 42841dcb248a8..17a7493c8675a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -63,7 +63,7 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 198570c588cc5..4096680362d2b 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -17,12 +17,6 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler -from sklearn.feature_extraction import DictVectorizer -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.pipeline import make_pipeline -from sklearn.exceptions import NotFittedError from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -532,7 +526,7 @@ def test_validate_X_bad_kwargs(): est = BaseEstimator() with pytest.raises(TypeError, - match="check_array\(\) got an unexpected keyword"): + match="got an unexpected keyword"): est._validate_X([1], bad_param=4) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index bfdac25f50e32..4fffcc0a4dc70 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -34,7 +34,7 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier From ee2598bf143da1696f9cc380480a9d496f6db4b2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 12 Aug 2019 09:01:30 -0400 Subject: [PATCH 11/53] BaseSearchCV now raises AttributeError --- sklearn/model_selection/_search.py | 11 ++++++++++- sklearn/model_selection/tests/test_search.py | 3 +-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 590d6e5d3558a..1f12bf4096708 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -565,7 +565,16 @@ def inverse_transform(self, Xt): @property def n_features_in_(self): - check_is_fitted(self, 'best_estimator_') + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. 
+ try: + check_is_fitted(self, 'best_estimator_') + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__ ) + ) from nfe + return self.best_estimator_.n_features_in_ @property diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 7a74438f2a5bc..d6a31ca96bbe4 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1786,7 +1786,6 @@ def test_n_features_in(): gbdt = HistGradientBoostingClassifier() param_grid = {'max_iter': [3, 4]} gs = GridSearchCV(gbdt, param_grid) - with pytest.raises(NotFittedError): - gs.n_features_in_ + assert not hasattr(gs, 'n_features_in_') gs.fit(X, y) assert gs.n_features_in_ == n_features From 25fda0ff2b2633b4455c111d5791099731f5bcb7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 2 Sep 2019 18:12:22 -0400 Subject: [PATCH 12/53] Added common test + used _validate_XXX on most estimators --- sklearn/base.py | 3 +- sklearn/calibration.py | 4 +- sklearn/cluster/affinity_propagation_.py | 2 +- sklearn/cluster/birch.py | 2 +- sklearn/cluster/dbscan_.py | 2 +- sklearn/cluster/hierarchical.py | 2 +- sklearn/cluster/mean_shift_.py | 2 +- sklearn/cluster/optics_.py | 2 +- sklearn/cluster/spectral.py | 4 +- sklearn/compose/_target.py | 15 ++++++ sklearn/covariance/empirical_covariance_.py | 2 +- sklearn/covariance/graph_lasso_.py | 6 +-- sklearn/covariance/robust_covariance.py | 2 +- sklearn/covariance/shrunk_covariance_.py | 6 +-- sklearn/cross_decomposition/pls_.py | 8 +-- sklearn/decomposition/dict_learning.py | 4 +- sklearn/decomposition/factor_analysis.py | 2 +- sklearn/decomposition/incremental_pca.py | 4 +- sklearn/decomposition/kernel_pca.py | 2 +- sklearn/decomposition/nmf.py | 2 +- sklearn/decomposition/pca.py | 4 +- sklearn/decomposition/sparse_pca.py | 4 +- sklearn/decomposition/truncated_svd.py | 4 +- sklearn/discriminant_analysis.py | 6 +-- sklearn/ensemble/bagging.py | 6 +-- sklearn/ensemble/forest.py | 2 +- sklearn/ensemble/gradient_boosting.py | 3 +- sklearn/feature_selection/from_model.py | 15 ++++++ sklearn/feature_selection/rfe.py | 6 ++- .../feature_selection/univariate_selection.py | 3 +- .../feature_selection/variance_threshold.py | 2 +- sklearn/gaussian_process/gpc.py | 2 +- sklearn/gaussian_process/gpr.py | 2 +- sklearn/impute/_iterative.py | 4 +- sklearn/kernel_approximation.py | 8 +-- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/base.py | 4 +- sklearn/linear_model/bayes.py | 6 +-- sklearn/linear_model/coordinate_descent.py | 19 ++++--- sklearn/linear_model/huber.py | 2 +- sklearn/linear_model/least_angle.py | 6 +-- sklearn/linear_model/logistic.py | 11 ++-- sklearn/linear_model/omp.py | 6 +-- sklearn/linear_model/ransac.py | 2 +- sklearn/linear_model/ridge.py | 21 ++++---- sklearn/linear_model/stochastic_gradient.py | 10 ++-- sklearn/linear_model/theil_sen.py | 2 +- sklearn/manifold/isomap.py | 2 +- sklearn/manifold/locally_linear.py | 2 +- sklearn/manifold/mds.py | 2 +- sklearn/manifold/spectral_embedding_.py | 2 +- sklearn/manifold/t_sne.py | 8 +-- sklearn/mixture/base.py | 1 + sklearn/model_selection/_search.py | 2 +- sklearn/multiclass.py | 18 ++++++- sklearn/multioutput.py | 6 +-- sklearn/naive_bayes.py | 4 +- sklearn/neighbors/base.py | 2 +- sklearn/neighbors/kde.py | 2 +- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/nearest_centroid.py | 4 +- .../neural_network/multilayer_perceptron.py | 8 +-- sklearn/neural_network/rbm.py | 2 +- 
sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/data.py | 10 ++-- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/label_propagation.py | 2 +- sklearn/svm/base.py | 6 +-- sklearn/svm/classes.py | 12 ++--- sklearn/utils/estimator_checks.py | 54 +++++++++++++++++++ 70 files changed, 252 insertions(+), 143 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index e39e3f2ec94df..dbe96a6365450 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -350,7 +350,8 @@ def _validate_n_features(self, X, check_n_features): 'as input.'.format(X.shape[1], self.__class__.__name__, self.n_features_in_) ) - self.n_features_in_ = X.shape[1] + else: + self.n_features_in_ = X.shape[1] def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b88a8b8eb37ef..73d56d40af9fa 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -130,8 +130,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. """ - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 89c6ce9fe8b34..0ae5496d0ff3f 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -372,7 +372,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = check_array(X, accept_sparse=accept_sparse) + X = self._validate_X(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 11bb0f17a1dc6..87bb6e1695be8 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -445,7 +445,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = check_array(X, accept_sparse='csr', copy=self.copy) + X = self._validate_X(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 9f4a55d3ad5b3..3927a532d17bd 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -352,7 +352,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') clust = dbscan(X, sample_weight=sample_weight, **self.get_params()) self.core_sample_indices_, self.labels_ = clust diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index edf4dae76cd49..3c183c24f9a95 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -790,7 +790,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index e588ccd6df1c8..0fc6e8cdc9292 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -414,7 +414,7 @@ def fit(self, X, y=None): y : Ignored """ - X 
= check_array(X) + X = self._validate_X(X) self.cluster_centers_, self.labels_ = \ mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds, min_bin_freq=self.min_bin_freq, diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index ecf5fa6a2bcc0..a4b97323a86c5 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -233,7 +233,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, dtype=np.float) + X = self._validate_X(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 0398ec0df006f..262275e6ab1ba 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -466,8 +466,8 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) if X.shape[0] == X.shape[1] and self.affinity != "precomputed": warnings.warn("The spectral clustering API has changed. ``fit``" "now constructs an affinity matrix from data. To use" diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 35b7ed6af962a..4f0cc12a272f1 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, safe_indexing from ..preprocessing import FunctionTransformer +from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -236,3 +237,17 @@ def predict(self, X): def _more_tags(self): return {'poor_score': True, 'no_validation': True} + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.regressor_.n_features_in_ diff --git a/sklearn/covariance/empirical_covariance_.py b/sklearn/covariance/empirical_covariance_.py index 924f7edd7ffee..aa78d788142e2 100644 --- a/sklearn/covariance/empirical_covariance_.py +++ b/sklearn/covariance/empirical_covariance_.py @@ -191,7 +191,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index e78950bd60421..874b5d5576c50 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -378,8 +378,8 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_X(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -645,7 +645,7 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, estimator=self) + X = self._validate_X(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py index 173794e5340c2..6057c4c6058d0 100644 --- a/sklearn/covariance/robust_covariance.py +++ b/sklearn/covariance/robust_covariance.py @@ -636,7 +636,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_X(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/shrunk_covariance_.py b/sklearn/covariance/shrunk_covariance_.py index 6a0c80d2e4ff6..26b8ce237cbb5 100644 --- a/sklearn/covariance/shrunk_covariance_.py +++ b/sklearn/covariance/shrunk_covariance_.py @@ -143,7 +143,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -419,7 +419,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = check_array(X) + X = self._validate_X(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -572,7 +572,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X) + X = self._validate_X(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index af45d4fa53a09..c2714db3af1ba 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -252,8 +252,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_X(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if 
Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -828,8 +828,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_X(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 1a5a42d526917..6ed224c344b3d 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1217,7 +1217,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1423,7 +1423,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index ba624140ce1fc..c42c0ef617d60 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -167,7 +167,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, copy=self.copy, dtype=np.float64) + X = self._validate_X(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index c6d611dcd5fea..815a912f92f5d 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -192,8 +192,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = check_array(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 59785fed3ac0e..ea907210fd5d3 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -271,7 +271,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = check_array(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_X(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 0cc8713679136..3b28f6c638961 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1266,7 +1266,7 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data. 
""" - X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=float) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 1bf3d6e6b19e6..001bf5b0c3953 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -385,8 +385,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. See ' 'TruncatedSVD for a possible alternative.') - X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_X(X, dtype=[np.float64, np.float32], ensure_2d=True, + copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 3ca14cb528bb8..3e9afd4df598d 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -166,7 +166,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -364,7 +364,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_X(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index ce79fba2fad1d..3211fc39f0eec 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -155,8 +155,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_X(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index efe39b8c3fb9a..bf33d1803493e 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -424,8 +424,8 @@ def fit(self, X, y): Target values. 
""" # FIXME: Future warning to be removed in 0.23 - X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_X_y(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -656,7 +656,7 @@ def fit(self, X, y): y : array, shape = [n_samples] Target values (integers) """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py index 37dc5a97b4e67..c2fe356452a06 100644 --- a/sklearn/ensemble/bagging.py +++ b/sklearn/ensemble/bagging.py @@ -277,9 +277,9 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = check_X_y( - X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, - multi_output=True + X, y = self._validate_X_y( + X, y, accept_sparse=['csr', 'csc'], dtype=None, + force_all_finite=False, multi_output=True ) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index df24411c4a974..25c75064d279e 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -240,7 +240,7 @@ def fit(self, X, y, sample_weight=None): self : object """ # Validate or convert input data - X = check_array(X, accept_sparse="csc", dtype=DTYPE) + X = self._validate_X(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index ec5f9a111ccf1..cdb57fc70dfd3 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1436,7 +1436,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index fb26f9d685688..e258a56e8a771 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -6,6 +6,7 @@ from .base import SelectorMixin from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..utils.validation import check_is_fitted from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method @@ -227,3 +228,17 @@ def partial_fit(self, X, y=None, **fit_params): self.estimator_ = clone(self.estimator) self.estimator_.partial_fit(X, y, **fit_params) return self + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.estimator_.n_features_in_ diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 4e957e8463a7c..9562148227c05 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -150,7 +150,8 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X, y = check_X_y(X, y, "csc", ensure_min_features=2) + X, y = self._validate_X_y(X, y, accept_sparse="csc", + ensure_min_features=2) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -479,7 +480,8 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" `cv` instance (e.g., `GroupKFold`). """ - X, y = check_X_y(X, y, "csr", ensure_min_features=2) + X, y = self._validate_X_y(X, y, accept_sparse="csr", + ensure_min_features=2) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 5b1cae1823e9c..970bda28b7e46 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -338,7 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index c9eb973dc86c3..77d2bf8ee14b0 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -61,7 +61,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ('csr', 'csc'), dtype=np.float64) + X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=np.float64) if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py index 5421f7e408472..1adf6af0148b3 100644 --- a/sklearn/gaussian_process/gpc.py +++ b/sklearn/gaussian_process/gpc.py @@ -612,7 +612,7 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = check_X_y(X, y, multi_output=False) + X, y = self._validate_X_y(X, y, multi_output=False) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index cc9806cd1c41e..673a558b2d566 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -182,7 +182,7 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 05e2f1484fccf..bb74580517e59 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -472,8 +472,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_X(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 7a2b404304daf..bb04fa85998ca 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -91,7 +91,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -197,7 +197,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X) + X = self._validate_X(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -324,7 +324,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') if self.sample_interval is None: # See reference, figure 2 c) if self.sample_steps == 1: @@ -540,7 +540,7 @@ def fit(self, X, y=None): X : array-like, shape=(n_samples, n_feature) Training data. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 3d69066e342d6..ba1b9867956fa 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -148,8 +148,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. 
""" # Convert data - X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, - y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index d2af98d07ac09..8def021d68974 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -464,8 +464,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py index 7b19ed3ce607f..72d956b555945 100644 --- a/sklearn/linear_model/bayes.py +++ b/sklearn/linear_model/bayes.py @@ -189,7 +189,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True) X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) @@ -516,8 +516,8 @@ def fit(self, X, y): ------- self : returns an instance of self. """ - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 35bfcb692ca2f..24366a5ee0463 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -696,9 +696,11 @@ def fit(self, X, y, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = check_X_y(X, y, accept_sparse='csc', - order='F', dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1120,7 +1122,7 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = check_array(X, 'csc', copy=False) + X = self._validate_X(X, accept_sparse='csc', copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1131,8 +1133,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) + X = self._validate_X(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1752,8 +1755,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that 
format. """ - X = check_array(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_X(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index e518feae29b78..7ac5b3d0e19c4 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y( + X, y = self._validate_X_y( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 2df43cca9365f..54abd87b27b7d 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -954,7 +954,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_X_y(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1374,7 +1374,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1752,7 +1752,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 5ba3eb99fa25e..50a2a6c91a535 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1511,8 +1511,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) n_samples, n_features = X.shape @@ -1981,9 +1982,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index df6e44f5708e0..ca45c72879380 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = check_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_X_y(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index e868a31d17c8d..a3dc3fb24a983 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -251,7 +251,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index b1c24a5860227..425293c9ee8ca 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -541,10 +541,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -921,7 +921,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True) + self._validate_X_y(X, y, accept_sparse=_accept_sparse, + multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -1418,9 +1419,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], - dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if np.any(self.alphas <= 0): raise ValueError( @@ -1829,8 +1830,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index c56792de96172..809aae841ba5b 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -509,8 +509,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1079,8 +1080,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = 
check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py index 941c51196cc4a..ab4e840376e75 100644 --- a/sklearn/linear_model/theil_sen.py +++ b/sklearn/linear_model/theil_sen.py @@ -358,7 +358,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_X_y(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 88c979c0e1fdb..d6f0f6a1bf2a9 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -114,7 +114,7 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.n_jobs = n_jobs def _fit_transform(self, X): - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 4e90d4876f4df..0186fcac53a39 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -656,7 +656,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = check_array(X, dtype=float) + X = self._validate_X(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/mds.py b/sklearn/manifold/mds.py index 5238c67e93dfd..0ddf8dda7f31c 100644 --- a/sklearn/manifold/mds.py +++ b/sklearn/manifold/mds.py @@ -414,7 +414,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = check_array(X) + X = self._validate_X(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 9142237fd5042..e6a646d13ffd0 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -522,7 +522,7 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, ensure_min_samples=2, estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 987f3af05a941..e460438d9641a 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -682,11 +682,11 @@ def _fit(self, X, skip_num_points=0): 'memory. Otherwise consider dimensionality ' 'reduction techniques (e.g. 
TruncatedSVD)') if self.method == 'barnes_hut': - X = check_array(X, ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_X(X, ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self.validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index 26410fc5256af..ad79972f21263 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -216,6 +216,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) + self._validate_n_features(X, check_n_features=False) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index a10f10d077b4f..f65c8292031ce 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -572,7 +572,7 @@ def n_features_in_(self): except NotFittedError as nfe: raise AttributeError( "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__ ) + .format(self.__class__.__name__) ) from nfe return self.best_estimator_.n_features_in_ diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 217f6ce87cba6..fdb768e25502f 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -52,6 +52,7 @@ check_classification_targets, _ovr_decision_function) from .utils.metaestimators import _safe_split, if_delegate_has_method +from .exceptions import NotFittedError from joblib import Parallel, delayed @@ -409,6 +410,19 @@ def _pairwise(self): def _first_estimator(self): return self.estimators_[0] + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the OVR estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + return self.estimators_[0].n_features_in_ + def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -497,7 +511,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -724,7 +738,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 3b5a95349868e..b54b951b451ea 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -148,9 +148,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The base estimator should implement" " a fit method") - X, y = check_X_y(X, y, - multi_output=True, - accept_sparse=True) + X, y = self._validate_X_y(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -431,7 +429,7 @@ def fit(self, X, Y): ------- self : object """ - X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_X_y(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 904a5afecc67e..509fe78d150ea 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -192,7 +192,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -591,7 +591,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, 'csr') + X, y = self._validate_X_y(X, y, accept_sparse='csr') _, n_features = X.shape labelbin = LabelBinarizer() diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 4f7ef38a4ae14..ab6aa296aec74 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -206,7 +206,7 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/kde.py b/sklearn/neighbors/kde.py index be5002e579423..baac14518954a 100644 --- a/sklearn/neighbors/kde.py +++ b/sklearn/neighbors/kde.py @@ -125,7 +125,7 @@ def fit(self, X, y=None, sample_weight=None): List of sample weights attached to the data X. """ algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = check_array(X, order='C', dtype=DTYPE) + X = self._validate_X(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = check_array(sample_weight, order='C', dtype=DTYPE, diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 68a72c92da865..67c94185030a7 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -297,7 +297,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. 
- X, y = check_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_X_y(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py index 3e1577469c920..a765141cd5c6f 100644 --- a/sklearn/neighbors/nearest_centroid.py +++ b/sklearn/neighbors/nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = check_X_y(X, y, ['csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csc']) else: - X, y = check_X_y(X, y, ['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index 11e682a448240..5293937c8c0a1 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -928,8 +928,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1336,8 +1336,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=True) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index 3018e31f7d04d..43e7eba48db61 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -336,7 +336,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 1be7499f783ec..4bfa0631be77a 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -133,7 +133,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, dtype='numeric') + X = self._validate_X(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ed352da361007..bf7dcfd2e17a7 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1159,8 +1159,8 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = check_array(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_X(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1468,7 +1468,7 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = check_array(X, accept_sparse=True).shape + n_samples, n_features = self._validate_X(X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1774,7 +1774,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1908,7 +1908,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_X(X, accept_sparse='csr') return self def transform(self, X, copy=None): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 4f8c8af1283b2..7e087bcbd2b84 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -341,7 +341,7 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = self._validate_X(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 704a075d95932..c4cc523336433 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -220,7 +220,7 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index b2723cc7e0c2b..43afcbc1602f6 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -142,9 +142,9 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = check_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) sample_weight = np.asarray([] diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 39c7d2f334de2..7967bf34207e1 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -229,9 +229,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -418,9 +418,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_X_y(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) penalty = 'l2' # SVR only accepts l2 penalty self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( X, y, self.C, self.fit_intercept, self.intercept_scaling, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 458e0818def4a..9d00cb826f58d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -271,6 +271,7 @@ def _yield_all_checks(name, estimator): yield check_dict_unchanged yield check_dont_overwrite_parameters yield check_fit_idempotent + yield check_n_features_in if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2648,3 +2649,56 @@ def check_fit_idempotent(name, estimator_orig): atol=max(tol, 1e-9), rtol=max(tol, 1e-7), err_msg="Idempotency check failed for method {}".format(method) ) + + +def check_n_features_in(name, estimator_orig): + # Make sure that n_features_in_ attribute doesn't exist until fit is + # called. + + if 'Dummy' in name: + # Dummy estimators don't validate X at all + return + if any(x in name for x in ('FastICA', 'KMeans')): + # fit calls public function helper and validates there. No way to + # access `self` from the helper. + return + if 'FunctionTransformer' in name: + # Validation is optional and False by default + return + if 'KernelCenterer' in name: + # Takes kernel K with shape (n_samples, n_samples) as input, not X + return + if any(x in name for x in ('LatentDirichlet', 'MissingIndicator', + 'PowerTransformer', 'QuantileTransformer', + 'SimpleImputer', 'AdaBoost')): + # fit calls private validation method, which is also called for + # predict, transform, etc + return + if any(x in name for x in ('MaxAbsScaler', 'MinMaxScaler')): + # Fit directly calls partial_fit. Don't know what to do with + # partial_fit. + return + if name in 'RidgeCV': + # Uses aggregation from an estimator that is not an attribute. There is + # no way to delegate to this estimator. 
+ return + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + if 'warm_start' in estimator.get_params().keys(): + estimator.set_params(warm_start=False) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = pairwise_estimator_convert_X(X, estimator) + if is_regressor(estimator_orig): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = enforce_estimator_tags_y(estimator, y) + + assert not hasattr(estimator, 'n_features_in_') + estimator.fit(X, y) + assert hasattr(estimator, 'n_features_in_') From 9bdfb65c0506838fcb8f356e71cf82a3d8c2ad48 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 2 Sep 2019 18:46:35 -0400 Subject: [PATCH 13/53] Fixed some test --- sklearn/ensemble/voting.py | 15 +++++++++++++++ sklearn/manifold/t_sne.py | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 69381a39d9ce3..48d4fe413c5d2 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -29,6 +29,7 @@ from ..utils.metaestimators import _BaseComposition from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..exceptions import NotFittedError def _parallel_fit_estimator(estimator, X, y, sample_weight=None): @@ -144,6 +145,20 @@ def get_params(self, deep=True): """ return self._get_params('estimators', deep=deep) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimators_[0].n_features_in_ + class VotingClassifier(_BaseVoting, ClassifierMixin): """Soft Voting/Majority Rule classifier for unfitted estimators. 
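# A minimal sketch, not part of the patch itself, of the delegation pattern the
# hunks above add to meta-estimators (search, OvR/OvO, voting): n_features_in_
# is exposed as a property that turns NotFittedError into AttributeError so
# hasattr(est, 'n_features_in_') stays False until fit has been called.
# ToyMeta and its 'base' parameter are hypothetical names used only here.
from sklearn.base import BaseEstimator, clone
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted


class ToyMeta(BaseEstimator):
    def __init__(self, base):
        self.base = base

    def fit(self, X, y):
        # The wrapped estimator does the real fitting (and input validation).
        self.base_ = clone(self.base).fit(X, y)
        return self

    @property
    def n_features_in_(self):
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute."
                .format(self.__class__.__name__)
            ) from nfe
        return self.base_.n_features_in_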
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index e460438d9641a..70732b8d6ac16 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -685,8 +685,8 @@ def _fit(self, X, skip_num_points=0): X = self._validate_X(X, ensure_min_samples=2, dtype=[np.float32, np.float64]) else: - X = self.validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " From be76ef49c4cd8fc72b05111a8b9d4f769b6f0733 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 4 Sep 2019 18:04:48 -0400 Subject: [PATCH 14/53] fixed issues for some estimators --- sklearn/cluster/hierarchical.py | 11 +++- sklearn/decomposition/online_lda.py | 26 +++++--- sklearn/ensemble/weight_boosting.py | 29 +++------ sklearn/impute/_base.py | 25 ++++---- sklearn/linear_model/ridge.py | 1 + .../preprocessing/_function_transformer.py | 4 +- sklearn/preprocessing/data.py | 59 +++++++++++-------- sklearn/utils/estimator_checks.py | 28 +-------- 8 files changed, 88 insertions(+), 95 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 3c183c24f9a95..0da4a6b2fc0fa 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -1034,9 +1034,14 @@ def fit(self, X, y=None, **params): ------- self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) - return AgglomerativeClustering.fit(self, X.T, **params) + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) + n_features_in_ = self.n_features_in_ + AgglomerativeClustering.fit(self, X.T, **params) + # Need to restore n_features_in_ attribute that was overridden in + # AgglomerativeClustering since we passed it X.T. + self.n_features_in_ = n_features_in_ + return self @property def fit_predict(self): diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 694893b6b2dc4..36d4d8a4f785b 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -469,7 +469,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, whom): + def _check_non_neg_array(self, X, check_n_features, whom): """check X format check X format and make sure no negative value in X. 
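# A rough sketch of what the check_n_features switch threaded through the
# validation helpers above amounts to; ToyEstimator and the exact error text
# are hypothetical, only the flag's semantics are taken from the hunks:
# fit-time validation records the input width, transform-time validation
# compares against it.
import numpy as np


class ToyEstimator:
    def _validate_n_features(self, X, check_n_features):
        if check_n_features:
            if X.shape[1] != self.n_features_in_:
                raise ValueError(
                    'X has a different number of features than during fit: '
                    'got {}, expected {}.'.format(X.shape[1],
                                                  self.n_features_in_))
        else:
            self.n_features_in_ = X.shape[1]

    def fit(self, X, y=None):
        X = np.asarray(X)
        self._validate_n_features(X, check_n_features=False)
        return self

    def transform(self, X):
        X = np.asarray(X)
        self._validate_n_features(X, check_n_features=True)
        return X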
@@ -479,7 +479,8 @@ def _check_non_neg_array(self, X, whom): X : array-like or sparse matrix """ - X = check_array(X, accept_sparse='csr') + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -498,13 +499,15 @@ def partial_fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, + first_time = not hasattr(self, 'components_') + check_n_features = not first_time + X = self._check_non_neg_array(X, check_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size # initialize parameters or check - if not hasattr(self, 'components_'): + if first_time: self._init_latent_vars(n_features) if n_features != self.components_.shape[1]: @@ -542,7 +545,8 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit") + X = self._check_non_neg_array(X, check_n_features=False, + whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter evaluate_every = self.evaluate_every @@ -611,7 +615,9 @@ def _unnormalized_transform(self, X): check_is_fitted(self) # make sure feature size is the same in fitted model and in X - X = self._check_non_neg_array(X, "LatentDirichletAllocation.transform") + X = self._check_non_neg_array( + X, check_n_features=True, + whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: raise ValueError( @@ -735,7 +741,8 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, "LatentDirichletAllocation.score") + X = self._check_non_neg_array(X, check_n_features=True, + whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) @@ -764,8 +771,9 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, """ check_is_fitted(self) - X = self._check_non_neg_array(X, - "LatentDirichletAllocation.perplexity") + X = self._check_non_neg_array( + X, check_n_features=True, + whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index b0a634ce1be6f..f437f5a8dafd1 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -70,25 +70,9 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X, y=None): - - # Accept or convert to these sparse matrix formats so we can - # use safe_indexing - accept_sparse = ['csr', 'csc'] - if y is None: - ret = check_array(X, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None) - else: - ret = check_X_y(X, y, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) - return ret + def _validate_data(self, X): + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=False, + allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
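# A condensed sketch of the partial_fit convention used above (toy class,
# hypothetical names): the presence of an already-fitted attribute decides
# whether the call defines n_features_in_ or has to match it.
import numpy as np


class ToyIncremental:
    def partial_fit(self, X):
        X = np.asarray(X)
        first_call = not hasattr(self, 'components_')
        if first_call:
            # First call: remember the input width and initialise state.
            self.n_features_in_ = X.shape[1]
            self.components_ = np.zeros(X.shape[1])
        elif X.shape[1] != self.n_features_in_:
            raise ValueError(
                'partial_fit was first called with {} features, got {}.'
                .format(self.n_features_in_, X.shape[1]))
        # ...incremental statistics would be updated here...
        return self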
@@ -115,7 +99,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_data(X, y) + X, y = self._validate_X_y(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=False, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) if sample_weight is None: # Initialize weights to 1 / n_samples diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index f7be7fd49cb64..9b7b239d6b7d3 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -177,7 +177,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.copy = copy self.add_indicator = add_indicator - def _validate_input(self, X): + def _validate_input(self, X, in_fit): allowed_strategies = ["mean", "median", "most_frequent", "constant"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " @@ -195,8 +195,11 @@ def _validate_input(self, X): force_all_finite = "allow-nan" try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): raise ValueError("Cannot use {0} strategy with non-numeric " @@ -229,7 +232,7 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" # otherwise @@ -374,7 +377,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) statistics = self.statistics_ @@ -560,13 +563,15 @@ def _get_missing_features_info(self, X): return imputer_mask, features_indices - def _validate_input(self, X): + def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " @@ -601,7 +606,7 @@ def _fit(self, X, y=None): The imputer mask of the original data. 
""" - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -653,7 +658,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 425293c9ee8ca..eab0370050abd 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -1574,6 +1574,7 @@ def fit(self, X, y, sample_weight=None): self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ return self diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 589a45a1e63d1..6680882bc2694 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -83,7 +83,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return check_array(X, accept_sparse=self.accept_sparse) + return self._validate_X(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): @@ -156,5 +156,5 @@ def _transform(self, X, func=None, kw_args=None): return func(X, **(kw_args if kw_args else {})) def _more_tags(self): - return {'no_validation': True, + return {'no_validation': not self.validate, 'stateless': True} diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bf7dcfd2e17a7..afed782a29dee 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -354,17 +354,17 @@ def partial_fit(self, X, y=None): raise TypeError("MinMaxScaler does no support sparse input. " "You may consider to use MaxAbsScaler instead.") - X = check_array(X, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + first_pass = not hasattr(self, 'n_samples_seen_') + check_n_features = not first_pass + X = self._validate_X(X, check_n_features=check_n_features, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next steps else: data_min = np.minimum(self.data_min_, data_min) data_max = np.maximum(self.data_max_, data_max) @@ -928,9 +928,11 @@ def partial_fit(self, X, y=None): y Ignored """ - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + first_pass = not hasattr(self, 'n_samples_seen_') + check_n_features = not first_pass + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -938,10 +940,8 @@ def partial_fit(self, X, y=None): else: max_abs = np.nanmax(np.abs(X), axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next passes else: max_abs = np.maximum(self.max_abs_, max_abs) self.n_samples_seen_ += X.shape[0] @@ -1988,7 +1988,7 @@ def fit(self, K, y=None): self : returns an instance of self. 
""" - K = check_array(K, dtype=FLOAT_DTYPES) + K = self._validate_X(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2298,7 +2298,7 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - X = self._check_inputs(X, copy=False) + X = self._check_inputs(X, in_fit=True, copy=False) n_samples = X.shape[0] if self.n_quantiles > n_samples: @@ -2389,11 +2389,13 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _check_inputs(self, X, accept_sparse_negative=False, copy=False): + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, + copy=False): """Check inputs before fit and transform""" - X = check_array(X, accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -2469,7 +2471,7 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, copy=self.copy) + X = self._check_inputs(X, in_fit=False, copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2490,7 +2492,8 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, accept_sparse_negative=True, copy=self.copy) + X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, + copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -2746,7 +2749,8 @@ def fit_transform(self, X, y=None): return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): - X = self._check_input(X, check_positive=True, check_method=True) + X = self._check_input(X, in_fit=True, check_positive=True, + check_method=True) if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace @@ -2788,7 +2792,8 @@ def transform(self, X): The transformed data. """ check_is_fitted(self) - X = self._check_input(X, check_positive=True, check_shape=True) + X = self._check_input(X, in_fit=False, check_positive=True, + check_shape=True) transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform @@ -2834,7 +2839,7 @@ def inverse_transform(self, X): The original data """ check_is_fitted(self) - X = self._check_input(X, check_shape=True) + X = self._check_input(X, in_fit=False, check_shape=True) if self.standardize: X = self._scaler.inverse_transform(X) @@ -2939,7 +2944,7 @@ def _neg_log_likelihood(lmbda): # choosing bracket -2, 2 like for boxcox return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) - def _check_input(self, X, check_positive=False, check_shape=False, + def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2957,8 +2962,10 @@ def _check_input(self, X, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, - force_all_finite='allow-nan') + check_n_features = not in_fit + X = self._validate_X(X, check_n_features=check_n_features, + ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9d00cb826f58d..4f09cbcc646cf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -271,7 +271,8 @@ def _yield_all_checks(name, estimator): yield check_dict_unchanged yield check_dont_overwrite_parameters yield check_fit_idempotent - yield check_n_features_in + if not tags["no_validation"]: + yield check_n_features_in if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2655,33 +2656,10 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called. - if 'Dummy' in name: - # Dummy estimators don't validate X at all - return if any(x in name for x in ('FastICA', 'KMeans')): # fit calls public function helper and validates there. No way to # access `self` from the helper. return - if 'FunctionTransformer' in name: - # Validation is optional and False by default - return - if 'KernelCenterer' in name: - # Takes kernel K with shape (n_samples, n_samples) as input, not X - return - if any(x in name for x in ('LatentDirichlet', 'MissingIndicator', - 'PowerTransformer', 'QuantileTransformer', - 'SimpleImputer', 'AdaBoost')): - # fit calls private validation method, which is also called for - # predict, transform, etc - return - if any(x in name for x in ('MaxAbsScaler', 'MinMaxScaler')): - # Fit directly calls partial_fit. Don't know what to do with - # partial_fit. - return - if name in 'RidgeCV': - # Uses aggregation from an estimator that is not an attribute. There is - # no way to delegate to this estimator. - return rng = np.random.RandomState(0) @@ -2701,4 +2679,4 @@ def check_n_features_in(name, estimator_orig): assert not hasattr(estimator, 'n_features_in_') estimator.fit(X, y) - assert hasattr(estimator, 'n_features_in_') + assert estimator.n_features_in_ == X.shape[1] From 70dc4ed8d5d5e3e58b86c97e80515d35e2b08da1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 11:56:26 -0400 Subject: [PATCH 15/53] fixed tests in test_data.py --- sklearn/preprocessing/data.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 011a3c2ea6a58..1e4c421e68ae8 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2392,7 +2392,13 @@ def _transform_col(self, X_col, quantiles, inverse): def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): """Check inputs before fit and transform""" - check_n_features = not in_fit + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False + X = self._validate_X(X, check_n_features=check_n_features, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -2962,7 +2968,12 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - check_n_features = not in_fit + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False X = self._validate_X(X, check_n_features=check_n_features, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, force_all_finite='allow-nan') From 988f9c4d93e4dafc14de7c9a35847ccfff8eebf1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 15:26:54 -0400 Subject: [PATCH 16/53] Fixed some tests --- sklearn/decomposition/online_lda.py | 7 ++- sklearn/ensemble/weight_boosting.py | 4 +- sklearn/impute/_knn.py | 5 +- sklearn/utils/tests/test_estimator_checks.py | 55 +++++++++++--------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index b288eb4e512f2..2a1bc9ac27f73 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -500,7 +500,12 @@ def partial_fit(self, X, y=None): """ self._check_params() first_time = not hasattr(self, 'components_') - check_n_features = not first_time + # deactivating check for now (specific tests about error message would + # break) + # TODO: uncomment when addressing check_n_features in + # predict/transform/etc. + # check_n_features = not in_fit + check_n_features = False X = self._check_non_neg_array(X, check_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index ff321de00186b..77f970926553c 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -71,7 +71,7 @@ def __init__(self, self.random_state = random_state def _validate_data(self, X): - return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=False, + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): @@ -101,7 +101,7 @@ def fit(self, X, y, sample_weight=None): X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - ensure_2d=False, + ensure_2d=True, allow_nd=True, dtype=None, y_numeric=is_regressor(self)) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 0837cc9750e0a..55cc5072969f4 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -157,8 +157,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. 
Got {}".format(self.n_neighbors)) - X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_X(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) _check_weights(self.weights) self._fit_X = X diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index e26a508566871..8c3a2a0bd4bf1 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -56,7 +56,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self def predict(self, X): @@ -71,7 +71,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -81,14 +81,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -105,7 +105,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -122,7 +122,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self @@ -141,19 +141,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -165,7 +165,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -178,10 +178,11 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) # Function is only called after we verify that pandas is installed from pandas import Series if isinstance(sample_weight, Series): @@ -218,7 +219,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def fit(self, X, y=None): - X = check_array(X) + X = self._validate_X(X) return self def transform(self, X): @@ -229,10 +230,11 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) return self def predict(self, X): @@ -245,11 +247,12 @@ def predict(self, X): class 
LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc", "coo"), - accept_large_sparse=True, - multi_output=True, - y_numeric=True) + X, y = self._validate_X_y( + X, y, + accept_sparse=("csr", "csc", "coo"), + accept_large_sparse=True, + multi_output=True, + y_numeric=True) if sp.issparse(X): if X.getformat() == "coo": if X.row.dtype == "int64" or X.col.dtype == "int64": @@ -265,7 +268,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = check_array(X).shape + self.X_shape_ = self._validate_X(X).shape return self def fit_transform(self, X, y=None): @@ -296,7 +299,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_X_y(X, y) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y) From fd9b72ce28ab333d4e38628c6dc0ee406a6648e1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 16:58:36 -0400 Subject: [PATCH 17/53] validate twice for Kmeans and FastICA --- sklearn/cluster/k_means_.py | 11 +++++++++-- sklearn/decomposition/fastica_.py | 3 +++ sklearn/utils/estimator_checks.py | 7 +------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index a83df9c836b86..fe332ff06c98c 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -953,6 +953,13 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) + # This validates twice but there is not clean way to avoid validation + # in k_means. + order = "C" if self.copy_x else None + X = self._validate_X(X, accept_sparse='csr', + dtype=[np.float64, np.float32], order=order, + copy=self.copy_x) + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ k_means( X, n_clusters=self.n_clusters, sample_weight=sample_weight, @@ -1482,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index dffce0dc0d8bc..c4dc9114eb5d6 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -501,6 +501,9 @@ def _fit(self, X, compute_sources=False): ------- X_new : array-like, shape (n_samples, n_components) """ + + X = self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args whitening, unmixing, sources, X_mean, self.n_iter_ = fastica( X=X, n_components=self.n_components, algorithm=self.algorithm, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4f09cbcc646cf..b29e8800b2dbf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2654,12 +2654,7 @@ def check_fit_idempotent(name, estimator_orig): def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is - # called. 
- - if any(x in name for x in ('FastICA', 'KMeans')): - # fit calls public function helper and validates there. No way to - # access `self` from the helper. - return + # called, and that its value is correct. rng = np.random.RandomState(0) From 4f3d6fff2a578d3e36f81dc55eab9212ca16f47a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 17:45:46 -0400 Subject: [PATCH 18/53] again --- sklearn/cluster/k_means_.py | 4 ++-- sklearn/decomposition/fastica_.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index fe332ff06c98c..4ad3980775ae6 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -1489,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - X = self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index c4dc9114eb5d6..b32ce59715fbe 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -502,8 +502,8 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ - X = self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T + self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args whitening, unmixing, sources, X_mean, self.n_iter_ = fastica( X=X, n_components=self.n_components, algorithm=self.algorithm, From 08f71924ae54f23054a468667877d48d54089e58 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 5 Sep 2019 18:09:42 -0400 Subject: [PATCH 19/53] and again --- sklearn/cluster/k_means_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 4ad3980775ae6..50af23b8d046c 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -956,9 +956,9 @@ def fit(self, X, y=None, sample_weight=None): # This validates twice but there is not clean way to avoid validation # in k_means. 
order = "C" if self.copy_x else None - X = self._validate_X(X, accept_sparse='csr', - dtype=[np.float64, np.float32], order=order, - copy=self.copy_x) + self._validate_X(X, accept_sparse='csr', + dtype=[np.float64, np.float32], order=order, + copy=self.copy_x) self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ k_means( @@ -1489,8 +1489,8 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_X(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" From f0e7b413b9d8fb9ebe0b1f3a185a7e2b41df0c0d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 6 Sep 2019 09:17:57 -0400 Subject: [PATCH 20/53] should fix dep warning error --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 798c581add8d4..259fe89e712f7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -568,7 +568,7 @@ def n_features_in_(self): # For consistency with other estimators we raise a AttributeError so # that hasattr() fails if the search estimator isn't fitted. try: - check_is_fitted(self, 'best_estimator_') + check_is_fitted(self) except NotFittedError as nfe: raise AttributeError( "{} object has no n_features_in_ attribute." From 193fda1598011ea5f206c8857b0caa91954995ec Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:07:14 -0400 Subject: [PATCH 21/53] removed superfluous tests --- sklearn/tests/test_base.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 372ed7d6ea47d..e25517b5122b7 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -514,20 +514,6 @@ def test_regressormixin_score_multioutput(): assert_warns_message(FutureWarning, msg, reg.score, X, y) -def test_validate_X_n_feature_mismatch(): - # Make sure ValueError is raised when there is a n_features mismatch - # between fit and predict/transform - - X = [[0, 1], [2, 3]] - - ss = StandardScaler().fit(X) - ss.transform(X) # All good - - with pytest.raises(ValueError, match="X has 3 features, but"): - X_more_features = [[0, 1, 4], [2, 3, 5]] - ss.transform(X_more_features) - - def test_validate_X_bad_kwargs(): est = BaseEstimator() @@ -536,20 +522,6 @@ def test_validate_X_bad_kwargs(): est._validate_X([1], bad_param=4) -def test_n_features_in_attribute(): - # Make sure n_features_in_ is correctly set. 
- # TODO: eventually move this in estimator_checks - X_2 = [[0, 1], [2, 3]] - X_3 = [[0, 1, 4], [2, 3, 5]] - - ss = StandardScaler() - assert not hasattr(ss, 'n_features_in_') - ss.fit(X_2) - assert ss.n_features_in_ == 2 - ss = ss.fit(X_3) - assert ss.n_features_in_ == 3 - - def test_warns_on_get_params_non_attribute(): class MyEstimator(BaseEstimator): def __init__(self, param=5): From 5b20a4c0b20352a15c61bdb448f64d7b2644fe91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:28:25 -0400 Subject: [PATCH 22/53] Added specific tests for vectorizers --- sklearn/feature_extraction/tests/test_text.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 7b7697ff47fff..a6170e4efbec2 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1343,3 +1343,16 @@ def test_unused_parameters_warn(Vectorizer, stop_words, ) with pytest.warns(UserWarning, match=msg): vect.fit(train_data) + + +@pytest.mark.parametrize('Vectorizer, X', ( + (HashingVectorizer, [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]), + (CountVectorizer, JUNK_FOOD_DOCS)) +) +def test_n_features_in(Vectorizer, X): + # For vectorizers, n_features_in_ does not make sense and it is always + # None + vectorizer = Vectorizer() + assert vectorizer.n_features_in_ is None + vectorizer.fit(X) + assert vectorizer.n_features_in_ is None From a49e5eaee5c3b1325353e6c4853ccda6cf1c2bd3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 8 Sep 2019 17:29:28 -0400 Subject: [PATCH 23/53] flake8 --- sklearn/tests/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index e25517b5122b7..0d365b9ba882a 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -16,7 +16,6 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV -from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor From 968fbff2deb0eb95ada2ad4efc80521d9a9d5ab8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Sep 2019 11:24:30 -0400 Subject: [PATCH 24/53] Dummies now have n_feautures_in_ to None and raise error if not fitted --- sklearn/dummy.py | 29 +++++++++++++++++++++++++++++ sklearn/tests/test_dummy.py | 10 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index ab79321bd4fa3..b4d173b76bc41 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,6 +17,7 @@ from .utils.random import random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution +from .exceptions import NotFittedError class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): @@ -352,6 +353,20 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return None # Dummies don't validate the input + class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): """ @@ -551,3 +566,17 @@ def score(self, X, y, sample_weight=None): if X is None: X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return None # Dummies don't validate the input diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 4301a4c07654f..1e9623a27e59d 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -737,3 +737,13 @@ def test_dtype_of_classifier_probas(strategy): probas = model.fit(X, y).predict_proba(X) assert probas.dtype == np.float64 + + +@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier)) +def test_n_features_in_(Dummy): + X = [[1, 2]] + y = [0] + d = Dummy() + assert not hasattr(d, 'n_features_in_') + d.fit(X, y) + assert d.n_features_in_ is None From e4faf13bdb63f037fa4e0bd8d03efe1c0d006cdf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 9 Sep 2019 11:31:13 -0400 Subject: [PATCH 25/53] still don't check n_features_in_ for LDA (will be done in later PR) --- sklearn/decomposition/online_lda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 2a1bc9ac27f73..3c0bcb9372bd9 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -621,7 +621,7 @@ def _unnormalized_transform(self, X): # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array( - X, check_n_features=True, + X, check_n_features=False, whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: @@ -746,7 +746,7 @@ def score(self, X, y=None): score : float Use approximate bound as score. 
""" - X = self._check_non_neg_array(X, check_n_features=True, + X = self._check_non_neg_array(X, check_n_features=False, whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) @@ -777,7 +777,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, check_is_fitted(self) X = self._check_non_neg_array( - X, check_n_features=True, + X, check_n_features=False, whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: From a88a4c5f40a9ab0e7d2774918fa44ae688224d54 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Sep 2019 13:37:01 -0400 Subject: [PATCH 26/53] Added tests for some estimators --- sklearn/cluster/bicluster.py | 2 +- sklearn/cluster/tests/test_bicluster.py | 10 ++++++++++ sklearn/ensemble/tests/test_voting.py | 20 ++++++++++++++++++++ sklearn/model_selection/tests/test_search.py | 7 ++++++- sklearn/utils/estimator_checks.py | 1 + 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 5bfd335549012..3b54df43fe295 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -115,7 +115,7 @@ def fit(self, X, y=None): y : Ignored """ - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 1d88769f238aa..6044c90d11412 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -256,3 +256,13 @@ def test_wrong_shape(): data = np.arange(27).reshape((3, 3, 3)) with pytest.raises(ValueError): model.fit(data) + + +@pytest.mark.parametrize('est', (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, 'n_features_in_') + est.fit(X) + assert est.n_features_in_ == 3 diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index a02efe4d925d8..bbfb91751726a 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -528,3 +528,23 @@ def test_check_estimators_voting_estimator(estimator): # their testing parameters (for required parameters). 
check_estimator(estimator) check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) + + +@pytest.mark.parametrize( + "est", + [VotingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))]), + VotingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))])], + ids=['VotingRegressor', 'VotingClassifier'] +) +def test_n_features_in(est): + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, 'n_features_in_') + est.fit(X, y) + assert est.n_features_in_ == 2 diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index dd92f144fac32..3ca0cf4f4cc5a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1780,12 +1780,17 @@ def get_n_splits(self, *args, **kw): def test_n_features_in(): - # make sure grid search delegates n_features_in to the best estimator + # make sure grid search and random search delegate n_features_in to the + # best estimator n_features = 4 X, y = make_classification(n_features=n_features) gbdt = HistGradientBoostingClassifier() param_grid = {'max_iter': [3, 4]} gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) assert not hasattr(gs, 'n_features_in_') + assert not hasattr(rs, 'n_features_in_') gs.fit(X, y) + rs.fit(X, y) assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2cc9daf54be32..c3a0fb3d28c4e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2664,6 +2664,7 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called, and that its value is correct. + print(name) rng = np.random.RandomState(0) estimator = clone(estimator_orig) From f3fb539fbdbb61215cc12d1f61586cd63278049e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 11 Sep 2019 13:59:37 -0400 Subject: [PATCH 27/53] removed NonRectangularInputMixin and set n_features_in to SparseCoder --- sklearn/base.py | 9 --------- sklearn/cluster/tests/test_bicluster.py | 3 ++- sklearn/decomposition/dict_learning.py | 1 + sklearn/decomposition/tests/test_dict_learning.py | 6 ++++++ sklearn/feature_extraction/dict_vectorizer.py | 5 ++--- sklearn/feature_extraction/tests/test_dict_vectorizer.py | 4 ++-- sklearn/feature_extraction/tests/test_text.py | 7 +++---- sklearn/feature_extraction/text.py | 4 ++-- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 5b8f7637c5261..36a849d2e2c1e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -583,15 +583,6 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y, **fit_params).transform(X) -class NonRectangularInputMixin: - """Mixin class for all estimators with non-rectangular input. - - For now only vectorizers are relevant for this mixin. 
- """ - - n_features_in_ = None - - class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" _estimator_type = "DensityEstimator" diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 6044c90d11412..5057480572a6b 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -258,7 +258,8 @@ def test_wrong_shape(): model.fit(data) -@pytest.mark.parametrize('est', (SpectralBiclustering(), SpectralCoclustering())) +@pytest.mark.parametrize('est', + (SpectralBiclustering(), SpectralCoclustering())) def test_n_features_in_(est): X, _, _ = make_biclusters((3, 3), 3, random_state=0) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index ce3ed2aa44978..9ed705d680059 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1024,6 +1024,7 @@ def __init__(self, dictionary, transform_algorithm='omp', transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter) self.components_ = dictionary + self.n_features_in_ = dictionary.shape[1] def fit(self, X, y=None): """Do nothing and return the estimator unchanged diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 54c5ece561f18..af8a1869626f3 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -498,3 +498,9 @@ def test_sparse_coder_parallel_mmap(): sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 8ba68ea19efaf..857806c892806 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -9,7 +9,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin +from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence @@ -21,8 +21,7 @@ def _tosequence(X): return tosequence(X) -class DictVectorizer(NonRectangularInputMixin, TransformerMixin, - BaseEstimator): +class DictVectorizer(TransformerMixin, BaseEstimator): """Transforms lists of feature-value mappings to vectors. 
This transformer turns lists of mappings (dict-like objects) of feature diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 32a14fe82be5b..a65feb2d7590b 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -116,7 +116,7 @@ def test_n_features_in(): # For vectorizers, n_features_in_ does not make sense and it is always # None dv = DictVectorizer() - assert dv.n_features_in_ is None + assert not hasattr(dv, 'n_features_in_') d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] dv.fit(d) - assert dv.n_features_in_ is None + assert not hasattr(dv, 'n_features_in_') diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index a6170e4efbec2..f775589fb6a8a 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1350,9 +1350,8 @@ def test_unused_parameters_warn(Vectorizer, stop_words, (CountVectorizer, JUNK_FOOD_DOCS)) ) def test_n_features_in(Vectorizer, X): - # For vectorizers, n_features_in_ does not make sense and it is always - # None + # For vectorizers, n_features_in_ does not make sense vectorizer = Vectorizer() - assert vectorizer.n_features_in_ is None + assert not hasattr(vectorizer, 'n_features_in_') vectorizer.fit(X) - assert vectorizer.n_features_in_ is None + assert not hasattr(vectorizer, 'n_features_in_') diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 908cfcd526677..610a9f66c58ff 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,7 +25,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin, NonRectangularInputMixin +from ..base import BaseEstimator, TransformerMixin from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS @@ -181,7 +181,7 @@ def _check_stop_list(stop): return frozenset(stop) -class VectorizerMixin(NonRectangularInputMixin): +class VectorizerMixin: """Provides common code for text vectorizers (tokenization logic).""" _white_spaces = re.compile(r"\s\s+") From 4b7b7581ab0fd3c379d17fe29b331ebedce05f3e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Sep 2019 09:13:23 -0400 Subject: [PATCH 28/53] simpler logic for dummies --- sklearn/dummy.py | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index d4c9ed730caa0..f0d665704377e 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -129,6 +129,7 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] check_consistent_length(X, y, sample_weight) + self.n_features_in_ = None # No input validation is done for X if self.strategy == "constant": if self.constant is None: @@ -356,20 +357,6 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." 
- .format(self.__class__.__name__) - ) from nfe - - return None # Dummies don't validate the input - class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): """ @@ -441,6 +428,7 @@ def fit(self, X, y, sample_weight=None): % (self.strategy, allowed_strategies)) y = check_array(y, ensure_2d=False) + self.n_features_in_ = None # No input validation is done for X if len(y) == 0: raise ValueError("y must not be empty.") @@ -569,17 +557,3 @@ def score(self, X, y, sample_weight=None): if X is None: X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__) - ) from nfe - - return None # Dummies don't validate the input From 53027d36eadb8f1df06a768238bdbac04cf060da Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 12 Sep 2019 09:21:58 -0400 Subject: [PATCH 29/53] comments --- sklearn/cluster/k_means_.py | 2 +- sklearn/decomposition/fastica_.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 50af23b8d046c..7a398e2985e0f 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -954,7 +954,7 @@ def fit(self, X, y=None, sample_weight=None): random_state = check_random_state(self.random_state) # This validates twice but there is not clean way to avoid validation - # in k_means. + # in k_means(). Please see issue 14897. order = "C" if self.copy_x else None self._validate_X(X, accept_sparse='csr', dtype=[np.float64, np.float32], order=order, diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index b32ce59715fbe..7815ccd2b0ae8 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -502,6 +502,8 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ + # This validates twice but there is not clean way to avoid validation + # in fastica(). Please see issue 14897. 
self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args From a1aea70fc8b05b0156cbd609a89688b0aa9d9a15 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 15 Sep 2019 18:22:30 -0400 Subject: [PATCH 30/53] pep8 --- sklearn/dummy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index fad508926ae83..6dc524b778e45 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,7 +17,6 @@ from .utils.random import random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution -from .exceptions import NotFittedError from .utils import deprecated From 9ecc396079fa43f887a62414578bbddec59c130d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 15 Sep 2019 18:24:33 -0400 Subject: [PATCH 31/53] remove print --- sklearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3e8d65a584e00..285b318589d3d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2671,7 +2671,6 @@ def check_n_features_in(name, estimator_orig): # Make sure that n_features_in_ attribute doesn't exist until fit is # called, and that its value is correct. - print(name) rng = np.random.RandomState(0) estimator = clone(estimator_orig) From 9292c8459c54a7b3f1dd190bd7f335ffc1373a73 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 16 Sep 2019 11:05:23 -0400 Subject: [PATCH 32/53] avoid dep warning --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 285b318589d3d..5a10bf6db88fe 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2685,7 +2685,7 @@ def check_n_features_in(name, estimator_orig): y = rng.normal(size=n_samples) else: y = rng.randint(low=0, high=2, size=n_samples) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) assert not hasattr(estimator, 'n_features_in_') estimator.fit(X, y) From 6846bea5babb61dacdabb69bee5bd343074e847b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 19 Sep 2019 08:50:29 -0400 Subject: [PATCH 33/53] merged (maybe) --- doc/glossary.rst | 1 + doc/modules/classes.rst | 4 + doc/modules/ensemble.rst | 138 +++- doc/modules/neighbors.rst | 91 ++- doc/whats_new/v0.22.rst | 39 + examples/ensemble/plot_stack_predictors.py | 123 +++ .../approximate_nearest_neighbors.py | 293 ++++++++ .../plot_caching_nearest_neighbors.py | 60 ++ examples/neighbors/plot_nca_illustration.py | 59 +- sklearn/cluster/dbscan_.py | 36 +- sklearn/cluster/spectral.py | 31 +- sklearn/cluster/tests/test_dbscan.py | 17 + sklearn/cluster/tests/test_spectral.py | 20 + sklearn/datasets/samples_generator.py | 32 +- .../datasets/tests/test_samples_generator.py | 30 + sklearn/ensemble/__init__.py | 3 + sklearn/ensemble/_stacking.py | 704 ++++++++++++++++++ sklearn/ensemble/base.py | 17 + sklearn/ensemble/tests/test_stacking.py | 492 ++++++++++++ sklearn/ensemble/tests/test_voting.py | 4 +- sklearn/ensemble/voting.py | 18 +- sklearn/manifold/_utils.pyx | 30 +- sklearn/manifold/isomap.py | 78 +- sklearn/manifold/locally_linear.py | 2 +- sklearn/manifold/spectral_embedding_.py | 40 +- sklearn/manifold/t_sne.py | 116 ++- sklearn/manifold/tests/test_isomap.py | 52 ++ .../manifold/tests/test_spectral_embedding.py | 34 +- 
sklearn/manifold/tests/test_t_sne.py | 183 ++--- sklearn/neighbors/__init__.py | 3 + sklearn/neighbors/base.py | 389 ++++++++-- sklearn/neighbors/classification.py | 49 +- sklearn/neighbors/graph.py | 319 +++++++- sklearn/neighbors/lof.py | 21 +- sklearn/neighbors/regression.py | 24 +- sklearn/neighbors/tests/test_graph.py | 79 ++ sklearn/neighbors/tests/test_neighbors.py | 184 ++++- .../tests/test_neighbors_pipeline.py | 221 ++++++ sklearn/neighbors/unsupervised.py | 32 +- sklearn/tests/test_common.py | 5 +- sklearn/utils/estimator_checks.py | 43 +- 41 files changed, 3603 insertions(+), 513 deletions(-) create mode 100644 examples/ensemble/plot_stack_predictors.py create mode 100644 examples/neighbors/approximate_nearest_neighbors.py create mode 100644 examples/neighbors/plot_caching_nearest_neighbors.py create mode 100644 sklearn/ensemble/_stacking.py create mode 100644 sklearn/ensemble/tests/test_stacking.py create mode 100644 sklearn/neighbors/tests/test_graph.py create mode 100644 sklearn/neighbors/tests/test_neighbors_pipeline.py diff --git a/doc/glossary.rst b/doc/glossary.rst index 99f512cc49acc..1c5535cdc2bb4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -697,6 +697,7 @@ General Concepts to :term:`unlabeled` samples in semi-supervised classification. sparse matrix + sparse graph A representation of two-dimensional numeric data that is more memory efficient the corresponding dense numpy array where almost all elements are zero. We use the :mod:`scipy.sparse` framework, which provides diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 16658a39b1612..444895245bf6b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -422,6 +422,8 @@ Samples generator ensemble.RandomForestClassifier ensemble.RandomForestRegressor ensemble.RandomTreesEmbedding + ensemble.StackingClassifier + ensemble.StackingRegressor ensemble.VotingClassifier ensemble.VotingRegressor ensemble.HistGradientBoostingRegressor @@ -1234,9 +1236,11 @@ Model validation neighbors.KernelDensity neighbors.KNeighborsClassifier neighbors.KNeighborsRegressor + neighbors.KNeighborsTransformer neighbors.LocalOutlierFactor neighbors.RadiusNeighborsClassifier neighbors.RadiusNeighborsRegressor + neighbors.RadiusNeighborsTransformer neighbors.NearestCentroid neighbors.NearestNeighbors neighbors.NeighborhoodComponentsAnalysis diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b023b4711c57f..02be4f4cff624 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -722,7 +722,7 @@ The parameter ``learning_rate`` strongly interacts with the parameter ``n_estimators``, the number of weak learners to fit. Smaller values of ``learning_rate`` require larger numbers of weak learners to maintain a constant training error. Empirical evidence suggests that small -values of ``learning_rate`` favor better test error. [HTF2009]_ +values of ``learning_rate`` favor better test error. [HTF]_ recommend to set the learning rate to a small constant (e.g. ``learning_rate <= 0.1``) and choose ``n_estimators`` by early stopping. For a more detailed discussion of the interaction between @@ -1056,7 +1056,9 @@ The following example shows how to fit the majority rule classifier:: >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1) >>> clf3 = GaussianNB() - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... 
voting='hard') >>> for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']): ... scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5) @@ -1142,7 +1144,10 @@ hyperparameters of the individual estimators:: >>> clf1 = LogisticRegression(random_state=1) >>> clf2 = RandomForestClassifier(random_state=1) >>> clf3 = GaussianNB() - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft' + ... ) >>> params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]} @@ -1156,13 +1161,17 @@ In order to predict the class labels based on the predicted class-probabilities (scikit-learn estimators in the VotingClassifier must support ``predict_proba`` method):: - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft' + ... ) Optionally, weights can be provided for the individual classifiers:: - >>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], - ... voting='soft', weights=[2, 5, 1]) - + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft', weights=[2,5,1] + ... ) .. _voting_regressor: @@ -1187,7 +1196,7 @@ The following example shows how to fit the VotingRegressor:: >>> # Loading some example data >>> X, y = load_boston(return_X_y=True) - + >>> # Training classifiers >>> reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10) >>> reg2 = RandomForestRegressor(random_state=1, n_estimators=10) @@ -1203,3 +1212,116 @@ The following example shows how to fit the VotingRegressor:: .. topic:: Examples: * :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py` + +.. _stacking: + +Stacked generalization +====================== + +Stacked generalization is a method for combining estimators to reduce their +biases [W1992]_ [HTF]_. More precisely, the predictions of each individual +estimator are stacked together and used as input to a final estimator to +compute the prediction. This final estimator is trained through +cross-validation. + +The :class:`StackingClassifier` and :class:`StackingRegressor` provide such +strategies which can be applied to classification and regression problems. + +The `estimators` parameter corresponds to the list of the estimators which +are stacked together in parallel on the input data. It should be given as a +list of names and estimators:: + + >>> from sklearn.linear_model import RidgeCV, LassoCV + >>> from sklearn.svm import SVR + >>> estimators = [('ridge', RidgeCV()), + ... ('lasso', LassoCV(random_state=42)), + ... ('svr', SVR(C=1, gamma=1e-6))] + +The `final_estimator` will use the predictions of the `estimators` as input. It +needs to be a classifier or a regressor when using :class:`StackingClassifier` +or :class:`StackingRegressor`, respectively:: + + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> reg = StackingRegressor( + ... estimators=estimators, + ... 
final_estimator=GradientBoostingRegressor(random_state=42))
+
+To train the `estimators` and `final_estimator`, the `fit` method needs
+to be called on the training data::
+
+    >>> from sklearn.datasets import load_boston
+    >>> X, y = load_boston(return_X_y=True)
+    >>> from sklearn.model_selection import train_test_split
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
+    ...                                                     random_state=42)
+    >>> reg.fit(X_train, y_train)
+    StackingRegressor(...)
+
+During training, the `estimators` are fitted on the whole training data
+`X_train`. They will be used when calling `predict` or `predict_proba`. To
+generalize and avoid over-fitting, the `final_estimator` is trained on
+out-of-sample predictions, using
+:func:`sklearn.model_selection.cross_val_predict` internally.
+
+For :class:`StackingClassifier`, note that the output of the ``estimators`` is
+controlled by the parameter `stack_method`, which specifies the method called
+on each estimator. This parameter is either a string giving an estimator
+method name, or `'auto'`, which automatically picks the first available
+method, tested in the order of preference: `predict_proba`,
+`decision_function` and `predict`.
+
+A :class:`StackingRegressor` and :class:`StackingClassifier` can be used as
+any other regressor or classifier, exposing `predict`, `predict_proba`, and
+`decision_function` methods, e.g.::
+
+    >>> y_pred = reg.predict(X_test)
+    >>> from sklearn.metrics import r2_score
+    >>> print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))
+    R2 score: 0.81
+
+Note that it is also possible to get the stacked outputs of the
+`estimators` using the `transform` method::
+
+    >>> reg.transform(X_test[:5])
+    array([[28.78..., 28.43... , 22.62...],
+           [35.96..., 32.58..., 23.68...],
+           [14.97..., 14.05..., 16.45...],
+           [25.19..., 25.54..., 22.92...],
+           [18.93..., 19.26..., 17.03... ]])
+
+In practice, a stacking predictor predicts as well as the best predictor of
+the base layer, and sometimes even outperforms it by combining the different
+strengths of these predictors. However, training a stacking predictor is
+computationally expensive.
+
+.. note::
+   For :class:`StackingClassifier`, when using `stack_method_='predict_proba'`,
+   the first column is dropped when the problem is a binary classification
+   problem. Indeed, both probability columns predicted by each estimator are
+   perfectly collinear.
+
+.. note::
+   Multiple stacking layers can be achieved by assigning `final_estimator` to
+   a :class:`StackingClassifier` or :class:`StackingRegressor`::
+
+     >>> final_layer = StackingRegressor(
+     ...     estimators=[('rf', RandomForestRegressor(random_state=42)),
+     ...                 ('gbrt', GradientBoostingRegressor(random_state=42))],
+     ...     final_estimator=RidgeCV()
+     ...     )
+     >>> multi_layer_regressor = StackingRegressor(
+     ...     estimators=[('ridge', RidgeCV()),
+     ...                 ('lasso', LassoCV(random_state=42)),
+     ...                 ('svr', SVR(C=1, gamma=1e-6, kernel='rbf'))],
+     ...     final_estimator=final_layer
+     ...     )
+     >>> multi_layer_regressor.fit(X_train, y_train)
+     StackingRegressor(...)
+     >>> print('R2 score: {:.2f}'
+     ...       .format(multi_layer_regressor.score(X_test, y_test)))
+     R2 score: 0.82
+
+.. topic:: References
+
+   .. [W1992] Wolpert, David H. "Stacked generalization." Neural networks 5.2
+      (1992): 241-259.
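The doctests above only exercise :class:`StackingRegressor`; the classifier
counterpart is described in prose only. As a purely illustrative sketch (the
iris data and the choice of `LinearSVC` base estimator and
`LogisticRegression` final estimator are arbitrary, not prescribed by this
patch), :class:`StackingClassifier` follows the same pattern::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    clf = StackingClassifier(
        estimators=[('rf', RandomForestClassifier(random_state=42)),
                    ('svc', LinearSVC(random_state=42))],
        # 'auto' resolves to predict_proba for the forest and to
        # decision_function for LinearSVC (which has no predict_proba)
        stack_method='auto',
        final_estimator=LogisticRegression())

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

With `stack_method='auto'`, the forest contributes `predict_proba` outputs
while `LinearSVC` falls back to `decision_function`, matching the preference
order described above.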
diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index e07e66e833919..7c7da8f0afdb7 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -135,8 +135,8 @@ have the same interface; we'll show an example of using the KD Tree here: Refer to the :class:`KDTree` and :class:`BallTree` class documentation for more information on the options available for nearest neighbors searches, -including specification of query strategies, distance metrics, etc. For a list -of available metrics, see the documentation of the :class:`DistanceMetric` +including specification of query strategies, distance metrics, etc. For a list +of available metrics, see the documentation of the :class:`DistanceMetric` class. .. _classification: @@ -160,8 +160,8 @@ training point, where :math:`r` is a floating-point value specified by the user. The :math:`k`-neighbors classification in :class:`KNeighborsClassifier` -is the most commonly used technique. The optimal choice of the value :math:`k` -is highly data-dependent: in general a larger :math:`k` suppresses the effects +is the most commonly used technique. The optimal choice of the value :math:`k` +is highly data-dependent: in general a larger :math:`k` suppresses the effects of noise, but makes the classification boundaries less distinct. In cases where the data is not uniformly sampled, radius-based neighbors @@ -320,7 +320,7 @@ To address the inefficiencies of KD Trees in higher dimensions, the *ball tree* data structure was developed. Where KD trees partition data along Cartesian axes, ball trees partition data in a series of nesting hyper-spheres. This makes tree construction more costly than that of the -KD tree, but results in a data structure which can be very efficient on +KD tree, but results in a data structure which can be very efficient on highly structured data, even in very high dimensions. A ball tree recursively divides the data into @@ -509,6 +509,87 @@ the model from 0.81 to 0.82. * :ref:`sphx_glr_auto_examples_neighbors_plot_nearest_centroid.py`: an example of classification using nearest centroid with different shrink thresholds. +.. _neighbors_transformer: + +Nearest Neighbors Transformer +============================= + +Many scikit-learn estimators rely on nearest neighbors: Several classifiers and +regressors such as :class:`KNeighborsClassifier` and +:class:`KNeighborsRegressor`, but also some clustering methods such as +:class:`~sklearn.cluster.DBSCAN` and +:class:`~sklearn.cluster.SpectralClustering`, and some manifold embeddings such +as :class:`~sklearn.manifold.TSNE` and :class:`~sklearn.manifold.Isomap`. + +All these estimators can compute internally the nearest neighbors, but most of +them also accept precomputed nearest neighbors :term:`sparse graph`, +as given by :func:`~sklearn.neighbors.kneighbors_graph` and +:func:`~sklearn.neighbors.radius_neighbors_graph`. With mode +`mode='connectivity'`, these functions return a binary adjacency sparse graph +as required, for instance, in :class:`~sklearn.cluster.SpectralClustering`. +Whereas with `mode='distance'`, they return a distance sparse graph as required, +for instance, in :class:`~sklearn.cluster.DBSCAN`. To include these functions in +a scikit-learn pipeline, one can also use the corresponding classes +:class:`KNeighborsTransformer` and :class:`RadiusNeighborsTransformer`. +The benefits of this sparse graph API are multiple. + +First, the precomputed graph can be re-used multiple times, for instance while +varying a parameter of the estimator. 
This can be done manually by the user, or
+using the caching properties of the scikit-learn pipeline:
+
+    >>> from sklearn.manifold import Isomap
+    >>> from sklearn.neighbors import KNeighborsTransformer
+    >>> from sklearn.pipeline import make_pipeline
+    >>> estimator = make_pipeline(
+    ...     KNeighborsTransformer(n_neighbors=5, mode='distance'),
+    ...     Isomap(neighbors_algorithm='precomputed'),
+    ...     memory='/path/to/cache')
+
+Second, precomputing the graph can give finer control over the nearest
+neighbors estimation, for instance enabling multiprocessing through the
+parameter `n_jobs`, which might not be available in all estimators.
+
+Finally, the precomputation can be performed by custom estimators to use
+different implementations, such as approximate nearest neighbors methods, or
+implementations with special data types. The precomputed neighbors
+:term:`sparse graph` needs to be formatted as in
+:func:`~sklearn.neighbors.radius_neighbors_graph` output:
+
+* a CSR matrix (although COO, CSC or LIL will be accepted).
+* only explicitly store nearest neighborhoods of each sample with respect to
+  the training data. This should include those at 0 distance from a query
+  point, including the matrix diagonal when computing the nearest
+  neighborhoods between the training data and itself.
+* each row's `data` should store the distances in increasing order (optional;
+  unsorted data will be stable-sorted, adding a computational overhead).
+* all values in `data` should be non-negative.
+* there should be no duplicate `indices` in any row
+  (see https://github.com/scipy/scipy/issues/5807).
+* if the algorithm being passed the precomputed matrix uses k nearest
+  neighbors (as opposed to radius neighborhood), at least k neighbors must be
+  stored in each row (or k+1, as explained in the following note).
+
+.. note::
+   When a specific number of neighbors is queried (using
+   :class:`KNeighborsTransformer`), the definition of `n_neighbors` is
+   ambiguous since it can either include each training point as its own
+   neighbor, or exclude them. Neither choice is perfect, since including them
+   leads to a different number of non-self neighbors during training and
+   testing, while excluding them leads to a difference between
+   `fit(X).transform(X)` and `fit_transform(X)`, which is against the
+   scikit-learn API. In :class:`KNeighborsTransformer` we use the definition
+   which includes each training point as its own neighbor in the count of
+   `n_neighbors`. However, for compatibility reasons with other estimators
+   which use the other definition, one extra neighbor will be computed when
+   `mode == 'distance'`. To maximise compatibility with all estimators, a
+   safe choice is to always include one extra neighbor in a custom nearest
+   neighbors estimator, since unnecessary neighbors will be filtered by
+   following estimators.
+
+.. topic:: Examples:
+
+  * :ref:`sphx_glr_auto_examples_neighbors_neighbors_in_pipeline_api.py`: an
+    example of pipelining KNeighborsTransformer and TSNE, and of two custom
+    nearest neighbors estimators based on external packages.
 
 .. _nca:
 
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 9f60a735556cd..a793c4913ca5c 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -69,6 +69,13 @@ Changelog
   `sample_weights` are not supported by the wrapped estimator).
   :pr:`13575` by :user:`William de Vazelhes `.
 
+:mod:`sklearn.cluster`
+......................
+
+- |Feature| :class:`cluster.SpectralClustering` now accepts precomputed sparse
+  neighbors graph as input.
:issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + :mod:`sklearn.compose` ...................... @@ -111,6 +118,10 @@ Changelog :func:`datasets.fetch_20newsgroups` and :func:`datasets.fetch_olivetti_faces` . :pr:`14259` by :user:`Sourav Singh `. +- |Enhancement| :func:`datasets.make_classification` now accepts array-like + `weights` parameter, i.e. list or numpy.array, instead of list only. + :pr:`14764` by :user:`Cat Chenal `. + - |Fix| Fixed a bug in :func:`datasets.fetch_openml`, which failed to load an OpenML dataset that contains an ignored feature. :pr:`14623` by :user:`Sarra Habchi `. @@ -152,6 +163,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| Added :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` to stack predictors using a final + classifier or regressor. + :pr:`11047` by :user:`Guillaume Lemaitre ` and + :user:`Caio Oliveira `. + - Many improvements were made to :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor`: @@ -314,6 +331,14 @@ Changelog :mod:`sklearn.manifold` ....................... +- |Feature| :class:`manifold.Isomap`, :class:`manifold.TSNE`, and + :class:`manifold.SpectralEmbedding` now accept precomputed sparse + neighbors graph as input. :issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + +- |API| Deprecate ``training_data_`` unused attribute in + :class:`manifold.Isomap`. :issue:`10482` by `Tom Dupre la Tour`_. + - |Fix| Fixed a bug where :func:`manifold.spectral_embedding` (and therefore :class:`manifold.SpectralEmbedding` and :class:`cluster.SpectralClustering`) computed wrong eigenvalues with ``eigen_solver='amg'`` when @@ -397,6 +422,20 @@ Changelog :mod:`sklearn.neighbors` ........................ +- |MajorFeature| Added :class:`neighbors.KNeighborsTransformer` and + :class:`neighbors.RadiusNeighborsTransformer`, which transform input dataset + into a sparse neighbors graph. They give finer control on nearest neighbors + computations and enable easy pipeline caching for multiple use. + :issue:`10482` by `Tom Dupre la Tour`_. + +- |Feature| :class:`neighbors.KNeighborsClassifier`, + :class:`neighbors.KNeighborsRegressor`, + :class:`neighbors.RadiusNeighborsClassifier`, + :class:`neighbors.RadiusNeighborsRegressor`, and + :class:`neighbors.LocalOutlierFactor` now accept precomputed sparse + neighbors graph as input. :issue:`10482` by `Tom Dupre la Tour`_ and + :user:`Kumar Ashutosh `. + - |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports predicting probabilities by using `predict_proba` and supports more outlier_label options: 'most_frequent', or different outlier_labels diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py new file mode 100644 index 0000000000000..2c10ac1b362e7 --- /dev/null +++ b/examples/ensemble/plot_stack_predictors.py @@ -0,0 +1,123 @@ +""" +================================= +Combine predictors using stacking +================================= + +Stacking refers to a method to blend estimators. In this strategy, some +estimators are individually fitted on some training data while a final +estimator is trained using the stacked predictions of these base estimators. + +In this example, we illustrate the use case in which different regressors are +stacked together and a final linear penalized regressor is used to output the +prediction. We compare the performance of each individual regressor with the +stacking strategy. 
Stacking slightly improves the overall performance. + +""" +print(__doc__) + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +############################################################################### +# The function ``plot_regression_results`` is used to plot the predicted and +# true targets. + +import matplotlib.pyplot as plt + + +def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): + """Scatter plot of the predicted vs true targets.""" + ax.plot([y_true.min(), y_true.max()], + [y_true.min(), y_true.max()], + '--r', linewidth=2) + ax.scatter(y_true, y_pred, alpha=0.2) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + ax.set_xlim([y_true.min(), y_true.max()]) + ax.set_ylim([y_true.min(), y_true.max()]) + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False, + edgecolor='none', linewidth=0) + ax.legend([extra], [scores], loc='upper left') + title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time) + ax.set_title(title) + + +############################################################################### +# Stack of predictors on a single data set +############################################################################### +# It is sometimes tedious to find the model which will best perform on a given +# dataset. Stacking provide an alternative by combining the outputs of several +# learners, without the need to choose a model specifically. The performance of +# stacking is usually close to the best model and sometimes it can outperform +# the prediction performance of each individual model. +# +# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor +# to combine their outputs together. + +from sklearn.ensemble import StackingRegressor +from sklearn.ensemble import RandomForestRegressor +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.linear_model import LassoCV +from sklearn.linear_model import RidgeCV + +estimators = [ + ('Random Forest', RandomForestRegressor(random_state=42)), + ('Lasso', LassoCV()), + ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0)) +] +stacking_regressor = StackingRegressor( + estimators=estimators, final_estimator=RidgeCV() +) + + +############################################################################### +# We used the Boston data set (prediction of house prices). We check the +# performance of each individual predictor as well as the stack of the +# regressors. 
+
+import time
+import numpy as np
+from sklearn.datasets import load_boston
+from sklearn.model_selection import cross_validate, cross_val_predict
+
+X, y = load_boston(return_X_y=True)
+
+fig, axs = plt.subplots(2, 2, figsize=(9, 7))
+axs = np.ravel(axs)
+
+for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
+                                               stacking_regressor)]):
+    start_time = time.time()
+    score = cross_validate(est, X, y,
+                           scoring=['r2', 'neg_mean_absolute_error'],
+                           n_jobs=-1, verbose=0)
+    elapsed_time = time.time() - start_time
+
+    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
+    plot_regression_results(
+        ax, y, y_pred,
+        name,
+        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
+        .format(np.mean(score['test_r2']),
+                np.std(score['test_r2']),
+                -np.mean(score['test_neg_mean_absolute_error']),
+                np.std(score['test_neg_mean_absolute_error'])),
+        elapsed_time)
+
+plt.suptitle('Single predictors versus stacked predictors')
+plt.tight_layout()
+plt.subplots_adjust(top=0.9)
+plt.show()
+
+###############################################################################
+# The stacked regressor will combine the strengths of the different
+# regressors. However, we also see that training the stacked regressor is
+# much more computationally expensive.
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
new file mode 100644
index 0000000000000..b24087f4cb593
--- /dev/null
+++ b/examples/neighbors/approximate_nearest_neighbors.py
@@ -0,0 +1,293 @@
+"""
+=====================================
+Approximate nearest neighbors in TSNE
+=====================================
+
+This example presents how to chain KNeighborsTransformer and TSNE in a
+pipeline, and how to wrap the packages `annoy` and `nmslib` to replace
+KNeighborsTransformer and perform approximate nearest neighbors.
+These packages can be installed with `pip install annoy nmslib`.
+
+Note: Currently TSNE(metric='precomputed') does not modify the precomputed
+distances, and thus assumes that precomputed euclidean distances are squared.
+In future versions, a parameter in TSNE will control the optional squaring of
+precomputed distances (see #12401).
+
+Note: In :class:`KNeighborsTransformer` we use the definition which includes
+each training point as its own neighbor in the count of `n_neighbors`, and for
+compatibility reasons, one extra neighbor is computed when
+`mode == 'distance'`. Please note that we do the same in the proposed wrappers.
+ +Sample output: + +Benchmarking on MNIST_2000: +--------------------------- +AnnoyTransformer: 0.583 sec +NMSlibTransformer: 0.321 sec +KNeighborsTransformer: 1.225 sec +TSNE with AnnoyTransformer: 4.903 sec +TSNE with NMSlibTransformer: 5.009 sec +TSNE with KNeighborsTransformer: 6.210 sec +TSNE with internal NearestNeighbors: 6.365 sec + +Benchmarking on MNIST_10000: +---------------------------- +AnnoyTransformer: 4.457 sec +NMSlibTransformer: 2.080 sec +KNeighborsTransformer: 30.680 sec +TSNE with AnnoyTransformer: 30.225 sec +TSNE with NMSlibTransformer: 43.295 sec +TSNE with KNeighborsTransformer: 64.845 sec +TSNE with internal NearestNeighbors: 64.984 sec +""" +# Author: Tom Dupre la Tour +# +# License: BSD 3 clause +import time +import sys + +try: + import annoy +except ImportError: + print("The package 'annoy' is required to run this example.") + sys.exit() + +try: + import nmslib +except ImportError: + print("The package 'nmslib' is required to run this example.") + sys.exit() + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import NullFormatter +from scipy.sparse import csr_matrix + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.neighbors import KNeighborsTransformer +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.datasets import fetch_openml +from sklearn.pipeline import make_pipeline +from sklearn.manifold import TSNE +from sklearn.utils import shuffle + +print(__doc__) + + +class NMSlibTransformer(TransformerMixin, BaseEstimator): + """Wrapper for using nmslib as sklearn's KNeighborsTransformer""" + + def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph', + n_jobs=1): + self.n_neighbors = n_neighbors + self.method = method + self.metric = metric + self.n_jobs = n_jobs + + def fit(self, X): + self.n_samples_fit_ = X.shape[0] + + # see more metric in the manual + # https://github.com/nmslib/nmslib/tree/master/manual + space = { + 'sqeuclidean': 'l2', + 'euclidean': 'l2', + 'cosine': 'cosinesimil', + 'l1': 'l1', + 'l2': 'l2', + }[self.metric] + + self.nmslib_ = nmslib.init(method=self.method, space=space) + self.nmslib_.addDataPointBatch(X) + self.nmslib_.createIndex() + return self + + def transform(self, X): + n_samples_transform = X.shape[0] + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. 
+ n_neighbors = self.n_neighbors + 1 + + results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, + num_threads=self.n_jobs) + indices, distances = zip(*results) + indices, distances = np.vstack(indices), np.vstack(distances) + + if self.metric == 'sqeuclidean': + distances **= 2 + + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, + n_neighbors) + kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), + indptr), shape=(n_samples_transform, + self.n_samples_fit_)) + + return kneighbors_graph + + +class AnnoyTransformer(TransformerMixin, BaseEstimator): + """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer""" + + def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10, + search_k=-1): + self.n_neighbors = n_neighbors + self.n_trees = n_trees + self.search_k = search_k + self.metric = metric + + def fit(self, X): + self.n_samples_fit_ = X.shape[0] + metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean' + self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric) + for i, x in enumerate(X): + self.annoy_.add_item(i, x.tolist()) + self.annoy_.build(self.n_trees) + return self + + def transform(self, X): + return self._transform(X) + + def fit_transform(self, X, y=None): + return self.fit(X)._transform(X=None) + + def _transform(self, X): + """As `transform`, but handles X is None for faster `fit_transform`.""" + + n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0] + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. + n_neighbors = self.n_neighbors + 1 + + indices = np.empty((n_samples_transform, n_neighbors), + dtype=np.int) + distances = np.empty((n_samples_transform, n_neighbors)) + + if X is None: + for i in range(self.annoy_.get_n_items()): + ind, dist = self.annoy_.get_nns_by_item( + i, n_neighbors, self.search_k, include_distances=True) + + indices[i], distances[i] = ind, dist + else: + for i, x in enumerate(X): + indices[i], distances[i] = self.annoy_.get_nns_by_vector( + x.tolist(), n_neighbors, self.search_k, + include_distances=True) + + if self.metric == 'sqeuclidean': + distances **= 2 + + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, + n_neighbors) + kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), + indptr), shape=(n_samples_transform, + self.n_samples_fit_)) + + return kneighbors_graph + + +def test_transformers(): + """Test that AnnoyTransformer and KNeighborsTransformer give same results + """ + X = np.random.RandomState(42).randn(10, 2) + + knn = KNeighborsTransformer() + Xt0 = knn.fit_transform(X) + + ann = AnnoyTransformer() + Xt1 = ann.fit_transform(X) + + nms = NMSlibTransformer() + Xt2 = nms.fit_transform(X) + + assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5) + assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5) + + +def load_mnist(n_samples): + """Load MNIST, shuffle the data, and return only n_samples.""" + mnist = fetch_openml(data_id=41063) + X, y = shuffle(mnist.data, mnist.target, random_state=42) + return X[:n_samples], y[:n_samples] + + +def run_benchmark(): + datasets = [ + ('MNIST_2000', load_mnist(n_samples=2000)), + ('MNIST_10000', load_mnist(n_samples=10000)), + ] + + n_iter = 500 + perplexity = 30 + # TSNE requires a certain number of neighbors which depends on the + # perplexity parameter. + # Add one since we include each sample as its own neighbor. + n_neighbors = int(3. 
* perplexity + 1) + 1
+
+    transformers = [
+        ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
+                                              metric='sqeuclidean')),
+        ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
+                                                metric='sqeuclidean')),
+        ('KNeighborsTransformer', KNeighborsTransformer(
+            n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
+        ('TSNE with AnnoyTransformer', make_pipeline(
+            AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with NMSlibTransformer', make_pipeline(
+            NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with KNeighborsTransformer', make_pipeline(
+            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
+                                  metric='sqeuclidean'),
+            TSNE(metric='precomputed', perplexity=perplexity,
+                 method="barnes_hut", random_state=42, n_iter=n_iter), )),
+        ('TSNE with internal NearestNeighbors',
+         TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut",
+              random_state=42, n_iter=n_iter)),
+    ]
+
+    # init the plot
+    nrows = len(datasets)
+    ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
+    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
+                             figsize=(5 * ncols, 4 * nrows))
+    axes = axes.ravel()
+    i_ax = 0
+
+    for dataset_name, (X, y) in datasets:
+
+        msg = 'Benchmarking on %s:' % dataset_name
+        print('\n%s\n%s' % (msg, '-' * len(msg)))
+
+        for transformer_name, transformer in transformers:
+            start = time.time()
+            Xt = transformer.fit_transform(X)
+            duration = time.time() - start
+
+            # print the duration report
+            longest = np.max([len(name) for name, model in transformers])
+            whitespaces = ' ' * (longest - len(transformer_name))
+            print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))
+
+            # plot TSNE embedding which should be very similar across methods
+            if 'TSNE' in transformer_name:
+                axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
+                axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y, alpha=0.2,
+                                   cmap=plt.cm.viridis)
+                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
+                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
+                axes[i_ax].axis('tight')
+                i_ax += 1
+
+    fig.tight_layout()
+    plt.show()
+
+
+if __name__ == '__main__':
+    test_transformers()
+    run_benchmark()
diff --git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py
new file mode 100644
index 0000000000000..9fecea09f6b78
--- /dev/null
+++ b/examples/neighbors/plot_caching_nearest_neighbors.py
@@ -0,0 +1,60 @@
+"""
+=========================
+Caching nearest neighbors
+=========================
+
+This example demonstrates how to precompute the k nearest neighbors before
+using them in KNeighborsClassifier. KNeighborsClassifier can compute the
+nearest neighbors internally, but precomputing them can have several benefits,
+such as finer parameter control, caching for multiple uses, or custom
+implementations.
+
+Here we use the caching property of pipelines to cache the nearest neighbors
+graph between multiple fits of KNeighborsClassifier. The first call is slow
+since it computes the neighbors graph, while subsequent calls are faster as they
+do not need to recompute the graph. 
Here the durations are small since the +dataset is small, but the gain can be more substantial when the dataset grows +larger, or when the grid of parameter to search is large. +""" +# Author: Tom Dupre la Tour +# +# License: BSD 3 clause +import matplotlib.pyplot as plt + +from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.datasets import load_digits +from sklearn.pipeline import Pipeline + +print(__doc__) + +X, y = load_digits(return_X_y=True) +n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9] + +# The transformer computes the nearest neighbors graph using the maximum number +# of neighbors necessary in the grid search. The classifier model filters the +# nearest neighbors graph as required by its own n_neighbors parameter. +graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), + mode='distance') +classifier_model = KNeighborsClassifier(metric='precomputed') + +# Note that we give `memory` a directory to cache the graph computation. +full_model = Pipeline( + steps=[('graph', graph_model), ('classifier', classifier_model)], + memory='./cache') + +param_grid = {'classifier__n_neighbors': n_neighbors_list} +grid_model = GridSearchCV(full_model, param_grid) +grid_model.fit(X, y) + +# Plot the results of the grid search. +fig, axes = plt.subplots(1, 2, figsize=(8, 4)) +axes[0].errorbar(x=n_neighbors_list, + y=grid_model.cv_results_['mean_test_score'], + yerr=grid_model.cv_results_['std_test_score']) +axes[0].set(xlabel='n_neighbors', title='Classification accuracy') +axes[1].errorbar(x=n_neighbors_list, y=grid_model.cv_results_['mean_fit_time'], + yerr=grid_model.cv_results_['std_fit_time'], color='r') +axes[1].set(xlabel='n_neighbors', title='Fit time (with caching)') +fig.tight_layout() +plt.show() diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 38d06d1c244b7..9de22673606f2 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -3,10 +3,10 @@ Neighborhood Components Analysis Illustration ============================================= -An example illustrating the goal of learning a distance metric that maximizes -the nearest neighbors classification accuracy. The example is solely for -illustration purposes. Please refer to the :ref:`User Guide ` for -more information. +This example illustrates a learned distance metric that maximizes +the nearest neighbors classification accuracy. It provides a visual +representation of this metric compared to the original point +space. Please refer to the :ref:`User Guide ` for more information. """ # License: BSD 3 clause @@ -20,23 +20,31 @@ print(__doc__) -random_state = 0 +############################################################################## +# Original points +# --------------- +# First we create a data set of 9 samples from 3 classes, and plot the points +# in the original space. For this example, we focus on the classification of +# point no. 3. The thickness of a link between point no. 3 and another point +# is proportional to their distance. 
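(Aside, not part of this patch.) The link thickness described in the comment block above is computed by the ``link_thickness_i`` helper further down in this file as a softmax over negative squared distances, so the weight of a link shrinks as the corresponding distance grows. Below is a minimal standalone sketch of that weighting; the name ``softmax_link_weights`` and the exact numerical stabilisation are illustrative assumptions, since the helper's full body is not shown in this hunk.

import numpy as np

def softmax_link_weights(X, i):
    # squared euclidean distances from point i to every other point
    diff = X[i] - X
    sq_dist = np.einsum('ij,ij->i', diff, diff)
    sq_dist[i] = np.inf                        # a point is not linked to itself
    logits = -sq_dist
    weights = np.exp(logits - logits.max())    # stabilised exponentiation
    return weights / weights.sum()             # weights sum to one
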
-# Create a tiny data set of 9 samples from 3 classes X, y = make_classification(n_samples=9, n_features=2, n_informative=2, n_redundant=0, n_classes=3, n_clusters_per_class=1, - class_sep=1.0, random_state=random_state) + class_sep=1.0, random_state=0) -# Plot the points in the original space -plt.figure() +plt.figure(1) ax = plt.gca() - -# Draw the graph nodes for i in range(X.shape[0]): ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) -def p_i(X, i): +ax.set_title("Original points") +ax.axes.get_xaxis().set_visible(False) +ax.axes.get_yaxis().set_visible(False) +ax.axis('equal') # so that boundaries are displayed correctly as circles + + +def link_thickness_i(X, i): diff_embedded = X[i] - X dist_embedded = np.einsum('ij,ij->i', diff_embedded, diff_embedded) @@ -52,34 +60,30 @@ def p_i(X, i): def relate_point(X, i, ax): pt_i = X[i] for j, pt_j in enumerate(X): - thickness = p_i(X, i) + thickness = link_thickness_i(X, i) if i != j: line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]]) ax.plot(*line, c=cm.Set1(y[j]), linewidth=5*thickness[j]) -# we consider only point 3 i = 3 - -# Plot bonds linked to sample i in the original space relate_point(X, i, ax) -ax.set_title("Original points") -ax.axes.get_xaxis().set_visible(False) -ax.axes.get_yaxis().set_visible(False) -ax.axis('equal') +plt.show() -# Learn an embedding with NeighborhoodComponentsAnalysis -nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) +############################################################################## +# Learning an embedding +# --------------------- +# We use :class:`~sklearn.neighbors.NeighborhoodComponentsAnalysis` to learn an +# embedding and plot the points after the transformation. We then take the +# embedding and find the nearest neighbors. + +nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=0) nca = nca.fit(X, y) -# Plot the points after transformation with NeighborhoodComponentsAnalysis -plt.figure() +plt.figure(2) ax2 = plt.gca() - -# Get the embedding and find the new nearest neighbors X_embedded = nca.transform(X) - relate_point(X_embedded, i, ax2) for i in range(len(X)): @@ -88,7 +92,6 @@ def relate_point(X, i, ax): ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) -# Make axes equal so that boundaries are displayed correctly as circles ax2.set_title("NCA embedding") ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_yaxis().set_visible(False) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index fbbd912592979..9b6467d170e70 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -51,8 +51,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square. X may be a sparse matrix, in which case only "nonzero" - elements may be considered neighbors for DBSCAN. + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional Additional keyword arguments for the metric function. @@ -172,8 +172,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square. 
X may be a sparse matrix, in which case only "nonzero" - elements may be considered neighbors for DBSCAN. + must be square. X may be a :term:`Glossary `, in which + case only "nonzero" elements may be considered neighbors for DBSCAN. .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. @@ -319,32 +319,20 @@ def fit(self, X, y=None, sample_weight=None): # point in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) if self.metric == 'precomputed' and sparse.issparse(X): - neighborhoods = np.empty(X.shape[0], dtype=object) - X.sum_duplicates() # XXX: modifies X's internals in-place - # set the diagonal to explicit values, as a point is its own # neighbor with warnings.catch_warnings(): warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning) X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place - X_mask = X.data <= self.eps - masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] - masked_indptr = np.concatenate(([0], np.cumsum(X_mask))) - masked_indptr = masked_indptr[X.indptr[1:-1]] - - # split into rows - neighborhoods[:] = np.split(masked_indices, masked_indptr) - else: - neighbors_model = NearestNeighbors( - radius=self.eps, algorithm=self.algorithm, - leaf_size=self.leaf_size, metric=self.metric, - metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs - ) - neighbors_model.fit(X) - # This has worst case O(n^2) memory complexity - neighborhoods = neighbors_model.radius_neighbors( - X, self.eps, return_distance=False) + neighbors_model = NearestNeighbors( + radius=self.eps, algorithm=self.algorithm, + leaf_size=self.leaf_size, metric=self.metric, + metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs) + neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity + neighborhoods = neighbors_model.radius_neighbors(X, + return_distance=False) if sample_weight is None: n_neighbors = np.array([len(neighbors) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 207dfaeb08974..588742613938d 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -13,7 +13,7 @@ from ..utils import check_random_state, as_float_array from ..utils.validation import check_array from ..metrics.pairwise import pairwise_kernels -from ..neighbors import kneighbors_graph +from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding from .k_means_ import k_means @@ -326,10 +326,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. Ignored for ``affinity='nearest_neighbors'``. - affinity : string, array-like or callable, default 'rbf' - If a string, this may be one of 'nearest_neighbors', 'precomputed', - 'rbf' or one of the kernels supported by - `sklearn.metrics.pairwise_kernels`. + affinity : string or callable, default 'rbf' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix using a radial basis function + (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. + - one of the kernels supported by + :func:`~sklearn.metrics.pairwise_kernels`. 
Only kernels that produce similarity scores (non-negative values that increase with similarity) should be used. This property is not checked @@ -468,7 +476,9 @@ def fit(self, X, y=None): """ X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2) - if X.shape[0] == X.shape[1] and self.affinity != "precomputed": + allow_squared = self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] + if X.shape[0] == X.shape[1] and not allow_squared: warnings.warn("The spectral clustering API has changed. ``fit``" "now constructs an affinity matrix from data. To use" " a custom affinity matrix, " @@ -479,6 +489,12 @@ def fit(self, X, y=None): include_self=True, n_jobs=self.n_jobs) self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == 'precomputed_nearest_neighbors': + estimator = NearestNeighbors(n_neighbors=self.n_neighbors, + n_jobs=self.n_jobs, + metric="precomputed").fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode='connectivity') + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) elif self.affinity == 'precomputed': self.affinity_matrix_ = X else: @@ -530,4 +546,5 @@ def fit_predict(self, X, y=None): @property def _pairwise(self): - return self.affinity == "precomputed" + return self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index e74120ecb9c03..17f85b4fb0fbf 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -95,6 +95,23 @@ def test_dbscan_sparse_precomputed(include_self): assert_array_equal(labels_dense, labels_sparse) +def test_dbscan_sparse_precomputed_different_eps(): + # test that precomputed neighbors graph is filtered if computed with + # a radius larger than DBSCAN's eps. 
+ lower_eps = 0.2 + nn = NearestNeighbors(radius=lower_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode='distance') + dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed') + + higher_eps = lower_eps + 0.7 + nn = NearestNeighbors(radius=higher_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode='distance') + dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed') + + assert_array_equal(dbscan_lower[0], dbscan_higher[0]) + assert_array_equal(dbscan_lower[1], dbscan_higher[1]) + + @pytest.mark.parametrize('use_sparse', [True, False]) @pytest.mark.parametrize('metric', ['precomputed', 'minkowski']) def test_dbscan_input_not_modified(use_sparse, metric): diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 9ea9cfa7df9b8..dd1a1227a8f09 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -17,6 +17,7 @@ from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel +from sklearn.neighbors import NearestNeighbors from sklearn.datasets.samples_generator import make_blobs try: @@ -102,6 +103,25 @@ def test_spectral_clustering_sparse(): assert adjusted_rand_score(y, labels) == 1 +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + X, y = make_blobs(n_samples=200, random_state=0, + centers=[[1, 1], [-1, -1]], cluster_std=0.01) + + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors( + n_neighbors=n_neighbors + additional_neighbors).fit(X) + graph = nn.kneighbors_graph(X, mode='connectivity') + labels = SpectralClustering(random_state=0, n_clusters=2, + affinity='precomputed_nearest_neighbors', + n_neighbors=n_neighbors).fit(graph).labels_ + results.append(labels) + + assert_array_equal(results[0], results[1]) + + def test_affinities(): # Note: in the following, random_state has been selected to have # a dataset that yields a stable eigen decomposition both when built diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index f42eb9d83798c..d7e5ee52db7eb 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -91,7 +91,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, n_clusters_per_class : int, optional (default=2) The number of clusters per class. - weights : list of floats or None (default=None) + weights : array-like of shape (n_classes,) or (n_classes - 1,),\ + (default=None) The proportions of samples assigned to each class. If None, then classes are balanced. Note that if ``len(weights) == n_classes - 1``, then the last class weight is automatically inferred. 
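(Aside, not part of this patch.) A minimal sketch of how the ``weights`` parameter behaves with the change in the hunk below: lists and ndarrays are both accepted, a length of ``n_classes - 1`` lets the last class weight be inferred, and an incompatible length raises ``ValueError``. The class counts quoted in the comments are approximate because of the default ``flip_y``.

import numpy as np
from collections import Counter
from sklearn.datasets import make_classification

# len(weights) == n_classes - 1: the last class weight is inferred (here 0.5)
X, y = make_classification(n_samples=1000, n_classes=3, n_informative=3,
                           weights=[0.2, 0.3], random_state=0)
print(Counter(y))            # roughly 200 / 300 / 500 samples per class

# an ndarray with the same values yields the same draw as the list
X2, y2 = make_classification(n_samples=1000, n_classes=3, n_informative=3,
                             weights=np.array([0.2, 0.3]), random_state=0)
assert np.array_equal(y, y2)

# an incompatible length still raises the documented error
try:
    make_classification(weights=[0.2, 0.3, 0.4, 0.1])
except ValueError as exc:
    print(exc)               # Weights specified but incompatible with ...
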
@@ -160,22 +161,27 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, " features") # Use log2 to avoid overflow errors if n_informative < np.log2(n_classes * n_clusters_per_class): - raise ValueError("n_classes * n_clusters_per_class must" - " be smaller or equal 2 ** n_informative") - if weights and len(weights) not in [n_classes, n_classes - 1]: - raise ValueError("Weights specified but incompatible with number " - "of classes.") + msg = "n_classes({}) * n_clusters_per_class({}) must be" + msg += " smaller or equal 2**n_informative({})={}" + raise ValueError(msg.format(n_classes, n_clusters_per_class, + n_informative, 2**n_informative)) + + if weights is not None: + if len(weights) not in [n_classes, n_classes - 1]: + raise ValueError("Weights specified but incompatible with number " + "of classes.") + if len(weights) == n_classes - 1: + if isinstance(weights, list): + weights = weights + [1.0 - sum(weights)] + else: + weights = np.resize(weights, n_classes) + weights[-1] = 1.0 - sum(weights[:-1]) + else: + weights = [1.0 / n_classes] * n_classes n_useless = n_features - n_informative - n_redundant - n_repeated n_clusters = n_classes * n_clusters_per_class - if weights and len(weights) == (n_classes - 1): - weights = weights + [1.0 - sum(weights)] - - if weights is None: - weights = [1.0 / n_classes] * n_classes - weights[-1] = 1.0 - sum(weights[:-1]) - # Distribute samples among clusters by weight n_samples_per_cluster = [ int(n_samples * weights[k % n_classes] / n_clusters_per_class) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index ecd7e7cba1ef1..f10fd54dc681e 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -146,6 +146,36 @@ def test_make_classification_informative_features(): n_clusters_per_class=2) +@pytest.mark.parametrize( + 'weights, err_type, err_msg', + [ + ([], ValueError, + "Weights specified but incompatible with number of classes."), + ([.25, .75, .1], ValueError, + "Weights specified but incompatible with number of classes."), + (np.array([]), ValueError, + "Weights specified but incompatible with number of classes."), + (np.array([.25, .75, .1]), ValueError, + "Weights specified but incompatible with number of classes."), + (np.random.random(3), ValueError, + "Weights specified but incompatible with number of classes.") + ] +) +def test_make_classification_weights_type(weights, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + make_classification(weights=weights) + + +@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}]) +def test_make_classification_weights_array_or_list_ok(kwargs): + X1, y1 = make_classification(weights=[.1, .9], + random_state=0, **kwargs) + X2, y2 = make_classification(weights=np.array([.1, .9]), + random_state=0, **kwargs) + assert_almost_equal(X1, X2) + assert_almost_equal(y1, y2) + + def test_make_multilabel_classification_return_sequences(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=100, n_features=20, diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index e8e8f46e2dec1..3eadb76b9f744 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -18,6 +18,8 @@ from .gradient_boosting import GradientBoostingRegressor from .voting import VotingClassifier from .voting import VotingRegressor +from ._stacking import StackingClassifier 
+from ._stacking import StackingRegressor from . import bagging from . import forest @@ -32,5 +34,6 @@ "BaggingRegressor", "IsolationForest", "GradientBoostingClassifier", "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "VotingRegressor", + "StackingClassifier", "StackingRegressor", "bagging", "forest", "gradient_boosting", "partial_dependence", "weight_boosting"] diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py new file mode 100644 index 0000000000000..c2a09c54b4622 --- /dev/null +++ b/sklearn/ensemble/_stacking.py @@ -0,0 +1,704 @@ +"""Stacking classifier and regressor.""" + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from copy import deepcopy + +import numpy as np +from joblib import Parallel, delayed + +from ..base import clone +from ..base import ClassifierMixin, RegressorMixin, TransformerMixin +from ..base import is_classifier, is_regressor +from ..base import MetaEstimatorMixin + +from .base import _parallel_fit_estimator + +from ..linear_model import LogisticRegression +from ..linear_model import RidgeCV + +from ..model_selection import cross_val_predict +from ..model_selection import check_cv + +from ..preprocessing import LabelEncoder + +from ..utils import Bunch +from ..utils.metaestimators import _BaseComposition +from ..utils.metaestimators import if_delegate_has_method +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted +from ..utils.validation import column_or_1d + + +class _BaseStacking(TransformerMixin, MetaEstimatorMixin, _BaseComposition, + metaclass=ABCMeta): + """Base class for stacking method.""" + _required_parameters = ['estimators'] + + @abstractmethod + def __init__(self, estimators, final_estimator=None, cv=None, + stack_method='auto', n_jobs=None, verbose=0): + self.estimators = estimators + self.final_estimator = final_estimator + self.cv = cv + self.stack_method = stack_method + self.n_jobs = n_jobs + self.verbose = verbose + + @abstractmethod + def _validate_estimators(self): + if self.estimators is None or len(self.estimators) == 0: + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a list" + " of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + self._validate_names(names) + return names, estimators + + def _clone_final_estimator(self, default): + if self.final_estimator is not None: + self.final_estimator_ = clone(self.final_estimator) + else: + self.final_estimator_ = clone(default) + + def set_params(self, **params): + """Set the parameters for the stacking estimator. + + Valid parameter keys can be listed with `get_params()`. + + Parameters + ---------- + params : keyword arguments + Specific parameters using e.g. + `set_params(parameter_name=new_value)`. In addition, to setting the + parameters of the stacking estimator, the individual estimator of + the stacking estimators can also be set, or can be removed by + setting them to 'drop'. + + Examples + -------- + # In this example, the RandomForestClassifier is removed + clf1 = LogisticRegression() + clf2 = RandomForestClassifier() + eclf = StackingClassifier(estimators=[('lr', clf1), ('rf', clf2)] + eclf.set_params(rf='drop') + """ + super()._set_params('estimators', **params) + return self + + def get_params(self, deep=True): + """Get the parameters of the stacking estimator. 
+
+        Parameters
+        ----------
+        deep : bool
+            Setting it to True gets the various classifiers and the parameters
+            of the classifiers as well.
+        """
+        return super()._get_params('estimators', deep=deep)
+
+    def _concatenate_predictions(self, predictions):
+        """Concatenate the predictions of each first layer learner.
+
+        This helper is in charge of ensuring the predictions are 2D arrays
+        and it will drop one of the probability columns when using
+        probabilities in the binary case, since p(y|c=0) = 1 - p(y|c=1).
+        """
+        X_meta = []
+        for est_idx, preds in enumerate(predictions):
+            # case where the estimator returned a 1D array
+            if preds.ndim == 1:
+                X_meta.append(preds.reshape(-1, 1))
+            else:
+                if (self.stack_method_[est_idx] == 'predict_proba' and
+                        len(self.classes_) == 2):
+                    # Remove the first column when using probabilities in
+                    # binary classification because both features are perfectly
+                    # collinear.
+                    X_meta.append(preds[:, 1:])
+                else:
+                    X_meta.append(preds)
+        return np.concatenate(X_meta, axis=1)
+
+    @staticmethod
+    def _method_name(name, estimator, method):
+        if estimator == 'drop':
+            return None
+        if method == 'auto':
+            if getattr(estimator, 'predict_proba', None):
+                return 'predict_proba'
+            elif getattr(estimator, 'decision_function', None):
+                return 'decision_function'
+            else:
+                return 'predict'
+        else:
+            if not hasattr(estimator, method):
+                raise ValueError('Underlying estimator {} does not implement '
+                                 'the method {}.'.format(name, method))
+            return method
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit the estimators.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples,) or None
+            Sample weights. If None, then samples are equally weighted.
+            Note that this is supported only if all underlying estimators
+            support sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        # all_estimators contains all estimators, both the ones to be fitted
+        # and the 'drop' placeholders.
+        names, all_estimators = self._validate_estimators()
+        self._validate_final_estimator()
+
+        has_estimator = any(est != 'drop' for est in all_estimators)
+        if not has_estimator:
+            raise ValueError(
+                "All estimators are dropped. At least one is required "
+                "to be an estimator."
+            )
+
+        stack_method = [self.stack_method] * len(all_estimators)
+
+        # Fit the base estimators on the whole training data. Those
+        # base estimators will be used in transform, predict, and
+        # predict_proba. They are exposed publicly.
+        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
+            delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight)
+            for est in all_estimators if est != 'drop'
+        )
+
+        self.named_estimators_ = Bunch()
+        est_fitted_idx = 0
+        for name_est, org_est in zip(names, all_estimators):
+            if org_est != 'drop':
+                self.named_estimators_[name_est] = self.estimators_[
+                    est_fitted_idx]
+                est_fitted_idx += 1
+
+        # To train the meta-classifier using as much data as possible, we use
+        # cross-validation to obtain the output of the stacked estimators.
+
+        # To ensure that the data provided to each estimator are the same, we
+        # need to set the random state of the cv if there is one and we need to
+        # take a copy.
+ cv = check_cv(self.cv, y=y, classifier=is_classifier(self)) + if hasattr(cv, 'random_state') and cv.random_state is None: + cv.random_state = np.random.RandomState() + + self.stack_method_ = [ + self._method_name(name, est, meth) + for name, est, meth in zip(names, all_estimators, stack_method) + ] + + predictions = Parallel(n_jobs=self.n_jobs)( + delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), + method=meth, n_jobs=self.n_jobs, + verbose=self.verbose) + for est, meth in zip(all_estimators, self.stack_method_) + if est != 'drop' + ) + + # Only not None or not 'drop' estimators will be used in transform. + # Remove the None from the method as well. + self.stack_method_ = [ + meth for (meth, est) in zip(self.stack_method_, all_estimators) + if est != 'drop' + ] + + X_meta = self._concatenate_predictions(predictions) + if sample_weight is not None: + try: + self.final_estimator_.fit( + X_meta, y, sample_weight=sample_weight + ) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample " + "weights." + .format(self.final_estimator_.__class__.__name__) + ) from exc + raise + else: + self.final_estimator_.fit(X_meta, y) + + return self + + def _transform(self, X): + """Concatenate and return the predictions of the estimators.""" + check_is_fitted(self) + predictions = [ + getattr(est, meth)(X) + for est, meth in zip(self.estimators_, self.stack_method_) + if est != 'drop' + ] + return self._concatenate_predictions(predictions) + + @if_delegate_has_method(delegate='final_estimator_') + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + accounts for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + + check_is_fitted(self) + return self.final_estimator_.predict( + self.transform(X), **predict_params + ) + + +class StackingClassifier(ClassifierMixin, _BaseStacking): + """Stack of estimators with a final classifier. + + Stacked generalization consists in stacking the output of individual + estimator and use a classifier to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + .. versionadded:: 0.22 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A classifier which will be used to combine the base estimators. + The default classifier is a `LogisticRegression`. 
+ + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, `StratifiedKFold` is used. In all other + cases, `KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \ + default='auto' + Methods called for each base estimator. It can be: + + * if 'auto', it will try to invoke, for each estimator, + `'predict_proba'`, `'decision_function'` or `'predict'` in that + order. + * otherwise, one of `'predict_proba'`, `'decision_function'` or + `'predict'`. If the method is not implemented by the estimator, it + will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel all `estimators` `fit`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. + + named_estimators_ : Bunch + Attribute to access any fitted sub-estimators by name. + + final_estimator_ : estimator + The classifier which predicts given the output of `estimators_`. + + stack_method_ : list of str + The method used by each base estimator. + + Notes + ----- + When `predict_proba` is used by each estimator (i.e. most of the time for + `stack_method='auto'` or specifically for `stack_method='predict_proba'`), + The first column predicted by each estimator will be dropped in the case + of a binary classification problem. Indeed, both feature will be perfectly + collinear. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.svm import LinearSVC + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.ensemble import StackingClassifier + >>> X, y = load_iris(return_X_y=True) + >>> estimators = [ + ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), + ... ('svr', make_pipeline(StandardScaler(), + ... LinearSVC(random_state=42))) + ... ] + >>> clf = StackingClassifier( + ... estimators=estimators, final_estimator=LogisticRegression() + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> clf.fit(X_train, y_train).score(X_test, y_test) + 0.9... 
+ + """ + def __init__(self, estimators, final_estimator=None, cv=None, + stack_method='auto', n_jobs=None, verbose=0): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method=stack_method, + n_jobs=n_jobs, + verbose=verbose + ) + + def _validate_estimators(self): + names, estimators = super()._validate_estimators() + for est in estimators: + if est != 'drop' and not is_classifier(est): + raise ValueError( + "The estimator {} should be a classifier." + .format(est.__class__.__name__) + ) + return names, estimators + + def _validate_final_estimator(self): + self._clone_final_estimator(default=LogisticRegression()) + if not is_classifier(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a classifier. Got {}" + .format(self.final_estimator_) + ) + + def fit(self, X, y, sample_weight=None): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if all underlying estimators + support sample weights. + + Returns + ------- + self : object + """ + check_classification_targets(y) + self._le = LabelEncoder().fit(y) + self.classes_ = self._le.classes_ + return super().fit(X, self._le.transform(y), sample_weight) + + @if_delegate_has_method(delegate='final_estimator_') + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + accounts for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + y_pred = super().predict(X, **predict_params) + return self._le.inverse_transform(y_pred) + + @if_delegate_has_method(delegate='final_estimator_') + def predict_proba(self, X): + """Predict class probabilities for X using + `final_estimator_.predict_proba`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) or \ + list of ndarray of shape (n_output,) + The class probabilities of the input samples. + """ + check_is_fitted(self) + return self.final_estimator_.predict_proba(self.transform(X)) + + @if_delegate_has_method(delegate='final_estimator_') + def decision_function(self, X): + """Predict decision function for samples in X using + `final_estimator_.decision_function`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. 
+ + Returns + ------- + decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \ + or (n_samples, n_classes * (n_classes-1) / 2) + The decision function computed the final estimator. + """ + check_is_fitted(self) + return self.final_estimator_.decision_function(self.transform(X)) + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) or \ + (n_samples, n_classes * n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + +class StackingRegressor(RegressorMixin, _BaseStacking): + """Stack of estimators with a final regressor. + + Stacked generalization consists in stacking the output of individual + estimator and use a regressor to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + .. versionadded:: 0.22 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A regressor which will be used to combine the base estimators. + The default regressor is a `RidgeCV`. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, `StratifiedKFold` is used. In all other + cases, `KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + estimators_ : list of estimator + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. + + named_estimators_ : Bunch + Attribute to access any fitted sub-estimators by name. + + final_estimator_ : estimator + The regressor to stacked the base estimators fitted. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. 
+ + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> from sklearn.svm import LinearSVR + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> estimators = [ + ... ('lr', RidgeCV()), + ... ('svr', LinearSVR(random_state=42)) + ... ] + >>> reg = StackingRegressor( + ... estimators=estimators, + ... final_estimator=RandomForestRegressor(n_estimators=10, + ... random_state=42) + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=42 + ... ) + >>> reg.fit(X_train, y_train).score(X_test, y_test) + 0.3... + + """ + def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None, + verbose=0): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method="predict", + n_jobs=n_jobs, + verbose=verbose + ) + + def _validate_estimators(self): + names, estimators = super()._validate_estimators() + for est in estimators: + if est != 'drop' and not is_regressor(est): + raise ValueError( + "The estimator {} should be a regressor." + .format(est.__class__.__name__) + ) + return names, estimators + + def _validate_final_estimator(self): + self._clone_final_estimator(default=RidgeCV()) + if not is_regressor(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a regressor. Got {}" + .format(self.final_estimator_) + ) + + def fit(self, X, y, sample_weight=None): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if all underlying estimators + support sample weights. + + Returns + ------- + self : object + """ + y = column_or_1d(y, warn=True) + return super().fit(X, y, sample_weight) + + def transform(self, X): + """Return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py index 36c7b1067c381..b266c38bcbfaa 100644 --- a/sklearn/ensemble/base.py +++ b/sklearn/ensemble/base.py @@ -19,6 +19,23 @@ MAX_RAND_SEED = np.iinfo(np.int32).max +def _parallel_fit_estimator(estimator, X, y, sample_weight=None): + """Private function used to fit an estimator within a job.""" + if sample_weight is not None: + try: + estimator.fit(X, y, sample_weight=sample_weight) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample weights." 
+ .format(estimator.__class__.__name__) + ) from exc + raise + else: + estimator.fit(X, y) + return estimator + + def _set_random_states(estimator, random_state=None): """Sets fixed random_state parameters for an estimator diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py new file mode 100644 index 0000000000000..3a61456a5665e --- /dev/null +++ b/sklearn/ensemble/tests/test_stacking.py @@ -0,0 +1,492 @@ +"""Test the stacking classifier and regressor.""" + +# Authors: Guillaume Lemaitre +# License: BSD 3 clause + +import pytest +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import RegressorMixin +from sklearn.base import clone + +from sklearn.exceptions import ConvergenceWarning + +from sklearn.datasets import load_iris +from sklearn.datasets import load_diabetes +from sklearn.datasets import load_breast_cancer + +from sklearn.dummy import DummyClassifier +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LinearRegression +from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVR +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor +from sklearn.preprocessing import scale + +from sklearn.ensemble import StackingClassifier +from sklearn.ensemble import StackingRegressor + +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import KFold + +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import ignore_warnings +from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.estimator_checks import check_no_attributes_set_in_init + +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_iris, y_iris = load_iris(return_X_y=True) + + +@pytest.mark.parametrize( + "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator", [None, RandomForestClassifier(random_state=42)] +) +def test_stacking_classifier_iris(cv, final_estimator): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())] + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator, cv=cv + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 6 + + clf.set_params(lr='drop') + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + if final_estimator is None: + # LogisticRegression has decision_function method + clf.decision_function(X_test) + + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 3 + + +def test_stacking_classifier_drop_column_binary_classification(): + # check that a column is dropped in binary classification + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, _ = train_test_split( + scale(X), y, stratify=y, random_state=42 + ) + + # both classifiers implement 'predict_proba' and will both drop one column + estimators = 
[('lr', LogisticRegression()), + ('rf', RandomForestClassifier(random_state=42))] + clf = StackingClassifier(estimators=estimators, cv=3) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + # LinearSVC does not implement 'predict_proba' and will not drop one column + estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())] + clf.set_params(estimators=estimators) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + +def test_stacking_classifier_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=[('svc', LinearSVC(random_state=0))], + final_estimator=rf, cv=5 + ) + clf_drop = StackingClassifier( + estimators=estimators, final_estimator=rf, cv=5 + ) + + clf.fit(X_train, y_train) + clf_drop.fit(X_train, y_train) + assert_allclose(clf.predict(X_test), clf_drop.predict(X_test)) + assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test)) + assert_allclose(clf.transform(X_test), clf_drop.transform(X_test)) + + +def test_stacking_regressor_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + reg = StackingRegressor( + estimators=[('svr', LinearSVR(random_state=0))], + final_estimator=rf, cv=5 + ) + reg_drop = StackingRegressor( + estimators=estimators, final_estimator=rf, cv=5 + ) + + reg.fit(X_train, y_train) + reg_drop.fit(X_train, y_train) + assert_allclose(reg.predict(X_test), reg_drop.predict(X_test)) + assert_allclose(reg.transform(X_test), reg_drop.transform(X_test)) + + +@pytest.mark.parametrize( + "cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator, predict_params", + [(None, {}), + (RandomForestRegressor(random_state=42), {}), + (DummyRegressor(), {'return_std': True})] +) +def test_stacking_regressor_diabetes(cv, final_estimator, predict_params): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [('lr', LinearRegression()), ('svr', LinearSVR())] + reg = StackingRegressor( + estimators=estimators, final_estimator=final_estimator, cv=cv + ) + reg.fit(X_train, y_train) + result = reg.predict(X_test, **predict_params) + expected_result_length = 2 if predict_params else 1 + if predict_params: + assert len(result) == expected_result_length + + X_trans = reg.transform(X_test) + assert X_trans.shape[1] == 2 + + reg.set_params(lr='drop') + reg.fit(X_train, y_train) + reg.predict(X_test) + + X_trans = reg.transform(X_test) + assert X_trans.shape[1] == 1 + + +def test_stacking_classifier_drop_binary_prob(): + # check that classifier will drop one of the probability column for + # binary classification problem + + # Select only the 2 first classes + X_, y_ = scale(X_iris[:100]), y_iris[:100] + + estimators = [ + ('lr', 
LogisticRegression()), ('rf', RandomForestClassifier()) + ] + clf = StackingClassifier(estimators=estimators) + clf.fit(X_, y_) + X_meta = clf.transform(X_) + assert X_meta.shape[1] == 2 + + +class NoWeightRegressor(BaseEstimator, RegressorMixin): + def fit(self, X, y): + self.reg = DummyRegressor() + return self.reg.fit(X, y) + + def predict(self, X): + return np.ones(X.shape[0]) + + +class NoWeightClassifier(BaseEstimator, ClassifierMixin): + def fit(self, X, y): + self.clf = DummyClassifier() + return self.clf.fit(X, y) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [(y_iris, + {'estimators': None}, + ValueError, "Invalid 'estimators' attribute,"), + (y_iris, + {'estimators': []}, + ValueError, "Invalid 'estimators' attribute,"), + (y_iris, + {'estimators': [('lr', LinearRegression()), + ('svm', LinearSVC(max_iter=5e4))]}, + ValueError, 'should be a classifier'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('svm', SVC(max_iter=5e4))], + 'stack_method': 'predict_proba'}, + ValueError, 'does not implement the method predict_proba'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('cor', NoWeightClassifier())]}, + TypeError, 'does not support sample weight'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), + ('cor', LinearSVC(max_iter=5e4))], + 'final_estimator': NoWeightClassifier()}, + TypeError, 'does not support sample weight'), + (y_iris, + {'estimators': [('lr', 'drop'), ('svm', 'drop')]}, + ValueError, 'All estimators are dropped'), + (y_iris, + {'estimators': [('lr', LogisticRegression()), ('svm', LinearSVC())], + 'final_estimator': RandomForestRegressor()}, + ValueError, 'parameter should be a classifier.')] +) +def test_stacking_classifier_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + clf = StackingClassifier(**params, cv=3) + clf.fit( + scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]) + ) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [(y_diabetes, + {'estimators': None}, + ValueError, "Invalid 'estimators' attribute,"), + (y_diabetes, + {'estimators': []}, + ValueError, "Invalid 'estimators' attribute,"), + (y_diabetes, + {'estimators': [('lr', LogisticRegression()), ('svm', LinearSVR())]}, + ValueError, 'should be a regressor'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), + ('cor', NoWeightRegressor())]}, + TypeError, 'does not support sample weight'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), + ('cor', LinearSVR())], + 'final_estimator': NoWeightRegressor()}, + TypeError, 'does not support sample weight'), + (y_diabetes, + {'estimators': [('lr', 'drop'), ('svm', 'drop')]}, + ValueError, 'All estimators are dropped'), + (y_diabetes, + {'estimators': [('lr', LinearRegression()), ('svm', LinearSVR())], + 'final_estimator': RandomForestClassifier()}, + ValueError, 'parameter should be a regressor.')] +) +def test_stacking_regressor_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + reg = StackingRegressor(**params, cv=3) + reg.fit( + scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0]) + ) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(max_iter=1e4))])] +) +def test_stacking_named_estimators(stacking_estimator): + stacking_estimator.fit(scale(X_iris), y_iris) + estimators = 
stacking_estimator.named_estimators_ + assert len(estimators) == 2 + assert sorted(list(estimators.keys())) == sorted(['lr', 'svm']) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('rf', RandomForestClassifier()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('rf', RandomForestRegressor()), + ('svm', LinearSVR(max_iter=1e4))])] +) +def test_stacking_named_estimators_dropped(stacking_estimator): + stacking_estimator.set_params(rf='drop') + stacking_estimator.fit(scale(X_iris), y_iris) + estimators = stacking_estimator.named_estimators_ + assert 'rf' not in estimators.keys() + assert len(estimators) == 2 + assert sorted(list(estimators.keys())) == sorted(['lr', 'svm']) + + +@pytest.mark.parametrize( + "stacking_estimator", + [StackingClassifier(estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC())]), + StackingRegressor(estimators=[('lr', LinearRegression()), + ('svm', LinearSVR())])] +) +def test_stacking_set_get_params(stacking_estimator): + params = stacking_estimator.get_params() + assert 'lr' in list(params.keys()) + assert 'svm' in list(params.keys()) + + stacking_estimator.set_params(lr='drop') + params = stacking_estimator.get_params() + assert params['lr'] == 'drop' + + +@pytest.mark.parametrize( + "estimator, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('svm', LinearSVC(random_state=0))]), + X_iris[:100], y_iris[:100]), # keep only classes 0 and 1 + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=0))]), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_randomness(estimator, X, y): + # checking that fixing the random state of the CV will lead to the same + # results + estimator_full = clone(estimator) + estimator_full.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + estimator_drop = clone(estimator) + estimator_drop.set_params(lr='drop') + estimator_drop.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + assert_allclose( + estimator_full.fit(X, y).transform(X)[:, 1:], + estimator_drop.fit(X, y).transform(X) + ) + + +# These warnings are raised due to _BaseComposition +@pytest.mark.filterwarnings("ignore:TypeError occurred during set_params") +@pytest.mark.filterwarnings("ignore:Estimator's parameters changed after") +@pytest.mark.parametrize( + "estimator", + [StackingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))]), + StackingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))])], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_check_estimators_stacking_estimator(estimator): + check_estimator(estimator) + check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) + + +def test_stacking_classifier_stratify_default(): + # check that we stratify the classes for the default CV + clf = StackingClassifier( + estimators=[('lr', LogisticRegression(max_iter=1e4)), + ('svm', LinearSVC(max_iter=1e4))] + ) + # since iris is not shuffled, a simple k-fold would not contain the + # 3 classes during training + clf.fit(X_iris, y_iris) + + +@pytest.mark.parametrize( + "stacker, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC(random_state=42))], + 
final_estimator=LogisticRegression(), + cv=KFold(shuffle=True, random_state=42)), + *load_breast_cancer(return_X_y=True)), + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=42))], + final_estimator=LinearRegression(), + cv=KFold(shuffle=True, random_state=42)), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_with_sample_weight(stacker, X, y): + # check that sample weights has an influence on the fitting + # note: ConvergenceWarning are catch since we are not worrying about the + # convergence here + n_half_samples = len(y) // 2 + total_sample_weight = np.array( + [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples) + ) + X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split( + X, y, total_sample_weight, random_state=42 + ) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train) + y_pred_no_weight = stacker.predict(X_test) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape)) + y_pred_unit_weight = stacker.predict(X_test) + + assert_allclose(y_pred_no_weight, y_pred_unit_weight) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=sample_weight_train) + y_pred_biased = stacker.predict(X_test) + + assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize( + "stacker, X, y", + [(StackingClassifier( + estimators=[('lr', LogisticRegression()), + ('svm', LinearSVC(random_state=42))], + final_estimator=LogisticRegression()), + *load_breast_cancer(return_X_y=True)), + (StackingRegressor( + estimators=[('lr', LinearRegression()), + ('svm', LinearSVR(random_state=42))], + final_estimator=LinearRegression()), + X_diabetes, y_diabetes)], + ids=['StackingClassifier', 'StackingRegressor'] +) +def test_stacking_cv_influence(stacker, X, y): + # check that the stacking affects the fit of the final estimator but not + # the fit of the base estimators + # note: ConvergenceWarning are catch since we are not worrying about the + # convergence here + stacker_cv_3 = clone(stacker) + stacker_cv_5 = clone(stacker) + + stacker_cv_3.set_params(cv=3) + stacker_cv_5.set_params(cv=5) + + stacker_cv_3.fit(X, y) + stacker_cv_5.fit(X, y) + + # the base estimators should be identical + for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, + stacker_cv_5.estimators_): + assert_allclose(est_cv_3.coef_, est_cv_5.coef_) + + # the final estimator should be different + with pytest.raises(AssertionError, match='Not equal'): + assert_allclose(stacker_cv_3.final_estimator_.coef_, + stacker_cv_5.final_estimator_.coef_) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index bbfb91751726a..e2fce1eb2e918 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -328,7 +328,7 @@ def test_sample_weight(): voting='soft') msg = ('Underlying estimator KNeighborsClassifier does not support ' 'sample weights.') - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): eclf3.fit(X, y, sample_weight) # check that _parallel_fit_estimator will raise the right error @@ -524,7 +524,7 @@ def test_none_estimator_with_weights(X, y, voter, drop): ids=['VotingRegressor', 'VotingClassifier'] ) def test_check_estimators_voting_estimator(estimator): - # FIXME: to be 
removed when meta-estimators can be specified themselves + # FIXME: to be removed when meta-estimators can specified themselves # their testing parameters (for required parameters). check_estimator(estimator) check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index c7bdac82c7c62..dbc8a2b7bff93 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -23,6 +23,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from .base import _parallel_fit_estimator from ..preprocessing import LabelEncoder from ..utils import Bunch from ..utils.validation import check_is_fitted @@ -32,23 +33,6 @@ from ..exceptions import NotFittedError -def _parallel_fit_estimator(estimator, X, y, sample_weight=None): - """Private function used to fit an estimator within a job.""" - if sample_weight is not None: - try: - estimator.fit(X, y, sample_weight=sample_weight) - except TypeError as exc: - if "unexpected keyword argument 'sample_weight'" in str(exc): - raise ValueError( - "Underlying estimator {} does not support sample weights." - .format(estimator.__class__.__name__) - ) from exc - raise - else: - estimator.fit(X, y) - return estimator - - class _BaseVoting(TransformerMixin, _BaseComposition): """Base class for voting. diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index b3ee42eaef8a3..676d3676fb8c1 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -13,24 +13,21 @@ cdef float EPSILON_DBL = 1e-8 cdef float PERPLEXITY_TOLERANCE = 1e-5 cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( - np.ndarray[np.float32_t, ndim=2] affinities, - np.ndarray[np.int64_t, ndim=2] neighbors, + np.ndarray[np.float32_t, ndim=2] sqdistances, float desired_perplexity, int verbose): """Binary search for sigmas of conditional Gaussians. This approximation reduces the computational complexity from O(N^2) to - O(uN). See the exact method '_binary_search_perplexity' for more details. + O(uN). Parameters ---------- - affinities : array-like, shape (n_samples, k) - Distances between training samples and its k nearest neighbors. - - neighbors : array-like, shape (n_samples, k) or None - Each row contains the indices to the k nearest neigbors. If this - array is None, then the perplexity is estimated over all data - not just the nearest neighbors. + sqdistances : array-like, shape (n_samples, n_neighbors) + Distances between training samples and their k nearest neighbors. + When using the exact method, this is a square (n_samples, n_samples) + distance matrix. The TSNE default metric is "euclidean" which is + interpreted as squared euclidean distance. desired_perplexity : float Desired perplexity (2^entropy) of the conditional Gaussians. 
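The signature change above means callers of _binary_search_perplexity now pass a dense (n_samples, n_neighbors) block of squared distances instead of a separate neighbors array. Below is an illustrative sketch (not part of the patch) of that layout, mirroring what _joint_probabilities_nn does later in this series; the dataset, n_neighbors and the variable names are assumptions, and the final call is left commented out because _binary_search_perplexity is a private Cython helper.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).randn(50, 5).astype(np.float32)
n_neighbors = 10

# Build a sparse k-NN distance graph; every row stores exactly n_neighbors
# entries, so the CSR data can be viewed as a dense (n_samples, n_neighbors)
# block once the indices are sorted.
nn = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
graph = nn.kneighbors_graph(mode='distance')
graph.sort_indices()

# The default TSNE metric is euclidean, interpreted as *squared* euclidean.
sqdistances = graph.data.reshape(X.shape[0], n_neighbors) ** 2
sqdistances = sqdistances.astype(np.float32, copy=False)

# P = _binary_search_perplexity(sqdistances, desired_perplexity=25.0, verbose=0)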
@@ -46,7 +43,9 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( # Maximum number of binary search steps cdef long n_steps = 100 - cdef long n_samples = affinities.shape[0] + cdef long n_samples = sqdistances.shape[0] + cdef long n_neighbors = sqdistances.shape[1] + cdef int using_neighbors = n_neighbors < n_samples # Precisions of conditional Gaussian distributions cdef float beta cdef float beta_min @@ -61,11 +60,6 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( cdef float sum_Pi cdef float sum_disti_Pi cdef long i, j, k, l - cdef long n_neighbors = n_samples - cdef int using_neighbors = neighbors is not None - - if using_neighbors: - n_neighbors = neighbors.shape[1] # This array is later used as a 32bit array. It has multiple intermediate # floating point additions that benefit from the extra precision @@ -85,7 +79,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( sum_Pi = 0.0 for j in range(n_neighbors): if j != i or using_neighbors: - P[i, j] = math.exp(-affinities[i, j] * beta) + P[i, j] = math.exp(-sqdistances[i, j] * beta) sum_Pi += P[i, j] if sum_Pi == 0.0: @@ -94,7 +88,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( for j in range(n_neighbors): P[i, j] /= sum_Pi - sum_disti_Pi += affinities[i, j] * P[i, j] + sum_disti_Pi += sqdistances[i, j] * P[i, j] entropy = math.log(sum_Pi) + beta * sum_disti_Pi entropy_diff = entropy - desired_entropy diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index e512ce565553d..93d7a17eca9db 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -6,7 +6,8 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin from ..neighbors import NearestNeighbors, kneighbors_graph -from ..utils import check_array +from ..utils.deprecation import deprecated +from ..utils.validation import check_is_fitted from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -58,12 +59,35 @@ class Isomap(TransformerMixin, BaseEstimator): Algorithm to use for nearest neighbors search, passed to neighbors.NearestNeighbors instance. - n_jobs : int or None, optional (default=None) + n_jobs : int or None, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. + metric : string, or callable, default="minkowski" + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`Glossary `. + + .. versionadded:: 0.22 + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + .. versionadded:: 0.22 + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.22 + Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -73,9 +97,6 @@ class Isomap(TransformerMixin, BaseEstimator): :class:`~sklearn.decomposition.KernelPCA` object used to implement the embedding. 
- training_data_ : array-like, shape (n_samples, n_features) - Stores the training data. - nbrs_ : sklearn.neighbors.NearestNeighbors instance Stores nearest neighbors instance, including BallTree or KDtree if applicable. @@ -104,7 +125,8 @@ class Isomap(TransformerMixin, BaseEstimator): def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', - neighbors_algorithm='auto', n_jobs=None): + neighbors_algorithm='auto', n_jobs=None, metric='minkowski', + p=2, metric_params=None): self.n_neighbors = n_neighbors self.n_components = n_components self.eigen_solver = eigen_solver @@ -113,14 +135,19 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.path_method = path_method self.neighbors_algorithm = neighbors_algorithm self.n_jobs = n_jobs + self.metric = metric + self.p = p + self.metric_params = metric_params def _fit_transform(self, X): - X = self._validate_X(X, accept_sparse='csr') self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, + metric=self.metric, p=self.p, + metric_params=self.metric_params, n_jobs=self.n_jobs) self.nbrs_.fit(X) - self.training_data_ = self.nbrs_._fit_X + self.n_features_in_ = self.nbrs_.n_features_in_ + self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel="precomputed", eigen_solver=self.eigen_solver, @@ -128,6 +155,8 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) kng = kneighbors_graph(self.nbrs_, self.n_neighbors, + metric=self.metric, p=self.p, + metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs) self.dist_matrix_ = graph_shortest_path(kng, @@ -138,6 +167,13 @@ def _fit_transform(self, X): self.embedding_ = self.kernel_pca_.fit_transform(G) + @property + @deprecated("Attribute training_data_ was deprecated in version 0.22 and " + "will be removed in 0.24.") + def training_data_(self): + check_is_fitted(self) + return self.nbrs_._fit_X + def reconstruction_error(self): """Compute the reconstruction error for the embedding. @@ -167,9 +203,9 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} + X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors} Sample data, shape = (n_samples, n_features), in the form of a - numpy array, precomputed tree, or NearestNeighbors + numpy array, sparse graph, precomputed tree, or NearestNeighbors object. y : Ignored @@ -186,7 +222,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree} + X : {array-like, sparse graph, BallTree, KDTree} Training vector, where n_samples in the number of samples and n_features is the number of features. @@ -212,21 +248,27 @@ def transform(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_queries, n_features) + If neighbors_algorithm='precomputed', X is assumed to be a + distance matrix or a sparse graph of shape + (n_queries, n_samples_fit). Returns ------- - X_new : array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_queries, n_components) """ - X = check_array(X) + check_is_fitted(self) distances, indices = self.nbrs_.kneighbors(X, return_distance=True) - # Create the graph of shortest distances from X to self.training_data_ - # via the nearest neighbors of X. + # Create the graph of shortest distances from X to + # training data via the nearest neighbors of X. 
# This can be done as a single array operation, but it potentially # takes a lot of memory. To avoid that, use a loop: - G_X = np.zeros((X.shape[0], self.training_data_.shape[0])) - for i in range(X.shape[0]): + + n_samples_fit = self.nbrs_.n_samples_fit_ + n_queries = distances.shape[0] + G_X = np.zeros((n_queries, n_samples_fit)) + for i in range(n_queries): G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 402153048eb14..dc77cff0f9da5 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -99,7 +99,7 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): """ knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X) X = knn._fit_X - n_samples = X.shape[0] + n_samples = knn.n_samples_fit_ ind = knn.kneighbors(X, return_distance=False)[:, 1:] data = barycenter_weights(X, X[ind], reg=reg) indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index e6a646d13ffd0..1052aeec9c955 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -19,7 +19,7 @@ from ..utils.extmath import _deterministic_vector_sign_flip from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel -from ..neighbors import kneighbors_graph +from ..neighbors import kneighbors_graph, NearestNeighbors def _graph_connected_component(graph, node_id): @@ -157,7 +157,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, Parameters ---------- - adjacency : array-like or sparse matrix, shape: (n_samples, n_samples) + adjacency : array-like or sparse graph, shape: (n_samples, n_samples) The adjacency matrix of the graph to embed. n_components : integer, optional, default 8 @@ -369,9 +369,14 @@ class SpectralEmbedding(BaseEstimator): affinity : string or callable, default : "nearest_neighbors" How to construct the affinity matrix. - - 'nearest_neighbors' : construct affinity matrix by knn graph - - 'rbf' : construct affinity matrix by rbf kernel - - 'precomputed' : interpret X as precomputed affinity matrix + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix by computing a radial basis + function (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. - callable : use passed in function as affinity the function takes in data matrix (n_samples, n_features) and return affinity matrix (n_samples, n_samples). 
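The new 'precomputed_nearest_neighbors' affinity documented above lets a sparse k-NN graph be computed once and reused by SpectralEmbedding. A minimal usage sketch, not part of the patch and modeled on the filtering test added further down; the dataset and parameter values are illustrative assumptions.

from sklearn.datasets import make_blobs
from sklearn.manifold import SpectralEmbedding
from sklearn.neighbors import NearestNeighbors

X, _ = make_blobs(n_samples=100, random_state=0)

# Precompute a sparse neighbors graph, possibly with more neighbors than
# the embedding needs; the estimator keeps only n_neighbors of them.
graph = NearestNeighbors(n_neighbors=10).fit(X).kneighbors_graph(
    X, mode='connectivity')

embedding = SpectralEmbedding(
    n_components=2, n_neighbors=5, random_state=0,
    affinity='precomputed_nearest_neighbors').fit_transform(graph)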
@@ -453,7 +458,8 @@ def __init__(self, n_components=2, affinity="nearest_neighbors", @property def _pairwise(self): - return self.affinity == "precomputed" + return self.affinity in ["precomputed", + "precomputed_nearest_neighbors"] def _get_affinity_matrix(self, X, Y=None): """Calculate the affinity matrix from data @@ -477,6 +483,13 @@ def _get_affinity_matrix(self, X, Y=None): if self.affinity == 'precomputed': self.affinity_matrix_ = X return self.affinity_matrix_ + if self.affinity == 'precomputed_nearest_neighbors': + estimator = NearestNeighbors(n_neighbors=self.n_neighbors, + n_jobs=self.n_jobs, + metric="precomputed").fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode='connectivity') + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + return self.affinity_matrix_ if self.affinity == 'nearest_neighbors': if sparse.issparse(X): warnings.warn("Nearest neighbors affinity currently does " @@ -507,12 +520,12 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. If affinity is "precomputed" - X : array-like, shape (n_samples, n_samples), + X : {array-like, sparse matrix}, shape (n_samples, n_samples), Interpret X as precomputed adjacency graph computed from samples. @@ -522,12 +535,13 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = self._validate_X(X, ensure_min_samples=2, estimator=self) + X = self._validate_X(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): - if self.affinity not in {"nearest_neighbors", "rbf", - "precomputed"}: + if self.affinity not in {"nearest_neighbors", "rbf", "precomputed", + "precomputed_nearest_neighbors"}: raise ValueError(("%s is not a valid affinity. Expected " "'precomputed', 'rbf', 'nearest_neighbors' " "or a callable.") % self.affinity) @@ -547,12 +561,12 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. If affinity is "precomputed" - X : array-like, shape (n_samples, n_samples), + X : {array-like, sparse matrix}, shape (n_samples, n_samples), Interpret X as precomputed adjacency graph computed from samples. diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 70732b8d6ac16..598b820263776 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -11,14 +11,14 @@ from time import time import numpy as np from scipy import linalg -import scipy.sparse as sp from scipy.spatial.distance import pdist from scipy.spatial.distance import squareform -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, issparse from ..neighbors import NearestNeighbors from ..base import BaseEstimator from ..utils import check_array from ..utils import check_random_state +from ..utils.validation import check_non_negative from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . 
import _utils @@ -53,14 +53,14 @@ def _joint_probabilities(distances, desired_perplexity, verbose): # the desired perplexity distances = distances.astype(np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( - distances, None, desired_perplexity, verbose) + distances, desired_perplexity, verbose) P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) return P -def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): +def _joint_probabilities_nn(distances, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. @@ -70,11 +70,9 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): Parameters ---------- - distances : array, shape (n_samples, k) - Distances of samples to its k nearest neighbors. - - neighbors : array, shape (n_samples, k) - Indices of the k nearest-neighbors for each samples. + distances : CSR sparse matrix, shape (n_samples, n_samples) + Distances of samples to its n_neighbors nearest neighbors. All other + distances are left to zero (and are not materialized in memory). desired_perplexity : float Desired perplexity of the joint probability distributions. @@ -90,17 +88,18 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): t0 = time() # Compute conditional probabilities such that they approximately match # the desired perplexity - n_samples, k = neighbors.shape - distances = distances.astype(np.float32, copy=False) - neighbors = neighbors.astype(np.int64, copy=False) + distances.sort_indices() + n_samples = distances.shape[0] + distances_data = distances.data.reshape(n_samples, -1) + distances_data = distances_data.astype(np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( - distances, neighbors, desired_perplexity, verbose) + distances_data, desired_perplexity, verbose) assert np.all(np.isfinite(conditional_P)), \ "All probabilities should be finite" # Symmetrize the joint probability distribution using sparse operations - P = csr_matrix((conditional_P.ravel(), neighbors.ravel(), - range(0, n_samples * k + 1, k)), + P = csr_matrix((conditional_P.ravel(), distances.indices, + distances.indptr), shape=(n_samples, n_samples)) P = P + P.T @@ -638,55 +637,35 @@ def __init__(self, n_components=2, perplexity=30.0, self.angle = angle def _fit(self, X, skip_num_points=0): - """Fit the model using X as training data. - - Note that sparse arrays can only be handled by method='exact'. - It is recommended that you convert your sparse array to dense - (e.g. `X.toarray()`) if it fits in memory, or otherwise using a - dimensionality reduction technique (e.g. TruncatedSVD). + """Private function to fit the model using X as training data.""" - Parameters - ---------- - X : array, shape (n_samples, n_features) or (n_samples, n_samples) - If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. Note that - when method='barnes_hut', X cannot be a sparse array and - will be converted to a 32 bit float array if need be. - Method='exact' allows sparse arrays and 64 bit floating point - inputs. - - skip_num_points : int (optional, default:0) - This does not compute the gradient for points with indices below - `skip_num_points`. This is useful when computing transforms of new - data where you'd like to keep the old data fixed. 
- """ if self.method not in ['barnes_hut', 'exact']: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") + if self.method == 'barnes_hut': + X = self._validate_X(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) + else: + X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " "used with metric=\"precomputed\".") if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") - if np.any(X < 0): - raise ValueError("All distances should be positive, the " - "precomputed distances given as X is not " - "correct") - if self.method == 'barnes_hut' and sp.issparse(X): - raise TypeError('A sparse matrix was passed, but dense ' - 'data is required for method="barnes_hut". Use ' - 'X.toarray() to convert to a dense numpy array if ' - 'the array is small enough for it to fit in ' - 'memory. Otherwise consider dimensionality ' - 'reduction techniques (e.g. TruncatedSVD)') - if self.method == 'barnes_hut': - X = self._validate_X(X, ensure_min_samples=2, - dtype=[np.float32, np.float64]) - else: - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + + check_non_negative(X, "TSNE.fit(). With metric='precomputed', X " + "should contain positive distances.") + + if self.method == "exact" and issparse(X): + raise TypeError( + 'TSNE with method="exact" does not accept sparse ' + 'precomputed distance matrix. Use method="barnes_hut" ' + 'or provide the dense distance matrix.') + if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " @@ -730,17 +709,19 @@ def _fit(self, X, skip_num_points=0): "or then equal to one") else: - # Cpmpute the number of nearest neighbors to find. + # Compute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. - k = min(n_samples - 1, int(3. * self.perplexity + 1)) + n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1)) if self.verbose: - print("[t-SNE] Computing {} nearest neighbors...".format(k)) + print("[t-SNE] Computing {} nearest neighbors..." + .format(n_neighbors)) # Find the nearest neighbors for every point - knn = NearestNeighbors(algorithm='auto', n_neighbors=k, + knn = NearestNeighbors(algorithm='auto', + n_neighbors=n_neighbors, metric=self.metric) t0 = time() knn.fit(X) @@ -750,12 +731,11 @@ def _fit(self, X, skip_num_points=0): n_samples, duration)) t0 = time() - distances_nn, neighbors_nn = knn.kneighbors( - None, n_neighbors=k) + distances_nn = knn.kneighbors_graph(mode='distance') duration = time() - t0 if self.verbose: - print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..." - .format(n_samples, duration)) + print("[t-SNE] Computed neighbors for {} samples " + "in {:.3f}s...".format(n_samples, duration)) # Free the memory used by the ball_tree del knn @@ -766,11 +746,11 @@ def _fit(self, X, skip_num_points=0): # the method was derived using the euclidean method as in the # input space. Not sure of the implication of using a different # metric. 
- distances_nn **= 2 + distances_nn.data **= 2 # compute the joint probability distribution for the input space - P = _joint_probabilities_nn(distances_nn, neighbors_nn, - self.perplexity, self.verbose) + P = _joint_probabilities_nn(distances_nn, self.perplexity, + self.verbose) if isinstance(self.init, np.ndarray): X_embedded = self.init @@ -869,7 +849,10 @@ def fit_transform(self, X, y=None): ---------- X : array, shape (n_samples, n_features) or (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. y : Ignored @@ -891,7 +874,8 @@ def fit(self, X, y=None): If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' - or 'coo'. + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. y : Ignored """ diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 4502ffdd6c33b..6122840a5ef33 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -1,6 +1,7 @@ from itertools import product import numpy as np from numpy.testing import assert_almost_equal, assert_array_almost_equal +import pytest from sklearn import datasets from sklearn import manifold @@ -114,6 +115,57 @@ def test_pipeline(): assert .9 < clf.score(X, y) +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining NearestNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = 'auto' + n_neighbors = 10 + + X, _ = datasets.make_blobs(random_state=0) + X2, _ = datasets.make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = pipeline.make_pipeline( + neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode='distance'), + manifold.Isomap(n_neighbors=n_neighbors, metric='precomputed')) + est_compact = manifold.Isomap(n_neighbors=n_neighbors, + neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_different_metric(): + # Test that the metric parameters work correctly, and default to euclidean + def custom_metric(x1, x2): + return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) + + # metric, p, is_euclidean + metrics = [('euclidean', 2, True), + ('manhattan', 1, False), + ('minkowski', 1, False), + ('minkowski', 2, True), + (custom_metric, 2, False)] + + X, _ = datasets.make_blobs(random_state=0) + reference = manifold.Isomap().fit_transform(X) + + for metric, p, is_euclidean in metrics: + embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X) + + if is_euclidean: + assert_array_almost_equal(embedding, reference) + else: + with pytest.raises(AssertionError, match='not almost equal'): + assert_array_almost_equal(embedding, reference) + + def test_isomap_clone_bug(): # regression test for bug reported in #6062 model = manifold.Isomap() diff --git a/sklearn/manifold/tests/test_spectral_embedding.py 
b/sklearn/manifold/tests/test_spectral_embedding.py index d9c066c474b1c..a1d790c699a16 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -12,6 +12,7 @@ from sklearn.manifold import spectral_embedding from sklearn.metrics.pairwise import rbf_kernel from sklearn.metrics import normalized_mutual_info_score +from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans from sklearn.datasets.samples_generator import make_blobs from sklearn.utils.extmath import _deterministic_vector_sign_flip @@ -125,7 +126,9 @@ def test_spectral_embedding_two_components(seed=36): assert normalized_mutual_info_score(true_label, label_) == 1.0 -def test_spectral_embedding_precomputed_affinity(seed=36): +@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], + ids=["dense", "sparse"]) +def test_spectral_embedding_precomputed_affinity(X, seed=36): # Test spectral embedding with precomputed kernel gamma = 1.0 se_precomp = SpectralEmbedding(n_components=2, affinity="precomputed", @@ -133,14 +136,33 @@ def test_spectral_embedding_precomputed_affinity(seed=36): se_rbf = SpectralEmbedding(n_components=2, affinity="rbf", gamma=gamma, random_state=np.random.RandomState(seed)) - embed_precomp = se_precomp.fit_transform(rbf_kernel(S, gamma=gamma)) - embed_rbf = se_rbf.fit_transform(S) + embed_precomp = se_precomp.fit_transform(rbf_kernel(X, gamma=gamma)) + embed_rbf = se_rbf.fit_transform(X) assert_array_almost_equal( se_precomp.affinity_matrix_, se_rbf.affinity_matrix_) assert _check_with_col_sign_flipping(embed_precomp, embed_rbf, 0.05) -def test_spectral_embedding_callable_affinity(seed=36): +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors( + n_neighbors=n_neighbors + additional_neighbors).fit(S) + graph = nn.kneighbors_graph(S, mode='connectivity') + embedding = SpectralEmbedding(random_state=0, n_components=2, + affinity='precomputed_nearest_neighbors', + n_neighbors=n_neighbors + ).fit(graph).embedding_ + results.append(embedding) + + assert_array_equal(results[0], results[1]) + + +@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], + ids=["dense", "sparse"]) +def test_spectral_embedding_callable_affinity(X, seed=36): # Test spectral embedding with callable affinity gamma = 0.9 kern = rbf_kernel(S, gamma=gamma) @@ -152,8 +174,8 @@ def test_spectral_embedding_callable_affinity(seed=36): se_rbf = SpectralEmbedding(n_components=2, affinity="rbf", gamma=gamma, random_state=np.random.RandomState(seed)) - embed_rbf = se_rbf.fit_transform(S) - embed_callable = se_callable.fit_transform(S) + embed_rbf = se_rbf.fit_transform(X) + embed_callable = se_callable.fit_transform(X) assert_array_almost_equal( se_callable.affinity_matrix_, se_rbf.affinity_matrix_) assert_array_almost_equal(kern, se_rbf.affinity_matrix_) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 0b11e327256f6..34662604892af 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -3,11 +3,12 @@ import numpy as np from numpy.testing import assert_allclose import scipy.sparse as sp - import pytest -from sklearn.neighbors import BallTree from sklearn.neighbors import NearestNeighbors +from sklearn.neighbors import kneighbors_graph +from sklearn.exceptions import EfficiencyWarning +from sklearn.utils.testing import 
ignore_warnings from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -104,13 +105,10 @@ def flat_function(_, compute_error=True): def test_binary_search(): # Test if the binary search finds Gaussians with desired perplexity. random_state = check_random_state(0) - distances = random_state.randn(50, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(50, 5) + distances = pairwise_distances(data).astype(np.float32) desired_perplexity = 25.0 - P = _binary_search_perplexity(distances, None, desired_perplexity, - verbose=0) + P = _binary_search_perplexity(distances, desired_perplexity, verbose=0) P = np.maximum(P, np.finfo(np.double).eps) mean_perplexity = np.mean([np.exp(-np.sum(P[i] * np.log(P[i]))) for i in range(P.shape[0])]) @@ -124,34 +122,34 @@ def test_binary_search_neighbors(): n_samples = 200 desired_perplexity = 25.0 random_state = check_random_state(0) - distances = random_state.randn(n_samples, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) - P1 = _binary_search_perplexity(distances, None, desired_perplexity, - verbose=0) + data = random_state.randn(n_samples, 2).astype(np.float32, copy=False) + distances = pairwise_distances(data) + P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0) # Test that when we use all the neighbors the results are identical - k = n_samples - neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64, - copy=False) - distances_nn = np.array([distances[k, neighbors_nn[k]] - for k in range(n_samples)]) - P2 = _binary_search_perplexity(distances_nn, neighbors_nn, - desired_perplexity, verbose=0) - P_nn = np.array([P1[k, neighbors_nn[k]] for k in range(n_samples)]) - assert_array_almost_equal(P_nn, P2, decimal=4) - - # Test that the highest P_ij are the same when few neighbors are used - for k in np.linspace(80, n_samples, 5): + n_neighbors = n_samples - 1 + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, + mode='distance') + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, n_neighbors) + P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + + indptr = distance_graph.indptr + P1_nn = np.array([P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]] + for k in range(n_samples)]) + assert_array_almost_equal(P1_nn, P2, decimal=4) + + # Test that the highest P_ij are the same when fewer neighbors are used + for k in np.linspace(150, n_samples - 1, 5): k = int(k) - topn = k * 10 # check the top 10 *k entries out of k * k entries - neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64, - copy=False) - distances_nn = np.array([distances[k, neighbors_nn[k]] - for k in range(n_samples)]) - P2k = _binary_search_perplexity(distances_nn, neighbors_nn, - desired_perplexity, verbose=0) + topn = k * 10 # check the top 10 * k entries out of k * k entries + distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance') + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, k) + P2k = _binary_search_perplexity(distances_nn, desired_perplexity, + verbose=0) + assert_array_almost_equal(P1_nn, P2, decimal=2) 
idx = np.argsort(P1.ravel())[::-1] P1top = P1.ravel()[idx][:topn] idx = np.argsort(P2k.ravel())[::-1] @@ -163,20 +161,22 @@ def test_binary_perplexity_stability(): # Binary perplexity search should be stable. # The binary_search_perplexity had a bug wherein the P array # was uninitialized, leading to sporadically failing tests. - k = 10 + n_neighbors = 10 n_samples = 100 random_state = check_random_state(0) - distances = random_state.randn(n_samples, 2).astype(np.float32) - # Distances shouldn't be negative - distances = np.abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(n_samples, 5) + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, + mode='distance') + distances = distance_graph.data.astype(np.float32, copy=False) + distances = distances.reshape(n_samples, n_neighbors) last_P = None - neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64, - copy=False) + desired_perplexity = 3 for _ in range(100): - P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(), - 3, verbose=0) - P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0) + P = _binary_search_perplexity(distances.copy(), desired_perplexity, + verbose=0) + P1 = _joint_probabilities_nn(distance_graph, desired_perplexity, + verbose=0) # Convert the sparse matrix to a dense one for testing P1 = P1.toarray() if last_P is None: @@ -262,14 +262,15 @@ def test_optimization_minimizes_kl_divergence(): assert kl_divergences[2] <= kl_divergences[1] -def test_fit_csr_matrix(): +@pytest.mark.parametrize('method', ['exact', 'barnes_hut']) +def test_fit_csr_matrix(method): # X can be a sparse matrix. random_state = check_random_state(0) X = random_state.randn(50, 2) X[(np.random.randint(0, 50, 25), np.random.randint(0, 2, 25))] = 0.0 X_csr = sp.csr_matrix(X) tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0, - random_state=0, method='exact', n_iter=750) + random_state=0, method=method, n_iter=750) X_embedded = tsne.fit_transform(X_csr) assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0, rtol=1.1e-1) @@ -295,8 +296,8 @@ def test_trustworthiness_not_euclidean_metric(): random_state = check_random_state(0) X = random_state.randn(100, 2) assert (trustworthiness(X, X, metric='cosine') == - trustworthiness(pairwise_distances(X, metric='cosine'), X, - metric='precomputed')) + trustworthiness(pairwise_distances(X, metric='cosine'), X, + metric='precomputed')) def test_early_exaggeration_too_small(): @@ -313,20 +314,55 @@ def test_too_few_iterations(): tsne.fit_transform(np.array([[0.0], [0.0]])) -def test_non_square_precomputed_distances(): - # Precomputed distance matrices must be square matrices. 
+@pytest.mark.parametrize('method, retype', [ + ('exact', np.asarray), + ('barnes_hut', np.asarray), + ('barnes_hut', sp.csr_matrix), +]) +@pytest.mark.parametrize('D, message_regex', [ + ([[0.0], [1.0]], ".* square distance matrix"), + ([[0., -1.], [1., 0.]], ".* positive.*"), +]) +def test_bad_precomputed_distances(method, D, retype, message_regex): + tsne = TSNE(metric="precomputed", method=method) + with pytest.raises(ValueError, match=message_regex): + tsne.fit_transform(retype(D)) + + +def test_exact_no_precomputed_sparse(): + tsne = TSNE(metric='precomputed', method='exact') + with pytest.raises(TypeError, match='sparse'): + tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]])) + + +def test_high_perplexity_precomputed_sparse_distances(): + # Perplexity should be less than 50 + dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]]) + bad_dist = sp.csr_matrix(dist) tsne = TSNE(metric="precomputed") - with pytest.raises(ValueError, match=".* square distance matrix"): - tsne.fit_transform(np.array([[0.0], [1.0]])) + msg = "3 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(bad_dist) + + +@ignore_warnings(category=EfficiencyWarning) +def test_sparse_precomputed_distance(): + """Make sure that TSNE works identically for sparse and dense matrix""" + random_state = check_random_state(0) + X = random_state.randn(100, 2) + D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance', + include_self=True) + D = pairwise_distances(X) + assert sp.issparse(D_sparse) + assert_almost_equal(D_sparse.A, D) -def test_non_positive_precomputed_distances(): - # Precomputed distance matrices must be positive. - bad_dist = np.array([[0., -1.], [1., 0.]]) - for method in ['barnes_hut', 'exact']: - tsne = TSNE(metric="precomputed", method=method) - with pytest.raises(ValueError, match="All distances .*precomputed.*"): - tsne.fit_transform(bad_dist) + tsne = TSNE(metric="precomputed", random_state=0) + Xt_dense = tsne.fit_transform(D) + + for fmt in ['csr', 'lil']: + Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt)) + assert_almost_equal(Xt_dense, Xt_sparse) def test_non_positive_computed_distances(): @@ -563,17 +599,6 @@ def test_reduction_to_one_component(): assert(np.all(np.isfinite(X_embedded))) -def test_no_sparse_on_barnes_hut(): - # No sparse matrices allowed on Barnes-Hut. 
- random_state = check_random_state(0) - X = random_state.randn(100, 2) - X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0 - X_csr = sp.csr_matrix(X) - tsne = TSNE(n_iter=199, method='barnes_hut') - with pytest.raises(TypeError, match="A sparse matrix was.*"): - tsne.fit_transform(X_csr) - - @pytest.mark.parametrize('method', ['barnes_hut', 'exact']) @pytest.mark.parametrize('dt', [np.float32, np.float64]) def test_64bit(method, dt): @@ -616,25 +641,17 @@ def test_barnes_hut_angle(): degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) - distances = random_state.randn(n_samples, n_features) - distances = distances.astype(np.float32) - distances = abs(distances.dot(distances.T)) - np.fill_diagonal(distances, 0.0) + data = random_state.randn(n_samples, n_features) + distances = pairwise_distances(data) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) - k = n_samples - 1 - bt = BallTree(distances) - distances_nn, neighbors_nn = bt.query(distances, k=k + 1) - neighbors_nn = neighbors_nn[:, 1:] - distances_nn = np.array([distances[i, neighbors_nn[i]] - for i in range(n_samples)]) - assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\ - abs(distances[0, neighbors_nn[0]] - distances_nn[0]) - P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn, - perplexity, verbose=0) + n_neighbors = n_samples - 1 + distances_csr = NearestNeighbors().fit(data).kneighbors_graph( + n_neighbors=n_neighbors, mode='distance') + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 550cab3c01bca..85cc9c3e6a0ad 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -7,6 +7,7 @@ from .kd_tree import KDTree from .dist_metrics import DistanceMetric from .graph import kneighbors_graph, radius_neighbors_graph +from .graph import KNeighborsTransformer, RadiusNeighborsTransformer from .unsupervised import NearestNeighbors from .classification import KNeighborsClassifier, RadiusNeighborsClassifier from .regression import KNeighborsRegressor, RadiusNeighborsRegressor @@ -21,10 +22,12 @@ 'KDTree', 'KNeighborsClassifier', 'KNeighborsRegressor', + 'KNeighborsTransformer', 'NearestCentroid', 'NearestNeighbors', 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'kneighbors_graph', 'radius_neighbors_graph', 'KernelDensity', diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 37b77f16f7920..28d0483ac9b5b 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -26,7 +26,8 @@ from ..utils import check_X_y, check_array, gen_even_slices from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..exceptions import DataConversionWarning +from ..utils.validation import check_non_negative +from ..exceptions import DataConversionWarning, EfficiencyWarning VALID_METRICS = dict(ball_tree=BallTree.valid_metrics, kd_tree=KDTree.valid_metrics, @@ -103,6 +104,187 @@ def _get_weights(dist, weights): "'distance', or a callable function") +def _is_sorted_by_data(graph): + """Returns whether the graph's non-zero entries are sorted by data + + The non-zero 
entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices() + - sorted by data, as after _check_precomputed(graph) + - not sorted. + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + Returns + ------- + res : boolean + Whether input graph is sorted by data + """ + assert graph.format == 'csr' + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return (out_of_order.sum() == out_of_order[line_change].sum()) + + +def _check_precomputed(X): + """Check precomputed distance matrix + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X) + check_non_negative(X, whom="precomputed distance matrix.") + return X + else: + graph = X + + if graph.format not in ('csr', 'csc', 'coo', 'lil'): + raise TypeError('Sparse matrix in {!r} format is not supported due to ' + 'its handling of explicit zeros'.format(graph.format)) + copied = graph.format != 'csr' + graph = check_array(graph, accept_sparse='csr') + check_non_negative(graph, whom="precomputed distance matrix.") + + if not _is_sorted_by_data(graph): + warnings.warn('Precomputed sparse input was not sorted by data.', + EfficiencyWarning) + if not copied: + graph = graph.copy() + + # if each sample has the same number of provided neighbors + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind='mergesort') + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind='mergesort') + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : boolean + If False, distances will not be returned + + Returns + ------- + neigh_dist : array, shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if return_distance=True. + + neigh_ind : array, shape (n_samples, n_neighbors) + Indices of nearest neighbors. 
+ """ + n_samples = graph.shape[0] + assert graph.format == 'csr' + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + '%d neighbors per samples are required, but some samples have only' + ' %d neighbors in precomputed graph matrix. Decrease number of ' + 'neighbors used or recompute the graph with more neighbors.' + % (n_neighbors, row_nnz_min)) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode='clip').reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices + + Parameters + ---------- + graph : CSR sparse matrix, shape (n_samples, n_samples) + Neighbors graph as given by kneighbors_graph or radius_neighbors_graph + + radius : float > 0 + Radius of neighborhoods. + + return_distance : boolean + If False, distances will not be returned + + Returns + ------- + neigh_dist : array, shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if return_distance=True. + + neigh_ind :array, shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == 'csr' + + no_filter_needed = graph.data.max() <= radius + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = np.array(np.split(data, indptr[1:-1])) + neigh_ind = np.array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """Base class for nearest neighbors estimators.""" @@ -192,21 +374,28 @@ def _fit(self, X): self._fit_X = X._fit_X self._tree = X._tree self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ return self elif isinstance(X, BallTree): self._fit_X = X.data self._tree = X self._fit_method = 'ball_tree' + self.n_samples_fit_ = X.data.shape[0] return self elif isinstance(X, KDTree): self._fit_X = X.data self._tree = X self._fit_method = 'kd_tree' + self.n_samples_fit_ = X.data.shape[0] return self - X = self._validate_X(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + self.n_features_in_ = X.shape[1] + else: + X = self._validate_X(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: @@ -233,10 +422,12 @@ def _fit(self, X): self._fit_X = X.copy() self._tree = None self._fit_method = 'brute' + self.n_samples_fit_ = X.shape[0] return self self._fit_method = self.algorithm self._fit_X = X + self.n_samples_fit_ = X.shape[0] if self._fit_method == 'auto': # A tree approach is better for small number of neighbors, @@ -289,13 +480,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, 
n_neighbors, return_distance): +def _tree_query_parallel_helper(tree, *args, **kwargs): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance) + return tree.query(*args, **kwargs) class KNeighborsMixin: @@ -342,8 +533,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -357,11 +548,11 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Returns ------- - dist : array + neigh_dist : array, shape (n_queries, n_neighbors) Array representing the lengths to points, only present if return_distance=True - ind : array + neigh_ind : array, shape (n_queries, n_neighbors) Indices of the nearest points in the population matrix. Examples @@ -406,7 +597,10 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -414,28 +608,34 @@ class from an array representing our data set and ask who's # returned, which is removed later n_neighbors += 1 - train_size = self._fit_X.shape[0] - if n_neighbors > train_size: + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " " but n_samples = %d, n_neighbors = %d" % - (train_size, n_neighbors) + (n_samples_fit, n_neighbors) ) - n_samples, _ = X.shape - sample_range = np.arange(n_samples)[:, None] n_jobs = effective_n_jobs(self.n_jobs) - if self._fit_method == 'brute': + chunked_results = None + if (self._fit_method == 'brute' and + self.effective_metric_ == 'precomputed' and issparse(X)): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, + return_distance=return_distance) + elif self._fit_method == 'brute': reduce_func = partial(self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance) # for efficiency, use squared euclidean distances - kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' - else self.effective_metric_params_) + if self.effective_metric_ == 'euclidean': + kwds = {'squared': True} + else: + kwds = self.effective_metric_params_ - result = list(pairwise_distances_chunked( + chunked_results = list(pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **kwds)) @@ -456,7 +656,7 @@ class from an array representing our data set and ask who's else: delayed_query = delayed(_tree_query_parallel_helper) parallel_kwargs = {"prefer": "threads"} - result = Parallel(n_jobs, **parallel_kwargs)( + chunked_results = Parallel(n_jobs, **parallel_kwargs)( delayed_query( self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) @@ -464,23 +664,26 @@ class from an array representing our data set and ask who's else: raise ValueError("internal: _fit_method not recognized") - if return_distance: - dist, neigh_ind = zip(*result) 
- result = np.vstack(dist), np.vstack(neigh_ind) - else: - result = np.vstack(result) + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) if not query_is_train: - return result + return results else: # If the query data is the same as the indexed data, we would like # to ignore the first nearest neighbor of every sample, i.e # the sample itself. if return_distance: - dist, neigh_ind = result + neigh_dist, neigh_ind = results else: - neigh_ind = result + neigh_ind = results + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] sample_mask = neigh_ind != sample_range # Corner case: When the number of duplicates are more @@ -489,14 +692,13 @@ class from an array representing our data set and ask who's # In that case mask the first duplicate. dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) + neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) if return_distance: - dist = np.reshape( - dist[sample_mask], (n_samples, n_neighbors - 1)) - return dist, neigh_ind + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + return neigh_dist, neigh_ind return neigh_ind def kneighbors_graph(self, X=None, n_neighbors=None, @@ -505,8 +707,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -522,7 +724,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit] + A : sparse graph in CSR format, shape = [n_queries, n_samples_fit] n_samples_fit is the number of samples in the fitted data A[i, j] is assigned the weight of edge that connects i to j. @@ -547,21 +749,13 @@ def kneighbors_graph(self, X=None, n_neighbors=None, if n_neighbors is None: n_neighbors = self.n_neighbors - # kneighbors does the None handling. 
- if X is not None: - X = check_array(X, accept_sparse='csr') - n_samples1 = X.shape[0] - else: - n_samples1 = self._fit_X.shape[0] - - n_samples2 = self._fit_X.shape[0] - n_nonzero = n_samples1 * n_neighbors - A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + # check the input only in self.kneighbors # construct CSR matrix representation of the k-NN graph if mode == 'connectivity': - A_data = np.ones(n_samples1 * n_neighbors) A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) elif mode == 'distance': A_data, A_ind = self.kneighbors( @@ -573,19 +767,24 @@ def kneighbors_graph(self, X=None, n_neighbors=None, 'Unsupported mode, must be one of "connectivity" ' 'or "distance" but got "%s" instead' % mode) + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + kneighbors_graph = csr_matrix((A_data, A_ind.ravel(), A_indptr), - shape=(n_samples1, n_samples2)) + shape=(n_queries, n_samples_fit)) return kneighbors_graph -def _tree_query_radius_parallel_helper(tree, data, radius, return_distance): +def _tree_query_radius_parallel_helper(tree, *args, **kwargs): """Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors The Cython method tree.query_radius is not directly picklable by cloudpickle under PyPy. """ - return tree.query_radius(data, radius, return_distance) + return tree.query_radius(*args, **kwargs) class RadiusNeighborsMixin: @@ -625,7 +824,8 @@ def _radius_neighbors_reduce_func(self, dist, start, results = neigh_ind return results - def radius_neighbors(self, X=None, radius=None, return_distance=True): + def radius_neighbors(self, X=None, radius=None, return_distance=True, + sort_results=False): """Finds the neighbors within a given radius of a point or points. Return the indices and distances of each point from the dataset @@ -647,16 +847,24 @@ def radius_neighbors(self, X=None, radius=None, return_distance=True): (default is the value passed to the constructor). return_distance : boolean, optional. Defaults to True. - If False, distances will not be returned + If False, distances will not be returned. + + sort_results : boolean, optional. Defaults to False. + If True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. + + .. versionadded:: 0.22 Returns ------- - dist : array, shape (n_samples,) of arrays + neigh_dist : array, shape (n_samples,) of arrays Array representing the distances to each point, only present if return_distance=True. The distance values are computed according to the ``metric`` constructor parameter. - ind : array, shape (n_samples,) of arrays + neigh_ind : array, shape (n_samples,) of arrays An array of arrays of indices of the approximate nearest points from the population matrix that lie within a ball of size ``radius`` around the query points. 
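A minimal usage sketch for the ``sort_results`` flag documented above; the
sample data, radius, and shapes are assumptions made only for illustration.

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> rng = np.random.RandomState(0)
    >>> X = rng.rand(20, 3)
    >>> neigh = NearestNeighbors(radius=0.5).fit(X)
    >>> neigh_dist, neigh_ind = neigh.radius_neighbors(X, sort_results=True)
    >>> # each row of neigh_dist is now sorted in ascending order, and
    >>> # the matching row of neigh_ind is reordered accordingly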
@@ -695,7 +903,10 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ == 'precomputed': + X = _check_precomputed(X) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -703,7 +914,12 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - if self._fit_method == 'brute': + if (self._fit_method == 'brute' and + self.effective_metric_ == 'precomputed' and issparse(X)): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance) + + elif self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': radius *= radius @@ -715,23 +931,23 @@ class from an array representing our data set and ask who's radius=radius, return_distance=return_distance) - results = pairwise_distances_chunked( + chunked_results = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, **kwds) if return_distance: - dist_chunks, neigh_ind_chunks = zip(*results) - dist_list = sum(dist_chunks, []) + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = sum(neigh_dist_chunks, []) neigh_ind_list = sum(neigh_ind_chunks, []) # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. - dist = np.empty(len(dist_list), dtype='object') - dist[:] = dist_list + # to understand why this is initialized this way. + neigh_dist = np.empty(len(neigh_dist_list), dtype='object') + neigh_dist[:] = neigh_dist_list neigh_ind = np.empty(len(neigh_ind_list), dtype='object') neigh_ind[:] = neigh_ind_list - results = dist, neigh_ind + results = neigh_dist, neigh_ind else: - neigh_ind_list = sum(results, []) + neigh_ind_list = sum(chunked_results, []) results = np.empty(len(neigh_ind_list), dtype='object') results[:] = neigh_ind_list @@ -750,15 +966,18 @@ class from an array representing our data set and ask who's else: delayed_query = delayed(_tree_query_radius_parallel_helper) parallel_kwargs = {"prefer": "threads"} - results = Parallel(n_jobs, **parallel_kwargs)( - delayed_query(self._tree, X[s], radius, return_distance) + + chunked_results = Parallel(n_jobs, **parallel_kwargs)( + delayed_query(self._tree, X[s], radius, return_distance, + sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) ) if return_distance: - neigh_ind, dist = tuple(zip(*results)) - results = np.hstack(dist), np.hstack(neigh_ind) + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) else: - results = np.hstack(results) + results = np.hstack(chunked_results) else: raise ValueError("internal: _fit_method not recognized") @@ -769,7 +988,7 @@ class from an array representing our data set and ask who's # to ignore the first nearest neighbor of every sample, i.e # the sample itself. 
if return_distance: - dist, neigh_ind = results + neigh_dist, neigh_ind = results else: neigh_ind = results @@ -778,13 +997,14 @@ class from an array representing our data set and ask who's neigh_ind[ind] = ind_neighbor[mask] if return_distance: - dist[ind] = dist[ind][mask] + neigh_dist[ind] = neigh_dist[ind][mask] if return_distance: - return dist, neigh_ind + return neigh_dist, neigh_ind return neigh_ind - def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): + def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity', + sort_results=False): """Computes the (weighted) graph of Neighbors for points in X Neighborhoods are restricted the points at a distance lower than @@ -792,7 +1012,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): Parameters ---------- - X : array-like, shape = [n_samples, n_features], optional + X : array-like, shape = [n_queries, n_features], optional The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -806,9 +1026,17 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): connectivity matrix with ones and zeros, in 'distance' the edges are Euclidean distance between points. + sort_results : boolean, optional. Defaults to False. + If True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. + Only used with mode='distance'. + + .. versionadded:: 0.22 + Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_queries, n_samples_fit] + n_samples_fit is the number of samples in the fitted data A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -829,10 +1057,9 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): kneighbors_graph """ check_is_fitted(self) - if X is not None: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - n_samples2 = self._fit_X.shape[0] + # check the input only in self.radius_neighbors + if radius is None: radius = self.radius @@ -843,14 +1070,16 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): A_data = None elif mode == 'distance': dist, A_ind = self.radius_neighbors(X, radius, - return_distance=True) + return_distance=True, + sort_results=sort_results) A_data = np.concatenate(list(dist)) else: raise ValueError( 'Unsupported mode, must be one of "connectivity", ' 'or "distance" but got %s instead' % mode) - n_samples1 = A_ind.shape[0] + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ n_neighbors = np.array([len(a) for a in A_ind]) A_ind = np.concatenate(list(A_ind)) if A_data is None: @@ -859,7 +1088,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): np.cumsum(n_neighbors))) return csr_matrix((A_data, A_ind, A_indptr), - shape=(n_samples1, n_samples2)) + shape=(n_queries, n_samples_fit)) class SupervisedFloatMixin: diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index a72f710ae57ea..209bd93537166 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -74,6 +74,9 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -157,13 +160,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y : array of shape [n_queries] or [n_queries, n_outputs] Class labels for each data sample. """ X = check_array(X, accept_sparse='csr') @@ -176,10 +179,10 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - n_samples = _num_samples(X) + n_queries = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) - y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, classes_k in enumerate(classes_): if weights is None: mode, _ = stats.mode(_y[neigh_ind, k], axis=1) @@ -199,13 +202,13 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_queries, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -220,7 +223,7 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - n_samples = _num_samples(X) + n_queries = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) if weights is None: @@ -230,7 +233,7 @@ def predict_proba(self, X): probabilities = [] for k, classes_k in enumerate(classes_): pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_samples, classes_k.size)) + proba_k = np.zeros((n_queries, classes_k.size)) # a simple ':' index doesn't work right for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) @@ -303,6 +306,9 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. outlier_label : {manual label, 'most_frequent'}, optional (default = None) label for outlier samples (samples with no neighbors in given radius). @@ -448,13 +454,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y : array of shape [n_queries] or [n_queries, n_outputs] Class labels for each data sample. 
""" @@ -466,9 +472,8 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - n_samples = probs[0].shape[0] - y_pred = np.empty((n_samples, n_outputs), - dtype=classes_[0].dtype) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, prob in enumerate(probs): # iterate over multi-output, assign labels based on probabilities @@ -491,23 +496,23 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_queries, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. """ X = check_array(X, accept_sparse='csr') - n_samples = _num_samples(X) + n_queries = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = np.zeros(n_samples, dtype=np.bool) + outlier_mask = np.zeros(n_queries, dtype=np.bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] outliers = np.flatnonzero(outlier_mask) inliers = np.flatnonzero(~outlier_mask) @@ -535,7 +540,7 @@ def predict_proba(self, X): pred_labels = np.zeros(len(neigh_ind), dtype=object) pred_labels[:] = [_y[ind, k] for ind in neigh_ind] - proba_k = np.zeros((n_samples, classes_k.size)) + proba_k = np.zeros((n_queries, classes_k.size)) proba_inl = np.zeros((len(inliers), classes_k.size)) # samples have different size of neighbors within the same radius diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py index 3999ff458e121..da3954ff909c7 100644 --- a/sklearn/neighbors/graph.py +++ b/sklearn/neighbors/graph.py @@ -1,11 +1,15 @@ """Nearest Neighbors graph functions""" # Author: Jake Vanderplas +# Tom Dupre la Tour # # License: BSD 3 clause (C) INRIA, University of Amsterdam - from .base import KNeighborsMixin, RadiusNeighborsMixin +from .base import NeighborsBase +from .base import UnsupervisedMixin from .unsupervised import NearestNeighbors +from ..base import TransformerMixin +from ..utils.validation import check_is_fitted def _check_params(X, metric, p, metric_params): @@ -21,14 +25,16 @@ def _check_params(X, metric, p, metric_params): func_param, param_name, est_params[param_name])) -def _query_include_self(X, include_self): +def _query_include_self(X, include_self, mode): """Return the query based on include_self param""" - if include_self: - query = X._fit_X - else: - query = None + if include_self == 'auto': + include_self = mode == 'connectivity' - return query + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', @@ -65,10 +71,10 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', metric_params : dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False. + include_self : bool or 'auto', default=False Whether or not to mark each sample as the first nearest neighbor to - itself. If `None`, then True is used for mode='connectivity' and False - for mode='distance' as this will preserve backwards compatibility. + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. 
n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. @@ -78,7 +84,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -101,7 +107,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', else: _check_params(X, metric, p, metric_params) - query = _query_include_self(X, include_self) + query = _query_include_self(X._fit_X, include_self, mode) return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) @@ -143,10 +149,10 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', metric_params : dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False + include_self : bool or 'auto', default=False Whether or not to mark each sample as the first nearest neighbor to - itself. If `None`, then True is used for mode='connectivity' and False - for mode='distance' as this will preserve backwards compatibility. + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. @@ -156,7 +162,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A : sparse graph in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -180,5 +186,284 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', else: _check_params(X, metric, p, metric_params) - query = _query_include_self(X, include_self) + query = _query_include_self(X._fit_X, include_self, mode) return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, + UnsupervisedMixin, TransformerMixin): + """Transform X into a (weighted) graph of k nearest neighbors + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. 
This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default='minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=1 + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Examples + -------- + >>> from sklearn.manifold import Isomap + >>> from sklearn.neighbors import KNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> estimator = make_pipeline( + ... KNeighborsTransformer(n_neighbors=5, mode='distance'), + ... Isomap(neighbors_algorithm='precomputed')) + """ + def __init__(self, mode='distance', n_neighbors=5, algorithm='auto', + leaf_size=30, metric='minkowski', p=2, metric_params=None, + n_jobs=1): + super(KNeighborsTransformer, self).__init__( + n_neighbors=n_neighbors, radius=None, algorithm=algorithm, + leaf_size=leaf_size, metric=metric, p=p, + metric_params=metric_params, n_jobs=n_jobs) + self.mode = mode + + def transform(self, X): + """Computes the (weighted) graph of Neighbors for points in X + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + check_is_fitted(self) + add_one = self.mode == 'distance' + return self.kneighbors_graph(X, mode=self.mode, + n_neighbors=self.n_neighbors + add_one) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : ignored + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. 
+ """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, + UnsupervisedMixin, TransformerMixin): + """Transform X into a (weighted) graph of neighbors nearer than a radius + + The transformed data is a sparse graph as returned by + radius_neighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + radius : float, default=1. + Radius of neighborhood in the transformed sparse graph. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default='minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : int, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=1 + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Examples + -------- + >>> from sklearn.cluster import DBSCAN + >>> from sklearn.neighbors import RadiusNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> estimator = make_pipeline( + ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), + ... 
DBSCAN(min_samples=30, metric='precomputed')) + """ + def __init__(self, mode='distance', radius=1., algorithm='auto', + leaf_size=30, metric='minkowski', p=2, metric_params=None, + n_jobs=1): + super(RadiusNeighborsTransformer, self).__init__( + n_neighbors=None, radius=radius, algorithm=algorithm, + leaf_size=leaf_size, metric=metric, p=p, + metric_params=metric_params, n_jobs=n_jobs) + self.mode = mode + + def transform(self, X): + """Computes the (weighted) graph of Neighbors for points in X + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data + + Returns + ------- + Xt : CSR sparse graph of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, + sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : ignored + + Returns + ------- + Xt : CSR sparse graph, shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + """ + return self.fit(X).transform(X) diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index f4f697565cd3e..fa02bed235535 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -60,8 +60,9 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. - If 'precomputed', the training input X is expected to be a distance - matrix. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a sparse matrix, in which case only "nonzero" + elements may be considered neighbors. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable @@ -118,8 +119,6 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - Affects only :meth:`kneighbors` and :meth:`kneighbors_graph` methods. - Attributes ---------- @@ -239,7 +238,7 @@ def fit(self, X, y=None): super().fit(X) - n_samples = self._fit_X.shape[0] + n_samples = self.n_samples_fit_ if self.n_neighbors > n_samples: warnings.warn("n_neighbors (%s) is greater than the " "total number of samples (%s). 
n_neighbors " @@ -247,8 +246,8 @@ def fit(self, X, y=None): % (self.n_neighbors, n_samples)) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ = ( - self.kneighbors(None, n_neighbors=self.n_neighbors_)) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors( + n_neighbors=self.n_neighbors_) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_) @@ -320,7 +319,7 @@ def _predict(self, X=None): is_inlier = np.ones(X.shape[0], dtype=int) is_inlier[self.decision_function(X) < 0] = -1 else: - is_inlier = np.ones(self._fit_X.shape[0], dtype=int) + is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 return is_inlier @@ -475,17 +474,17 @@ def _local_reachability_density(self, distances_X, neighbors_indices): Parameters ---------- - distances_X : array, shape (n_query, self.n_neighbors) + distances_X : array, shape (n_queries, self.n_neighbors) Distances to the neighbors (in the training samples `self._fit_X`) of each query point to compute the LRD. - neighbors_indices : array, shape (n_query, self.n_neighbors) + neighbors_indices : array, shape (n_queries, self.n_neighbors) Neighbors indices (of each query point) among training samples self._fit_X. Returns ------- - local_reachability_density : array, shape (n_samples,) + local_reachability_density : array, shape (n_queries,) The local reachability density of each sample. """ dist_k = self._distances_fit_X_[neighbors_indices, diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index a8819b222c1bd..cbb033f0b8cae 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -13,7 +13,6 @@ import warnings import numpy as np -from scipy.sparse import issparse from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin from .base import RadiusNeighborsMixin, SupervisedFloatMixin @@ -78,6 +77,9 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -151,20 +153,15 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of int, shape = [n_samples] or [n_samples, n_outputs] + y : array of int, shape = [n_queries] or [n_queries, n_outputs] Target values """ - if issparse(X) and self.metric == 'precomputed': - raise ValueError( - "Sparse matrices not supported for prediction with " - "precomputed kernels. Densify your matrix." - ) X = check_array(X, accept_sparse='csr') neigh_dist, neigh_ind = self.kneighbors(X) @@ -249,6 +246,9 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -315,13 +315,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' + X : array-like, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of float, shape = [n_samples] or [n_samples, n_outputs] + y : array of float, shape = [n_queries] or [n_queries, n_outputs] Target values """ X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/tests/test_graph.py b/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000..b4f6ddb42ed06 --- /dev/null +++ b/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,79 @@ +import numpy as np + +from sklearn.metrics import euclidean_distances +from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors.base import _is_sorted_by_data + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ['distance', 'connectivity']: + add_one = mode == 'distance' + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), ) + assert Xt.format == 'csr' + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one), ) + assert X2t.format == 'csr' + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ['distance', 'connectivity']: + add_one = mode == 'distance' + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), ) + assert Xt.format == 'csr' + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one), ) + assert X2t.format == 'csr' + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using 
transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 3da1c2579700f..0d7166da64fd8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1,19 +1,22 @@ from itertools import product +import pytest import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) -import pytest - from sklearn import metrics from sklearn import neighbors, datasets +from sklearn.base import clone from sklearn.exceptions import DataConversionWarning +from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors.base import VALID_METRICS_SPARSE, VALID_METRICS +from sklearn.neighbors.base import _is_sorted_by_data, _check_precomputed +from sklearn.pipeline import make_pipeline from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises @@ -133,15 +136,15 @@ def test_not_fitted_error_gets_raised(): assert_raises(NotFittedError, neighbors_.radius_neighbors_graph, X) -def test_precomputed(random_state=42): +@ignore_warnings(category=EfficiencyWarning) +def check_precomputed(make_train_test, estimators): """Tests unsupervised NearestNeighbors with a distance matrix.""" # Note: smaller samples may result in spurious test success - rng = np.random.RandomState(random_state) + rng = np.random.RandomState(42) X = rng.random_sample((10, 4)) Y = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X, metric='euclidean') - DYX = metrics.pairwise_distances(Y, X, metric='euclidean') - for method in ['kneighbors']: + DXX, DYX = make_train_test(X, Y) + for method in ['kneighbors', ]: # TODO: also test radius_neighbors, but requires different assertion # As a feature matrix (n_samples by n_features) @@ -175,11 +178,7 @@ def test_precomputed(random_state=42): assert_raises(ValueError, getattr(nbrs_D, method), X) target = np.arange(X.shape[0]) - for Est in (neighbors.KNeighborsClassifier, - neighbors.RadiusNeighborsClassifier, - neighbors.KNeighborsRegressor, - neighbors.RadiusNeighborsRegressor): - print(Est) + for Est in estimators: est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) @@ -188,6 +187,118 @@ def test_precomputed(random_state=42): assert_array_almost_equal(pred_X, pred_D) +def test_precomputed_dense(): + def make_train_test(X_train, X_test): + return (metrics.pairwise_distances(X_train), + metrics.pairwise_distances(X_test, X_train)) + + estimators = [ + neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize('fmt', ['csr', 'lil']) +def test_precomputed_sparse_knn(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) + return (nn.kneighbors_graph(X_train, mode='distance').asformat(fmt), + nn.kneighbors_graph(X_test, mode='distance').asformat(fmt)) + + # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor + # since the 
precomputed neighbors graph is built with k neighbors only. + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize('fmt', ['csr', 'lil']) +def test_precomputed_sparse_radius(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(radius=1).fit(X_train) + return (nn.radius_neighbors_graph(X_train, + mode='distance').asformat(fmt), + nn.radius_neighbors_graph(X_test, + mode='distance').asformat(fmt)) + + # We do not test KNeighborsClassifier and KNeighborsRegressor + # since the precomputed neighbors graph is built with a radius. + estimators = [ + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +def test_is_sorted_by_data(): + # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, + # entries in each row can be sorted by indices, by data, or unsorted. + # _is_sorted_by_data should return True when entries are sorted by data, + # and False in all other cases. + + # Test with sorted 1D array + X = csr_matrix(np.arange(10)) + assert _is_sorted_by_data(X) + # Test with unsorted 1D array + X[0, 2] = 5 + assert not _is_sorted_by_data(X) + + # Test when the data is sorted in each sample, but not necessarily + # between samples + X = csr_matrix([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) + assert _is_sorted_by_data(X) + + # Test with duplicates entries in X.indptr + data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] + X = csr_matrix((data, indices, indptr), shape=(3, 3)) + assert _is_sorted_by_data(X) + + +@ignore_warnings(category=EfficiencyWarning) +def test_check_precomputed(): + # Test that _check_precomputed returns a graph sorted by data + X = csr_matrix(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + Xt = _check_precomputed(X) + assert _is_sorted_by_data(Xt) + + # est with a different number of nonzero entries for each sample + mask = np.random.RandomState(42).randint(2, size=(10, 10)) + X = X.toarray() + X[mask == 1] = 0 + X = csr_matrix(X) + assert not _is_sorted_by_data(X) + Xt = _check_precomputed(X) + assert _is_sorted_by_data(Xt) + + +@ignore_warnings(category=EfficiencyWarning) +def test_precomputed_sparse_invalid(): + dist = np.array([[0., 2., 1.], [2., 0., 3.], [1., 3., 0.]]) + dist_csr = csr_matrix(dist) + neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") + neigh.fit(dist_csr) + neigh.kneighbors(None, n_neighbors=1) + neigh.kneighbors(np.array([[0., 0., 0.]]), n_neighbors=2) + + # Ensures enough number of nearest neighbors + dist = np.array([[0., 2., 0.], [2., 0., 3.], [0., 3., 0.]]) + dist_csr = csr_matrix(dist) + neigh.fit(dist_csr) + msg = "2 neighbors per samples are required, but some samples have only 1" + assert_raises_regex(ValueError, msg, neigh.kneighbors, None, n_neighbors=1) + + # Checks error with inconsistent distance matrix + dist = np.array([[5., 2., 1.], [-2., 0., 3.], [1., 3., 0.]]) + dist_csr = csr_matrix(dist) + msg = "Negative values in data passed to precomputed distance matrix." 
+ assert_raises_regex(ValueError, msg, neigh.kneighbors, dist_csr, + n_neighbors=1) + + def test_precomputed_cross_validation(): # Ensure array is split correctly rng = np.random.RandomState(0) @@ -821,6 +932,7 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40, assert np.all(np.abs(y_pred - y_target) < 0.3) +@ignore_warnings(category=EfficiencyWarning) def test_kneighbors_regressor_sparse(n_samples=40, n_features=5, n_test_pts=10, @@ -846,10 +958,7 @@ def test_kneighbors_regressor_sparse(n_samples=40, assert np.mean(knn.predict(X2).round() == y) > 0.95 X2_pre = sparsev(pairwise_distances(X, metric='euclidean')) - if issparse(sparsev(X2_pre)): - assert_raises(ValueError, knn_pre.predict, X2_pre) - else: - assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 + assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 def test_neighbors_iris(): @@ -1318,6 +1427,7 @@ def test_k_and_radius_neighbors_duplicates(): rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode='distance') + rng.sort_indices() assert_array_equal(rng.A, [[0, 1], [1, 0]]) assert_array_equal(rng.indices, [0, 1, 0, 1]) assert_array_equal(rng.data, [0, 1, 1, 0]) @@ -1498,3 +1608,45 @@ def test_radius_neighbors_predict_proba(): proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) assert_array_equal(pred, proba_label) + + +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. + factor = 2 + + k_trans = neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, mode='distance') + k_trans_factor = neighbors.KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode='distance') + + r_trans = neighbors.RadiusNeighborsTransformer( + radius=radius, mode='distance') + r_trans_factor = neighbors.RadiusNeighborsTransformer( + radius=int(radius * factor), mode='distance') + + k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) + + test_list = [(k_trans, k_reg), (k_trans_factor, r_reg), + (r_trans, r_reg), (r_trans_factor, k_reg), ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric='precomputed') + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/sklearn/neighbors/tests/test_neighbors_pipeline.py b/sklearn/neighbors/tests/test_neighbors_pipeline.py new file mode 100644 index 0000000000000..455cca6937dc1 --- /dev/null +++ b/sklearn/neighbors/tests/test_neighbors_pipeline.py @@ -0,0 +1,221 @@ +""" +This is testing the equivalence between some estimators with internal nearest +neighbors computations, and the corresponding pipeline versions with +KNeighborsTransformer or RadiusNeighborsTransformer to precompute the +neighbors. 
+""" + +import numpy as np + +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets.samples_generator import make_blobs +from sklearn.pipeline import make_pipeline +from sklearn.base import clone + +from sklearn.neighbors import KNeighborsTransformer +from sklearn.neighbors import RadiusNeighborsTransformer + +from sklearn.cluster import DBSCAN +from sklearn.cluster import SpectralClustering +from sklearn.neighbors import KNeighborsRegressor +from sklearn.neighbors import RadiusNeighborsRegressor +from sklearn.neighbors import LocalOutlierFactor +from sklearn.manifold import SpectralEmbedding +from sklearn.manifold import Isomap +from sklearn.manifold import TSNE + + +def test_spectral_clustering(): + # Test chaining KNeighborsTransformer and SpectralClustering + n_neighbors = 5 + X, _ = make_blobs(random_state=0) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'), + SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed', + random_state=42)) + est_compact = SpectralClustering( + n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42) + labels_compact = est_compact.fit_predict(X) + labels_chain = est_chain.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_spectral_embedding(): + # Test chaining KNeighborsTransformer and SpectralEmbedding + n_neighbors = 5 + + n_samples = 1000 + centers = np.array([ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ]) + S, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'), + SpectralEmbedding(n_neighbors=n_neighbors, affinity='precomputed', + random_state=42)) + est_compact = SpectralEmbedding( + n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42) + St_compact = est_compact.fit_transform(S) + St_chain = est_chain.fit_transform(S) + assert_array_almost_equal(St_chain, St_compact) + + +def test_dbscan(): + # Test chaining RadiusNeighborsTransformer and DBSCAN + radius = 0.3 + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + + # compare the chained version and the compact version + est_chain = make_pipeline( + RadiusNeighborsTransformer(radius=radius, mode='distance'), + DBSCAN(metric='precomputed', eps=radius)) + est_compact = DBSCAN(eps=radius) + + labels_chain = est_chain.fit_predict(X) + labels_compact = est_compact.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_isomap(): + # Test chaining KNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = 'auto' + n_neighbors = 10 + + X, _ = make_blobs(random_state=0) + X2, _ = make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, algorithm=algorithm, + mode='distance'), + Isomap(n_neighbors=n_neighbors, metric='precomputed')) + est_compact = Isomap(n_neighbors=n_neighbors, + neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = 
est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_tsne(): + # Test chaining KNeighborsTransformer and TSNE + n_iter = 250 + perplexity = 5 + n_neighbors = int(3. * perplexity + 1) + + rng = np.random.RandomState(0) + X = rng.randn(20, 2) + + for metric in ['minkowski', 'sqeuclidean']: + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric), + TSNE(metric='precomputed', perplexity=perplexity, + method="barnes_hut", random_state=42, n_iter=n_iter)) + est_compact = TSNE(metric=metric, perplexity=perplexity, n_iter=n_iter, + method="barnes_hut", random_state=42) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_lof_novelty_false(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'), + LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors, + novelty=False, contamination="auto")) + est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False, + contamination="auto") + + pred_chain = est_chain.fit_predict(X) + pred_compact = est_compact.fit_predict(X) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_lof_novelty_true(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X1 = rng.randn(40, 2) + X2 = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'), + LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors, + novelty=True, contamination="auto")) + est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True, + contamination="auto") + + pred_chain = est_chain.fit(X1).predict(X2) + pred_compact = est_compact.fit(X1).predict(X2) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_kneighbors_regressor(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. 
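+    # (A k-neighbors graph fed to a radius-neighbors estimator must already
+    # contain every neighbor within ``radius``, and a radius-neighbors graph
+    # fed to a k-neighbors estimator must contain at least ``n_neighbors``
+    # entries per row; a factor of 2 is assumed to be large enough for this
+    # random data.)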
+ factor = 2 + + k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance') + k_trans_factor = KNeighborsTransformer(n_neighbors=int( + n_neighbors * factor), mode='distance') + + r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance') + r_trans_factor = RadiusNeighborsTransformer(radius=int( + radius * factor), mode='distance') + + k_reg = KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric='precomputed') + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index 806b6f7736472..4bd02ed0dbfd0 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -1,5 +1,4 @@ """Unsupervised nearest neighbors learner""" - from .base import NeighborsBase from .base import KNeighborsMixin from .base import RadiusNeighborsMixin @@ -40,30 +39,13 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, nature of the problem. metric : string or callable, default 'minkowski' - metric to use for distance computation. Any metric from scikit-learn - or scipy.spatial.distance can be used. - - If metric is a callable function, it is called on each - pair of instances (rows) and the resulting value recorded. The callable - should take two arrays as input and return one value indicating the - distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - Distance matrices are not supported. - - Valid values for metric are: - - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] - - - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', - 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', - 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', - 'yule'] - - See the documentation for scipy.spatial.distance for details on these - metrics. + the distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. See the documentation of the DistanceMetric class for a + list of available metrics. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`Glossary `, + in which case only "nonzero" elements may be considered neighbors. 
p : integer, optional (default = 2) Parameter for the Minkowski metric from diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ce862255f02eb..9114a8e5f7631 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -19,21 +19,18 @@ from sklearn.utils.testing import all_estimators from sklearn.utils.testing import ignore_warnings -from sklearn.exceptions import ConvergenceWarning, SkipTestWarning +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.estimator_checks import check_estimator import sklearn -from sklearn.base import RegressorMixin from sklearn.cluster.bicluster import BiclusterMixin -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.linear_model.base import LinearClassifierMixin from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.utils import IS_PYPY from sklearn.utils.testing import SkipTest from sklearn.utils.estimator_checks import ( - _safe_tags, _construct_instance, set_checking_parameters, _set_check_estimator_ids, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5a10bf6db88fe..c3a498c6f42f8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -681,10 +681,11 @@ def check_sample_weights_pandas_series(name, estimator_orig): try: import pandas as pd X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], - [2, 1], [2, 2], [2, 3], [2, 4]]) + [2, 1], [2, 2], [2, 3], [2, 4], + [3, 1], [3, 2], [3, 3], [3, 4]]) X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig)) - y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2]) - weights = pd.Series([1] * 8) + y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) + weights = pd.Series([1] * 12) if _safe_tags(estimator, "multioutput_only"): y = pd.DataFrame(y) try: @@ -705,14 +706,15 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), + n_samples = 30 + X = pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), estimator_orig) if _safe_tags(estimator, 'binary_only'): - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 y = _enforce_estimator_tags_y(estimator, y) - sample_weight = [3] * 10 + sample_weight = [3] * n_samples # Test that estimators don't raise any exception estimator.fit(X, y, sample_weight=sample_weight) @@ -940,6 +942,7 @@ def _apply_on_subsets(func, X): n_features = X.shape[1] result_by_batch = [func(batch.reshape(1, n_features)) for batch in X] + # func can output tuple (e.g. 
score_samples) if type(result_full) == tuple: result_full = result_full[0] @@ -948,6 +951,7 @@ def _apply_on_subsets(func, X): if sparse.issparse(result_full): result_full = result_full.A result_by_batch = [x.A for x in result_by_batch] + return np.ravel(result_full), np.ravel(result_by_batch) @@ -1234,12 +1238,13 @@ def check_fit_score_takes_y(name, estimator_orig): # check that all estimators accept an optional y # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) - X = rnd.uniform(size=(10, 3)) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) X = pairwise_estimator_convert_X(X, estimator_orig) if _safe_tags(estimator_orig, 'binary_only'): - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) @@ -1801,7 +1806,7 @@ def check_estimators_fit_returns_self(name, estimator_orig, n_centers = 2 else: n_centers = 3 - X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers) + X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers) # some want non-negative input X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig) @@ -1839,11 +1844,14 @@ def check_supervised_y_2d(name, estimator_orig): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) + n_samples = 30 + X = pairwise_estimator_convert_X( + rnd.uniform(size=(n_samples, 3)), estimator_orig + ) if tags['binary_only']: - y = np.arange(10) % 2 + y = np.arange(n_samples) % 2 else: - y = np.arange(10) % 3 + y = np.arange(n_samples) % 3 y = _enforce_estimator_tags_y(estimator_orig, y) estimator = clone(estimator_orig) set_random_state(estimator) @@ -2177,7 +2185,7 @@ def check_estimators_overwrite_params(name, estimator_orig): n_centers = 2 else: n_centers = 3 - X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers) + X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers) # some want non-negative input X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) @@ -2269,9 +2277,10 @@ def check_sparsify_coefficients(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_classifier_data_not_an_array(name, estimator_orig): - X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) + X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], + [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]) X = pairwise_estimator_convert_X(X, estimator_orig) - y = [1, 1, 1, 2, 2, 2] + y = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2] y = _enforce_estimator_tags_y(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) From fe052e6e6b8b3967de26cca3c5333211633013b6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 19 Sep 2019 09:13:00 -0400 Subject: [PATCH 34/53] set n_features_in_ for stacking estimators --- sklearn/ensemble/_stacking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index c2a09c54b4622..dbf51fdcfbcc7 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -182,6 +182,7 @@ def fit(self, X, y, sample_weight=None): delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop' ) + self.n_features_in_ = self.estimators_[0].n_features_in_ self.named_estimators_ = Bunch() 
est_fitted_idx = 0 From 9a205dd8faf09a953ed968c6774fb57bcf779a5e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 25 Sep 2019 14:04:41 -0400 Subject: [PATCH 35/53] dont hardcode attribute in init for sparsecoder --- sklearn/decomposition/dict_learning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 9ed705d680059..501b259422533 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -1024,7 +1024,6 @@ def __init__(self, dictionary, transform_algorithm='omp', transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter) self.components_ = dictionary - self.n_features_in_ = dictionary.shape[1] def fit(self, X, y=None): """Do nothing and return the estimator unchanged @@ -1045,6 +1044,10 @@ def fit(self, X, y=None): """ return self + @property + def n_features_in_(self): + return self.components_.shape[1] + class DictionaryLearning(SparseCodingMixin, BaseEstimator): """Dictionary learning From 0e81156e556150c569cfe54637b9cb8c80b18bfb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Jan 2020 11:18:03 -0500 Subject: [PATCH 36/53] more merge --- sklearn/feature_selection/_rfe.py | 8 ++------ sklearn/impute/_base.py | 4 ---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 1417fd04f9eac..d91665e252686 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -492,13 +492,9 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ -<<<<<<< HEAD:sklearn/feature_selection/rfe.py X, y = self._validate_X_y(X, y, accept_sparse="csr", - ensure_min_features=2) -======= - X, y = check_X_y(X, y, "csr", ensure_min_features=2, - force_all_finite=False) ->>>>>>> 19479d7af1711f1bb403eca1c02eebf212999091:sklearn/feature_selection/_rfe.py + ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index d202af7f1e9ae..437d0e8ab6d0b 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -411,12 +411,8 @@ def transform(self, X): """ check_is_fitted(self) -<<<<<<< HEAD X = self._validate_input(X, in_fit=False) -======= - X = self._validate_input(X) X_indicator = super()._transform_indicator(X) ->>>>>>> 19479d7af1711f1bb403eca1c02eebf212999091 statistics = self.statistics_ From d4d92bc1c230b4d0344f88847a32277de98e66bc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 13 Jan 2020 15:26:22 -0500 Subject: [PATCH 37/53] fixed some bugs --- sklearn/impute/_base.py | 1 - sklearn/naive_bayes.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 437d0e8ab6d0b..038780ef118a6 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -272,7 +272,6 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) X = self._validate_input(X, in_fit=True) super()._fit_indicator(X) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 38117b4182dfa..a73d786c2eb7c 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -473,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return 
check_X_y(X, y, accept_sparse='csr') + return self._validate_X_y(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -607,7 +607,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._check_X_y(X, y) _, n_features = X.shape self.n_features_ = n_features @@ -1154,8 +1154,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = check_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_X_y(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y From 40cd141ad8c43ebc9bbea6787be7415dc415c3b9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 10:04:40 -0500 Subject: [PATCH 38/53] fixed more bugs --- sklearn/base.py | 6 ++++-- sklearn/linear_model/_ridge.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 829b5b5cfac95..19f74c4b45477 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -355,12 +355,14 @@ def _validate_n_features(self, X, check_n_features): def _validate_X(self, X, check_n_features=False, **check_array_params): X = check_array(X, **check_array_params) - self._validate_n_features(X, check_n_features) + if check_array_params.get('ensure_2d', True): + self._validate_n_features(X, check_n_features) return X def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): X, y = check_X_y(X, y, **check_X_y_params) - self._validate_n_features(X, check_n_features) + if check_X_y_params.get('ensure_2d', True): + self._validate_n_features(X, check_n_features) return X, y class ClassifierMixin: diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index b1ee3d94b929a..577e171aa3cf3 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -918,8 +918,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - self._validate_X_y(X, y, accept_sparse=_accept_sparse, - multi_output=True, y_numeric=False) + X, y = self._validate_X_y(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1879,8 +1879,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) From b3251fef069b70a9216a6c382e38d3ffd15a394e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 11:07:17 -0500 Subject: [PATCH 39/53] fixed warnings --- sklearn/tests/test_dummy.py | 1 + sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index f814cc47474b2..38abb0b158fd3 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -756,6 +756,7 @@ def test_dtype_of_classifier_probas(strategy): assert probas.dtype == np.float64 +@pytest.mark.filterwarnings("ignore:The default value of strategy.*") # 0.24 
@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier)) def test_n_features_in_(Dummy): X = [[1, 2]] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 095328c82f219..d5775853205ab 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2856,7 +2856,7 @@ def check_n_features_in(name, estimator_orig): n_samples = 100 X = rng.normal(loc=100, size=(n_samples, 2)) - X = pairwise_estimator_convert_X(X, estimator) + X = _pairwise_estimator_convert_X(X, estimator) if is_regressor(estimator_orig): y = rng.normal(size=n_samples) else: From 7e73a244d1d8ca49f70c2ee012ee195c28f3a39c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 13:10:17 -0500 Subject: [PATCH 40/53] use _validate_data() method --- sklearn/base.py | 40 ++++++----- sklearn/calibration.py | 4 +- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_agglomerative.py | 6 +- sklearn/cluster/_bicluster.py | 2 +- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_dbscan.py | 2 +- sklearn/cluster/_kmeans.py | 10 +-- sklearn/cluster/_mean_shift.py | 2 +- sklearn/cluster/_optics.py | 2 +- sklearn/cluster/_spectral.py | 4 +- sklearn/compose/_column_transformer.py | 4 +- sklearn/covariance/_empirical_covariance.py | 2 +- sklearn/covariance/_graph_lasso.py | 6 +- sklearn/covariance/_robust_covariance.py | 2 +- sklearn/covariance/_shrunk_covariance.py | 6 +- sklearn/cross_decomposition/_pls.py | 8 +-- sklearn/decomposition/_dict_learning.py | 4 +- sklearn/decomposition/_factor_analysis.py | 2 +- sklearn/decomposition/_fastica.py | 4 +- sklearn/decomposition/_incremental_pca.py | 4 +- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/decomposition/_lda.py | 24 +++---- sklearn/decomposition/_nmf.py | 2 +- sklearn/decomposition/_pca.py | 4 +- sklearn/decomposition/_sparse_pca.py | 4 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/discriminant_analysis.py | 6 +- sklearn/ensemble/_bagging.py | 2 +- sklearn/ensemble/_forest.py | 2 +- sklearn/ensemble/_gb.py | 4 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_weight_boosting.py | 34 ++++----- sklearn/feature_selection/_rfe.py | 8 +-- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 5 +- sklearn/gaussian_process/_gpc.py | 8 +-- sklearn/gaussian_process/_gpr.py | 8 +-- sklearn/impute/_base.py | 16 ++--- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 6 +- sklearn/kernel_approximation.py | 8 +-- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 4 +- sklearn/linear_model/_bayes.py | 6 +- sklearn/linear_model/_coordinate_descent.py | 24 +++---- sklearn/linear_model/_huber.py | 2 +- sklearn/linear_model/_least_angle.py | 6 +- sklearn/linear_model/_logistic.py | 12 ++-- sklearn/linear_model/_omp.py | 6 +- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_ridge.py | 22 +++--- sklearn/linear_model/_stochastic_gradient.py | 12 ++-- sklearn/linear_model/_theil_sen.py | 2 +- sklearn/manifold/_locally_linear.py | 2 +- sklearn/manifold/_mds.py | 2 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 10 +-- sklearn/mixture/_base.py | 2 +- sklearn/multiclass.py | 4 +- sklearn/multioutput.py | 4 +- sklearn/naive_bayes.py | 8 +-- sklearn/neighbors/_base.py | 2 +- sklearn/neighbors/_kde.py | 2 +- sklearn/neighbors/_nca.py | 2 +- sklearn/neighbors/_nearest_centroid.py | 4 +- .../neural_network/_multilayer_perceptron.py | 8 +-- sklearn/neural_network/_rbm.py | 2 +- sklearn/preprocessing/_data.py | 70 
+++++++++---------- sklearn/preprocessing/_discretization.py | 2 +- .../preprocessing/_function_transformer.py | 2 +- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/_label_propagation.py | 2 +- sklearn/svm/_base.py | 6 +- sklearn/svm/_classes.py | 12 ++-- sklearn/tree/_classes.py | 2 +- sklearn/utils/tests/test_estimator_checks.py | 32 ++++----- 77 files changed, 281 insertions(+), 284 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 19f74c4b45477..c3bb97c5d7a61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -337,33 +337,37 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags - def _validate_n_features(self, X, check_n_features): - if check_n_features: + def _validate_n_features(self, X, reset): + n_features = X.shape[1] + + if reset: + self.n_features_in_ = n_features + else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "check_n_features is True but there is no n_features_in_ " + "reset parameter is False but there is no n_features_in_ " "attribute." ) - if X.shape[1] != self.n_features_in_: + if n_features != self.n_features_in_: raise ValueError( 'X has {} features, but this {} is expecting {} features ' - 'as input.'.format(X.shape[1], self.__class__.__name__, + 'as input.'.format(n_features, self.__class__.__name__, self.n_features_in_) ) + + def _validate_data(self, X, y=None, reset=True, **check_params): + if y is None: + X = check_array(X, **check_params) + out = X else: - self.n_features_in_ = X.shape[1] - - def _validate_X(self, X, check_n_features=False, **check_array_params): - X = check_array(X, **check_array_params) - if check_array_params.get('ensure_2d', True): - self._validate_n_features(X, check_n_features) - return X - - def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params): - X, y = check_X_y(X, y, **check_X_y_params) - if check_X_y_params.get('ensure_2d', True): - self._validate_n_features(X, check_n_features) - return X, y + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get('ensure_2d', True): + self._validate_n_features(X, reset=reset) + + return out + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index e06f4217122c9..6bb68122aaa57 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -131,8 +131,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 4360bef52c8db..aa06fb30b1669 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -374,7 +374,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = self._validate_X(X, accept_sparse=accept_sparse) + X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 9bb31f85a36c4..b29b1078333cc 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -805,7 +805,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, ensure_min_samples=2, estimator=self) + X = self._validate_data(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: @@ -1051,8 +1051,8 @@ def fit(self, X, y=None, **params): ------- self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) n_features_in_ = self.n_features_in_ AgglomerativeClustering.fit(self, X.T, **params) # Need to restore n_features_in_ attribute that was overridden in diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c7d04707d1571..37eca3c7ec4c2 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -115,7 +115,7 @@ def fit(self, X, y=None): y : Ignored """ - X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 153f793277e9c..d1cf13662d26a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -459,7 +459,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = self._validate_X(X, accept_sparse='csr', copy=self.copy) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index bddf63a889efd..c258ce3d5f406 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -307,7 +307,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') if not self.eps > 0.0: raise ValueError("eps must be positive.") diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 57b67c162a018..6d8decfe1fd05 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -854,9 +854,9 @@ def fit(self, X, y=None, sample_weight=None): # avoid forcing order when copy_x=False order = "C" if self.copy_x else None - X = self._validate_X(X, accept_sparse='csr', - dtype=[np.float64, np.float32], - order=order, copy=self.copy_x) + X = self._validate_data(X, 
accept_sparse='csr', + dtype=[np.float64, np.float32], + order=order, copy=self.copy_x) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( @@ -1505,8 +1505,8 @@ def fit(self, X, y=None, sample_weight=None): self """ random_state = check_random_state(self.random_state) - X = self._validate_X(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 96f83ed9ef086..3d0dc6304dd5a 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -367,7 +367,7 @@ def fit(self, X, y=None): y : Ignored """ - X = self._validate_X(X) + X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index e6d1f09b77ed9..d28941c7d8401 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -244,7 +244,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = self._validate_X(X, dtype=np.float) + X = self._validate_data(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 1d54a84b93d64..e76822d6b3732 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -474,8 +474,8 @@ def fit(self, X, y=None): self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) allow_squared = self.affinity in ["precomputed", "precomputed_nearest_neighbors"] if X.shape[0] == X.shape[1] and not allow_squared: diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 42e65c61623bb..8e87cc937ce7a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X, check_n_features=False) + self._validate_n_features(X) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -586,7 +586,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') - # TODO: also call _validate_n_features(check_n_features=True) in 0.24 + # TODO: also call _validate_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 02c48fc5824ae..9da2be98335de 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -191,7 +191,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_graph_lasso.py 
b/sklearn/covariance/_graph_lasso.py index 91d29e0bc43b7..26e5408048c65 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -377,8 +377,8 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = self._validate_X(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -644,7 +644,7 @@ def fit(self, X, y=None): y : (ignored) """ # Covariance does not make sense for a single feature - X = self._validate_X(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 46711d3eb2afc..2081874b03110 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -636,7 +636,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 9240b1d81716e..d86474b40ec89 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -143,7 +143,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -419,7 +419,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = self._validate_X(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -572,7 +572,7 @@ def fit(self, X, y=None): self : object """ - X = self._validate_X(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index af2a5218002d4..af81ece6baf58 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -277,8 +277,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = self._validate_X(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -886,8 +886,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = self._validate_X(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/_dict_learning.py 
b/sklearn/decomposition/_dict_learning.py index 3582dd23ac8dc..49b78a0916e7a 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1221,7 +1221,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1428,7 +1428,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 15ce4dc31af9b..7147fd452559c 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -168,7 +168,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, copy=self.copy, dtype=np.float64) + X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 1558827b3db06..ef9f376bba66d 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -427,8 +427,8 @@ def _fit(self, X, compute_sources=False): # This validates twice but there is not clean way to avoid validation # in fastica(). Please see issue 14897. - self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T + self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index e62dc0189d55f..2a0d19d373dbb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -194,8 +194,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 3a2ea007e0307..b1f83c8e0ff81 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -275,7 +275,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = self._validate_X(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 08fe12154e8bb..201b393374a08 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -467,7 +467,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, check_n_features, whom): + def _check_non_neg_array(self, X, reset_n_features, whom): """check X format check X format and make sure no negative value in X. 
@@ -477,8 +477,8 @@ def _check_non_neg_array(self, X, check_n_features, whom): X : array-like or sparse matrix """ - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csr') + X = self._validate_data(X, reset=reset_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -498,13 +498,13 @@ def partial_fit(self, X, y=None): """ self._check_params() first_time = not hasattr(self, 'components_') + # deactivating check for now (specific tests about error message would # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. - # check_n_features = not in_fit - check_n_features = False - X = self._check_non_neg_array(X, check_n_features, + # TODO: uncomment when addressing reset in predict/transform/etc. + # reset = first_time + reset_n_features = True + X = self._check_non_neg_array(X, reset_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size @@ -548,7 +548,7 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, check_n_features=False, + X = self._check_non_neg_array(X, reset_n_features=True, whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter @@ -619,7 +619,7 @@ def _unnormalized_transform(self, X): # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array( - X, check_n_features=False, + X, reset_n_features=True, whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: @@ -744,7 +744,7 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, check_n_features=False, + X = self._check_non_neg_array(X, reset_n_features=True, whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) @@ -775,7 +775,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, check_is_fitted(self) X = self._check_non_neg_array( - X, check_n_features=False, + X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index c4edfcb15040a..1ab996b8c8059 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1268,7 +1268,7 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data. """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=float) + X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=float) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 0fb7f41a58a67..178d83f4aeb0d 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -386,8 +386,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. 
See ' 'TruncatedSVD for a possible alternative.') - X = self._validate_X(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_2d=True, copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index f5395012f9e08..4cdbe21bae0c7 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -165,7 +165,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -363,7 +363,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = self._validate_X(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 9b6c71e05d29b..940eab56feea8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -157,8 +157,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = self._validate_X(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 14610087c1d37..7e30233b0e131 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -423,8 +423,8 @@ def fit(self, X, y): y : array, shape (n_samples,) Target values. 
""" - X, y = self._validate_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_data(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -645,7 +645,7 @@ def fit(self, X, y): y : array, shape = [n_samples] Target values (integers) """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8dee9b8af33db..000e5a054627f 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -278,7 +278,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, multi_output=True ) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index da744ad4cea70..9d1cfee7e7266 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -293,7 +293,7 @@ def fit(self, X, y, sample_weight=None): self : object """ # Validate or convert input data - X = self._validate_X(X, accept_sparse="csc", dtype=DTYPE) + X = self._validate_data(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c9304ad398396..f6cc644c57d7b 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -411,8 +411,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=DTYPE) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d4665a1495881..5ab33e52df73d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -101,8 +101,8 @@ def fit(self, X, y): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
- X, y = self._validate_X_y(X, y, dtype=[X_DTYPE], - force_all_finite=False) + X, y = self._validate_data(X, y, dtype=[X_DTYPE], + force_all_finite=False) y = self._encode_y(y) rng = check_random_state(self.random_state) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 9e2aaf8fb7925..4716309280699 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -71,7 +71,7 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X): + def _check_X(self, X): return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None) @@ -100,12 +100,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_X_y(X, y, - accept_sparse=['csr', 'csc'], - ensure_2d=True, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) + X, y = self._validate_data(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) sample_weight = _check_sample_weight(sample_weight, X, np.float64) sample_weight /= sample_weight.sum() @@ -216,7 +216,7 @@ def staged_score(self, X, y, sample_weight=None): ------ z : float """ - X = self._validate_data(X) + X = self._check_X(X) for y_pred in self.staged_predict(X): if is_classifier(self): @@ -611,7 +611,7 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) pred = self.decision_function(X) @@ -641,7 +641,7 @@ def staged_predict(self, X): y : generator of ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_ @@ -675,7 +675,7 @@ def decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -718,7 +718,7 @@ def staged_decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -787,7 +787,7 @@ def predict_proba(self, X): outputs is the same of that of the :term:`classes_` attribute. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -821,7 +821,7 @@ def staged_predict_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -847,7 +847,7 @@ def predict_log_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) return np.log(self.predict_proba(X)) @@ -1115,7 +1115,7 @@ def predict(self, X): The predicted regression values. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) return self._get_median_predict(X, len(self.estimators_)) @@ -1140,7 +1140,7 @@ def staged_predict(self, X): The predicted regression values. 
""" check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) for i, _ in enumerate(self.estimators_, 1): yield self._get_median_predict(X, limit=i) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index d91665e252686..6e6800ee2a6a7 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -155,7 +155,7 @@ def _fit(self, X, y, step_score=None): # self.scores_ will not be calculated when calling _fit through fit tags = self._get_tags() - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse="csc", ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True) @@ -492,9 +492,9 @@ def fit(self, X, y, groups=None): train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ - X, y = self._validate_X_y(X, y, accept_sparse="csr", - ensure_min_features=2, - force_all_finite=False) + X, y = self._validate_data(X, y, accept_sparse="csr", + ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 5acf44d8e0407..221e46f2a505e 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -338,8 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index a160ef634be7c..6438e6b80dc0a 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -65,8 +65,9 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), dtype=np.float64, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=np.float64, + force_all_finite='allow-nan') if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4c0c63923e5fc..d34aa0962472b 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -625,11 +625,11 @@ def fit(self, X, y): self : returns an instance of self. 
""" if self.kernel is None or self.kernel.requires_vector_input: - X, y = self._validate_X_y(X, y, multi_output=False, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=True, dtype="numeric") else: - X, y = self._validate_X_y(X, y, multi_output=False, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 522f233213015..cc3fbb2f08d56 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -187,11 +187,11 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) if self.kernel_.requires_vector_input: - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=True, dtype="numeric") else: - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=False, dtype=None) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 038780ef118a6..bc98778d5c5d8 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -235,11 +235,10 @@ def _validate_input(self, X, in_fit): force_all_finite = "allow-nan" try: - check_n_features = not in_fit - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, - copy=self.copy) + X = self._validate_data(X, reset=in_fit, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): new_ve = ValueError("Cannot use {} strategy with non-numeric " @@ -595,10 +594,9 @@ def _validate_input(self, X, in_fit): force_all_finite = True else: force_all_finite = "allow-nan" - check_n_features = not in_fit - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + X = self._validate_data(X, reset=in_fit, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 6463327836253..2e2fdc3f5e2de 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -503,8 +503,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = self._validate_X(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index b263416dd40aa..ea2fe35899304 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -178,9 +178,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. 
Got {}".format(self.n_neighbors)) - X = self._validate_X(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, - copy=self.copy) + X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) super()._fit_indicator(X) _check_weights(self.weights) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index ee572d8842e21..b29b56bbc38af 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -91,7 +91,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -197,7 +197,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = self._validate_X(X) + X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -324,7 +324,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') if self.sample_interval is None: # See reference, figure 2 c) if self.sample_steps == 1: @@ -542,7 +542,7 @@ def fit(self, X, y=None): X : array-like of shape (n_samples, n_features) Training data. """ - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index f647923867eab..c504b2cb31cb3 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -148,8 +148,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. """ # Convert data - X, y = self._validate_X_y(X, y, accept_sparse=("csr", "csc"), - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index be41ec6a4bb3a..8dd5d5bdbf983 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -488,8 +488,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 7b3cd5c3f3751..c67fc54f43157 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -190,7 +190,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -526,8 +526,8 @@ def fit(self, X, y): ------- self : returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b2d2434fe68a1..43c027be1af69 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -695,11 +695,11 @@ def fit(self, X, y, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = self._validate_X_y(X, y, accept_sparse='csc', - order='F', - dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, - y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1111,8 +1111,8 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = self._validate_X(X, accept_sparse='csc', - dtype=[np.float64, np.float32], copy=False) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1123,9 +1123,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = self._validate_X(X, accept_sparse='csc', - dtype=[np.float64, np.float32], order='F', - copy=copy_X) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1743,8 +1743,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - X = self._validate_X(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 7ab9b14168af9..1d3a3fcc73421 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 1858e1dbd6675..9f0f62471376a 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -944,7 +944,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1367,7 +1367,7 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1748,7 +1748,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e215c28d2a615..9e84e56ee0284 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1339,9 +1339,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=_dtype, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) @@ -1813,9 +1813,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = self._validate_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 7c16dcd243fdc..54b751423c933 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = self._validate_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0a746d9e49e07..cd5e3db49842d 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -246,7 +246,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. 
""" - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 577e171aa3cf3..6c93d413752d1 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -536,10 +536,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -918,8 +918,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_X_y(X, y, accept_sparse=_accept_sparse, - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1447,9 +1447,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1879,8 +1879,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 69fa4c2f01052..d50a0997fbb56 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -521,9 +521,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1096,9 +1096,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = self._validate_X_y(X, y, accept_sparse="csr", copy=False, - order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git 
a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index bbe1b90e37af0..a29cc26cdc0a3 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -357,7 +357,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = self._validate_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7b99fde631c97..6b371bd5821ac 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -656,7 +656,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = self._validate_X(X, dtype=float) + X = self._validate_data(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 0ddf8dda7f31c..674c8e1527602 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -414,7 +414,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = self._validate_X(X) + X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 1052aeec9c955..c40ea7e1689b1 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -535,8 +535,8 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = self._validate_X(X, accept_sparse='csr', ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index be0e2df599cbf..92cb154292327 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -662,12 +662,12 @@ def _fit(self, X, skip_num_points=0): if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.method == 'barnes_hut': - X = self._validate_X(X, accept_sparse=['csr'], - ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 56f3649f2b11c..e96978f9018f2 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -217,7 +217,7 @@ def fit_predict(self, X, y=None): Component labels. 
""" X = _check_X(X, self.n_components, ensure_min_samples=2) - self._validate_n_features(X, check_n_features=False) + self._validate_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 041f5a90c48c6..9eeb4248f83fd 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -535,7 +535,7 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -776,7 +776,7 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 33608db22213f..1c2eecfb76d5e 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -152,7 +152,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): raise ValueError("The base estimator should implement" " a fit method") - X, y = self._validate_X_y(X, y, multi_output=True, accept_sparse=True) + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -416,7 +416,7 @@ def fit(self, X, Y): ------- self : object """ - X, Y = self._validate_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index a73d786c2eb7c..8a2fefe7c08a0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -203,7 +203,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -473,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return self._validate_X_y(X, y, accept_sparse='csr') + return self._validate_data(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -1154,8 +1154,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = self._validate_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c5e6c08983348..76afc6e5dcb68 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -397,7 +397,7 @@ def _fit(self, X): X = _check_precomputed(X) self.n_features_in_ = X.shape[1] else: - X = self._validate_X(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index ec0b238b1a4a7..6a48ee6e60d27 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -152,7 +152,7 @@ def fit(self, X, y=None, sample_weight=None): Returns instance of object. 
""" algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = self._validate_X(X, order='C', dtype=DTYPE) + X = self._validate_data(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DTYPE) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 9a0d25d332f81..278e36ffd13db 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -298,7 +298,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. - X, y = self._validate_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index c5b41db2c895f..48712c1fcfb44 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = self._validate_X_y(X, y, accept_sparse=['csc']) + X, y = self._validate_data(X, y, accept_sparse=['csc']) else: - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index cd6b4b44a6b82..038b9c31678a7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -942,8 +942,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1350,8 +1350,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index b6c27de17000e..711dee806c138 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -336,7 +336,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = self._validate_X(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 5aecf5f879063..b1ebeadba3530 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -369,10 +369,9 @@ def partial_fit(self, X, y=None): "Consider using MaxAbsScaler instead.") first_pass = not hasattr(self, 'n_samples_seen_') - check_n_features = not first_pass - X = self._validate_X(X, check_n_features=check_n_features, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + X = self._validate_data(X, reset=first_pass, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) @@ -695,9 +694,9 @@ def partial_fit(self, X, y=None): self : object Transformer instance. """ - X = self._validate_X(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -790,10 +789,10 @@ def transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy - X = self._validate_X(X, check_n_features=True, - accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, reset=False, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: @@ -967,10 +966,10 @@ def partial_fit(self, X, y=None): Transformer instance. 
""" first_pass = not hasattr(self, 'n_samples_seen_') - check_n_features = not first_pass - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse=('csr', 'csc'), estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, reset=first_pass, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -1197,8 +1196,9 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = self._validate_X(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1506,7 +1506,8 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = self._validate_X(X, accept_sparse=True).shape + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1812,7 +1813,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1946,7 +1947,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - self._validate_X(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -2026,7 +2027,7 @@ def fit(self, K, y=None): self : returns an instance of self. """ - K = self._validate_X(K, dtype=FLOAT_DTYPES) + K = self._validate_data(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2444,14 +2445,14 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, """Check inputs before fit and transform""" # deactivating check for now (specific tests about error message would # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. - # check_n_features = not in_fit - check_n_features = False - - X = self._validate_X(X, check_n_features=check_n_features, - accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + # TODO: uncomment when addressing reset in predict/transform/etc. + # reset = in_fit + reset = True + + X = self._validate_data(X, reset=reset, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -3004,15 +3005,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. """ - # deactivating check for now (specific tests about error message would - # break) - # TODO: uncomment when addressing check_n_features in - # predict/transform/etc. 
- # check_n_features = not in_fit - check_n_features = False - X = self._validate_X(X, check_n_features=check_n_features, - ensure_2d=True, dtype=FLOAT_DTYPES, - copy=self.copy, force_all_finite='allow-nan') + X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a4dc703e9f3cb..67641601e06f5 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -137,7 +137,7 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_X(X, dtype='numeric') + X = self._validate_data(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 211bad665778c..85ce3a1f845c1 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -92,7 +92,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return self._validate_X(X, accept_sparse=self.accept_sparse) + return self._validate_data(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 508ad5f5a76fa..5ad26c2dd4c90 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -356,7 +356,7 @@ def fit(self, X, y=None): self """ - X = self._validate_X(X, accept_sparse=['csr', 'csc']) + X = self._validate_data(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index e1bb3b3436896..a84a9950aa3ac 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -223,7 +223,7 @@ def fit(self, X, y): ------- self : object """ - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index fb0ca341481b9..061923f68f6cd 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -143,9 +143,9 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = self._validate_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) sample_weight = np.asarray([] diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index c795f4cd6d099..b7e1881e9331f 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -218,9 +218,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -398,9 +398,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be 
positive; got (C=%r)" % self.C) - X, y = self._validate_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) penalty = 'l2' # SVR only accepts l2 penalty self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( X, y, self.C, self.fit_intercept, self.intercept_scaling, diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 38ea318e3796b..6ca52e0901742 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -146,7 +146,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, raise ValueError("ccp_alpha must be greater than or equal to 0") if check_input: - X = self._validate_X(X, dtype=DTYPE, accept_sparse="csc") + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 06ed9b2b18f63..70c51c0069f7b 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -58,7 +58,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self def predict(self, X): @@ -73,7 +73,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -83,14 +83,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -107,7 +107,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -124,7 +124,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -143,19 +143,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -167,7 +167,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = self._validate_X_y(X, y) + X, y = self._validate_data(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -180,7 +180,7 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, @@ -221,7 +221,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def 
fit(self, X, y=None): - X = self._validate_X(X) + X = self._validate_data(X) return self def transform(self, X): @@ -232,7 +232,7 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, @@ -249,7 +249,7 @@ def predict(self, X): class LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = self._validate_X_y( + X, y = self._validate_data( X, y, accept_sparse=("csr", "csc", "coo"), accept_large_sparse=True, @@ -270,7 +270,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = self._validate_X(X).shape + self.X_shape_ = self._validate_data(X).shape return self def fit_transform(self, X, y=None): @@ -301,7 +301,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = self._validate_X_y(X, y, multi_output=True) + X, y = self._validate_data(X, y, multi_output=True) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y) From 2f448aa6b1229fc70bf8402dc9e35e9d30f58b93 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 13:49:55 -0500 Subject: [PATCH 41/53] fixed columntransformer issue --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 8e87cc937ce7a..ffddd316cbf7f 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X) + self._validate_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) From a6a344d42c5e28647b739f19213d86cbc1cd1b50 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 15:08:13 -0500 Subject: [PATCH 42/53] comments --- sklearn/base.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index c3bb97c5d7a61..51f0181979c63 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -338,6 +338,18 @@ def _get_tags(self): return collected_tags def _validate_n_features(self, X, reset): + """Set the n_features_in_ attribute, or check against it. + + Parameters + ---------- + + X : ndarray or sparse matrix + The input samples + reset : bool + If True, the n_features_in_ attribute is set to X.shape[1]. Else, + the attribute must already exist and the function checks that it is + equal to X.shape[1]. + """ n_features = X.shape[1] if reset: @@ -356,6 +368,28 @@ def _validate_n_features(self, X, reset): ) def _validate_data(self, X, y=None, reset=True, **check_params): + """Validate input data and set or check the n_features_in_ attribute. + + Parameters + ---------- + + X : array-like + The input samples. + y : array-like or None, default=None + The targets. If None, check_array is called on X and check_X_y is + called otherwise. + reset : bool, default=True + Whether to reset the n_features_in_ attribute. See + _validate_n_features(). + **check_params : kwargs + Parameters passed to check_array() or check_X_y(). + + Returns + ------- + out : {ndarray, sparse matrix} or tuple of these + The validated input. A tuple is returned if y is not None. 
+ """ + if y is None: X = check_array(X, **check_params) out = X From 9e0c3d7d4fd3d62c4a7357da4517c59f63bdfa60 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 14 Jan 2020 15:15:35 -0500 Subject: [PATCH 43/53] minor renaming --- sklearn/base.py | 6 +++--- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/mixture/_base.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 51f0181979c63..2918fc75fa745 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -337,7 +337,7 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags - def _validate_n_features(self, X, reset): + def _check_n_features(self, X, reset): """Set the n_features_in_ attribute, or check against it. Parameters @@ -380,7 +380,7 @@ def _validate_data(self, X, y=None, reset=True, **check_params): called otherwise. reset : bool, default=True Whether to reset the n_features_in_ attribute. See - _validate_n_features(). + _check_n_features(). **check_params : kwargs Parameters passed to check_array() or check_X_y(). @@ -398,7 +398,7 @@ def _validate_data(self, X, y=None, reset=True, **check_params): out = X, y if check_params.get('ensure_2d', True): - self._validate_n_features(X, reset=reset) + self._check_n_features(X, reset=reset) return out diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index ffddd316cbf7f..5b360bb94aeae 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -512,7 +512,7 @@ def fit_transform(self, X, y=None): self._feature_names_in = None X = _check_X(X) # set n_features_in_ attribute - self._validate_n_features(X, reset=True) + self._check_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -586,7 +586,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') - # TODO: also call _validate_n_features(reset=False) in 0.24 + # TODO: also call _check_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index e96978f9018f2..b8877da2a7c1c 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -217,7 +217,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) - self._validate_n_features(X, reset=True) + self._check_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation From d7963e7f8ed6372db01f7cd416491e70241cea74 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 Jan 2020 10:21:21 -0500 Subject: [PATCH 44/53] Apply suggestions from code review Co-Authored-By: Guillaume Lemaitre --- sklearn/base.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 2918fc75fa745..9b08b36ecd7c0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -338,17 +338,16 @@ def _get_tags(self): return collected_tags def _check_n_features(self, X, reset): - """Set the n_features_in_ attribute, or check against it. + """Set the `n_features_in_` attribute, or check against it. Parameters ---------- - - X : ndarray or sparse matrix - The input samples + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. 
reset : bool - If True, the n_features_in_ attribute is set to X.shape[1]. Else, + If True, the `n_features_in_` attribute is set to `X.shape[1]`. Else, the attribute must already exist and the function checks that it is - equal to X.shape[1]. + equal to `X.shape[1]`. """ n_features = X.shape[1] @@ -357,7 +356,7 @@ def _check_n_features(self, X, reset): else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "reset parameter is False but there is no n_features_in_ " + "The reset parameter is False but there is no n_features_in_ " "attribute." ) if n_features != self.n_features_in_: @@ -368,26 +367,25 @@ def _check_n_features(self, X, reset): ) def _validate_data(self, X, y=None, reset=True, **check_params): - """Validate input data and set or check the n_features_in_ attribute. + """Validate input data and set or check the `n_features_in_` attribute. Parameters ---------- - - X : array-like + X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features) The input samples. - y : array-like or None, default=None - The targets. If None, check_array is called on X and check_X_y is + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and `check_X_y` is called otherwise. reset : bool, default=True - Whether to reset the n_features_in_ attribute. See - _check_n_features(). + Whether to reset the `n_features_in_` attribute. See + :func:`_check_n_features`. **check_params : kwargs - Parameters passed to check_array() or check_X_y(). + Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. Returns ------- out : {ndarray, sparse matrix} or tuple of these - The validated input. A tuple is returned if y is not None. + The validated input. A tuple is returned if `y` is not None. """ if y is None: From d6f0451bd308306700f3960cfa654770a78e7418 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 31 Jan 2020 10:48:58 -0500 Subject: [PATCH 45/53] addressed most comments --- sklearn/base.py | 20 ++++++++++--------- sklearn/cluster/_agglomerative.py | 4 ++-- sklearn/decomposition/_fastica.py | 11 ++-------- sklearn/ensemble/_forest.py | 4 ++-- .../tests/test_dict_vectorizer.py | 3 +-- sklearn/pipeline.py | 1 + 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 5bbe0a215192d..9d97539687179 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -352,9 +352,9 @@ def _check_n_features(self, X, reset): X : {ndarray, sparse matrix} of shape (n_samples, n_features) The input samples. reset : bool - If True, the `n_features_in_` attribute is set to `X.shape[1]`. Else, - the attribute must already exist and the function checks that it is - equal to `X.shape[1]`. + If True, the `n_features_in_` attribute is set to `X.shape[1]`. + Else, the attribute must already exist and the function checks + that it is equal to `X.shape[1]`. """ n_features = X.shape[1] @@ -363,8 +363,8 @@ def _check_n_features(self, X, reset): else: if not hasattr(self, 'n_features_in_'): raise RuntimeError( - "The reset parameter is False but there is no n_features_in_ " - "attribute." + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" 
) if n_features != self.n_features_in_: raise ValueError( @@ -378,16 +378,18 @@ def _validate_data(self, X, y=None, reset=True, **check_params): Parameters ---------- - X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features) + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) The input samples. y : array-like of shape (n_samples,), default=None - The targets. If None, `check_array` is called on `X` and `check_X_y` is - called otherwise. + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. reset : bool, default=True Whether to reset the `n_features_in_` attribute. See :func:`_check_n_features`. **check_params : kwargs - Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. Returns ------- diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index b29b1078333cc..8d21e69c32e7f 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -1053,10 +1053,10 @@ def fit(self, X, y=None, **params): """ X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], ensure_min_features=2, estimator=self) + # save n_features_in_ attribute here to reset it after, because it will + # be overridden in AgglomerativeClustering since we passed it X.T. n_features_in_ = self.n_features_in_ AgglomerativeClustering.fit(self, X.T, **params) - # Need to restore n_features_in_ attribute that was overridden in - # AgglomerativeClustering since we passed it X.T. self.n_features_in_ = n_features_in_ return self diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index ef9f376bba66d..f9e3a148f6860 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -425,18 +425,11 @@ def _fit(self, X, compute_sources=False): X_new : array-like, shape (n_samples, n_components) """ - # This validates twice but there is not clean way to avoid validation - # in fastica(). Please see issue 14897. 
-        self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                            ensure_min_samples=2).T
+        X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
+                                ensure_min_samples=2).T
         fun_args = {} if self.fun_args is None else self.fun_args
         random_state = check_random_state(self.random_state)
 
-        # make interface compatible with other decompositions
-        # a copy is required only for non whitened data
-        X = check_array(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                        ensure_min_samples=2).T
-
         alpha = fun_args.get('alpha', 1.0)
         if not 1 <= alpha <= 2:
             raise ValueError('alpha must be in [1,2]')
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index 9d1cfee7e7266..3bb6b14aaa7f1 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -293,8 +293,8 @@ def fit(self, X, y, sample_weight=None):
         self : object
         """
         # Validate or convert input data
-        X = self._validate_data(X, accept_sparse="csc", dtype=DTYPE)
-        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
+        X, y = self._validate_data(X, y, multi_output=True,
+                                   accept_sparse="csc", dtype=DTYPE)
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
         if issparse(X):
diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
index a65feb2d7590b..22a7402908cf1 100644
--- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py
+++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -113,8 +113,7 @@ def test_deterministic_vocabulary():
 
 
 def test_n_features_in():
-    # For vectorizers, n_features_in_ does not make sense and it is always
-    # None
+    # For vectorizers, n_features_in_ does not make sense and does not exist.
    dv = DictVectorizer()
     assert not hasattr(dv, 'n_features_in_')
     d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 1ad9dda276427..0a914c1cff9af 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -629,6 +629,7 @@ def _pairwise(self):
 
     @property
     def n_features_in_(self):
+        # delegate to first step (which will call _check_is_fitted)
         return self.steps[0][1].n_features_in_
 
 

From b917d72dd81d2002c0b19e3ff0e391440e5ada26 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 31 Jan 2020 10:57:06 -0500
Subject: [PATCH 46/53] Better comments

---
 sklearn/decomposition/_lda.py  | 11 +++++++----
 sklearn/preprocessing/_data.py | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py
index 201b393374a08..ba68e03a16191 100644
--- a/sklearn/decomposition/_lda.py
+++ b/sklearn/decomposition/_lda.py
@@ -499,10 +499,13 @@ def partial_fit(self, X, y=None):
         self._check_params()
         first_time = not hasattr(self, 'components_')
 
-        # deactivating check for now (specific tests about error message would
-        # break)
-        # TODO: uncomment when addressing reset in predict/transform/etc.
-        # reset = first_time
+        # In theory reset should be equal to `first_time`, but there are tests
+        # checking the number of input features and they expect a specific
+        # string, which is not the same one raised by check_n_features. So we
+        # don't check n_features_in_ here for now (it's done with ad hoc code
+        # in the estimator anyway).
+        # TODO: set reset=first_time when addressing reset in
+        # predict/transform/etc.
         reset_n_features = True
         X = self._check_non_neg_array(X, reset_n_features,
                                       "LatentDirichletAllocation.partial_fit")
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index b43f1b9c2203a..72ad6bacd43b4 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -2443,10 +2443,13 @@ def _transform_col(self, X_col, quantiles, inverse):
     def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
                       copy=False):
         """Check inputs before fit and transform"""
-        # deactivating check for now (specific tests about error message would
-        # break)
-        # TODO: uncomment when addressing reset in predict/transform/etc.
-        # reset = in_fit
+        # In theory reset should be equal to `in_fit`, but there are tests
+        # checking the number of input features and they expect a specific
+        # string, which is not the same one raised by check_n_features. So we
+        # don't check n_features_in_ here for now (it's done with ad hoc code
+        # in the estimator anyway).
+        # TODO: set reset=in_fit when addressing reset in
+        # predict/transform/etc.
         reset = True
 
         X = self._validate_data(X, reset=reset,

From 4fb756ea0d1a1622e0f11dc421de8ed6ea1a5399 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 5 Feb 2020 10:00:12 -0500
Subject: [PATCH 47/53] pep8

---
 sklearn/feature_selection/_rfe.py | 14 +++++---------
 sklearn/svm/_base.py              |  4 ++--
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index 0edc4f35ac679..69e3cc4de9e6c 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -493,16 +493,12 @@ def fit(self, X, y, groups=None):
             train/test set. Only used in conjunction with a "Group" :term:`cv`
             instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
         """
-<<<<<<< HEAD
-        X, y = self._validate_data(X, y, accept_sparse="csr",
-                                   ensure_min_features=2,
-                                   force_all_finite=False)
-=======
         tags = self._get_tags()
-        X, y = check_X_y(X, y, "csc", ensure_min_features=2,
-                         force_all_finite=not tags.get('allow_nan', True),
-                         multi_output=True)
->>>>>>> 54c3a1fbe7ef0f6814ae6406fbc0d52804303370
+        X, y = self._validate_data(
+            X, y, accept_sparse="csr", ensure_min_features=2,
+            force_all_finite=not tags.get('allow_nan', True),
+            multi_output=True
+        )
 
         # Initialization
         cv = check_cv(self.cv, y, is_classifier(self.estimator))
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py
index 116b6c2b6cfeb..ee64c3fb1692e 100644
--- a/sklearn/svm/_base.py
+++ b/sklearn/svm/_base.py
@@ -148,8 +148,8 @@ def fit(self, X, y, sample_weight=None):
             check_consistent_length(X, y)
         else:
             X, y = self._validate_data(X, y, dtype=np.float64,
-                                      order='C', accept_sparse='csr',
-                                      accept_large_sparse=False)
+                                       order='C', accept_sparse='csr',
+                                       accept_large_sparse=False)
 
         y = self._validate_targets(y)
 

From 511c395223dd37ffe8fc4e031080fdca1630425b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 10:42:37 -0500
Subject: [PATCH 48/53] Addressed comments and raise warning instead of error

---
 sklearn/base.py                   |  5 +++--
 sklearn/compose/_target.py        |  2 +-
 sklearn/utils/estimator_checks.py | 17 +++++++++++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 9d97539687179..58ef44eefc047 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -385,8 +385,9 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
             The targets. If None, `check_array` is called on `X` and
             `check_X_y` is called otherwise.
         reset : bool, default=True
-            Whether to reset the `n_features_in_` attribute. See
-            :func:`_check_n_features`.
+            Whether to reset the `n_features_in_` attribute.
+            If False, the input will be checked for consistency with data
+            provided when reset was last True.
         **check_params : kwargs
             Parameters passed to :func:`sklearn.utils.check_array` or
             :func:`sklearn.utils.check_X_y`.
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index 03727e82d7e90..a2bfb9a5d35d4 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -239,7 +239,7 @@ def _more_tags(self):
     @property
     def n_features_in_(self):
         # For consistency with other estimators we raise a AttributeError so
-        # that hasattr() fails if the estimator isn't fitted.
+        # that hasattr() returns False when the estimator isn't fitted.
         try:
             check_is_fitted(self)
         except NotFittedError as nfe:
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f3ad8c01a46df..3b2c518d08c2c 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2883,7 +2883,7 @@ def check_n_features_in(name, estimator_orig):
     estimator = clone(estimator_orig)
     set_random_state(estimator)
 
-    if 'warm_start' in estimator.get_params().keys():
+    if 'warm_start' in estimator.get_params():
         estimator.set_params(warm_start=False)
 
     n_samples = 100
@@ -2897,4 +2897,17 @@ def check_n_features_in(name, estimator_orig):
 
     assert not hasattr(estimator, 'n_features_in_')
     estimator.fit(X, y)
-    assert estimator.n_features_in_ == X.shape[1]
+    if hasattr(estimator, 'n_features_in_'):
+        assert estimator.n_features_in_ == X.shape[1]
+    else:
+        warnings.warn(
+            "As of scikit-learn 0.23, estimators should expose a "
+            "n_features_in_ attribute, unless the 'no_validation' tag is "
+            "True. This attribute should be equal to the number of features "
+            "passed to the fit method. "
+            "An error will be raised from version 0.25 when calling "
+            "check_estimator(). "
+            "See SLEP010: "
+            "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa
+            FutureWarning
+        )

From c2708841dce1e42e17e1406c29e289f7778ed7e0 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 10:47:52 -0500
Subject: [PATCH 49/53] Added whatsnew

---
 doc/whats_new/v0.23.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index a1830229b57ec..b4f378c7b591d 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -14,6 +14,14 @@ Version 0.23.0
 
 Put the changes in their relevant module.
 
+New `n_features_in_` attribute
+------------------------------
+
+Most estimators now expose a `n_features_in_` attribute. This attribute is
+equal to the number of features passed to the `fit` method. See
+`SLEP010 `_
+for details.
+
 Changed models
 --------------
 

From 39d83714a8631f28cc0dabb8c9017b5f79e8183b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 7 Feb 2020 11:15:45 -0500
Subject: [PATCH 50/53] Updated estimator API

---
 doc/developers/develop.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
index 5e11f46eccdb8..7b5c3db36f526 100644
--- a/doc/developers/develop.rst
+++ b/doc/developers/develop.rst
@@ -226,6 +226,14 @@ the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically,
 the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split``
 to slice rows and columns.
+Universal attributes +^^^^^^^^^^^^^^^^^^^^ + +Estimators that expect rectangular input have a `n_features_in_` attribute +indicating the number of features that the estimator expects. See `SLEP010 +`_ +for details. + .. _rolling_your_own_estimator: Rolling your own estimator From 9effdbf962363784c970419952bf782d7911d2b8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Feb 2020 11:17:04 -0500 Subject: [PATCH 51/53] formulation --- doc/developers/develop.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 7b5c3db36f526..1b936684eafb1 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -229,8 +229,9 @@ to slice rows and columns. Universal attributes ^^^^^^^^^^^^^^^^^^^^ -Estimators that expect rectangular input have a `n_features_in_` attribute -indicating the number of features that the estimator expects. See `SLEP010 +Estimators that expect rectangular input should have a `n_features_in_` +attribute indicating the number of features that the estimator expects. See +`SLEP010 `_ for details. From 4f8ca86e86bf949847d58a561ed1fe2f07d44d46 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Feb 2020 10:32:02 -0500 Subject: [PATCH 52/53] Comment about estimator API --- doc/developers/develop.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 1b936684eafb1..db1ba3900e40a 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -229,8 +229,10 @@ to slice rows and columns. Universal attributes ^^^^^^^^^^^^^^^^^^^^ -Estimators that expect rectangular input should have a `n_features_in_` -attribute indicating the number of features that the estimator expects. See +Estimators that expect tabular input should set a `n_features_in_` +attribute at `fit` time to indicate the number of features that the estimator +expects for subsequent calls to `predict` or `transform`. +See `SLEP010 `_ for details. From a101d2d5ce4634f0373599dfa4b43f67b1ea2f7b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Feb 2020 10:16:15 -0500 Subject: [PATCH 53/53] Updated changelog --- doc/whats_new/v0.23.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b4f378c7b591d..2ed63ed2fdf22 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,15 +14,6 @@ Version 0.23.0 Put the changes in their relevant module. -New `n_features_in_` attribute ------------------------------- - -Most estimators now expose a `n_features_in_` attribute. This attribute is -equal to the number of features passed to the `fit` method. See -`SLEP010 `_ -for details. - - Changed models -------------- @@ -275,3 +266,13 @@ Changelog - |Enhancement| add warning in :func:`utils.validation.check_array` for pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. + +Miscellaneous +............. + +- |API| Most estimators now expose a `n_features_in_` attribute. This + attribute is equal to the number of features passed to the `fit` method. + See `SLEP010 + `_ + for details. :pr:`16112` by `Nicolas Hug`_. +
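
Usage illustration (a minimal sketch, not part of the patches above): the
snippet below shows how the `_validate_data` / `n_features_in_` machinery
introduced in this series is meant to be used by an estimator, and what
happens on a feature-count mismatch between `fit` and `transform`. It assumes
scikit-learn >= 0.23, where `BaseEstimator._validate_data` exists as added
above; the `IdentityTransformer` class and the toy data are hypothetical
examples, not part of scikit-learn.

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class IdentityTransformer(TransformerMixin, BaseEstimator):
        """Toy transformer that validates X and returns it unchanged."""

        def fit(self, X, y=None):
            # reset=True (the default) stores X.shape[1] in self.n_features_in_
            X = self._validate_data(X)
            return self

        def transform(self, X):
            # reset=False checks X.shape[1] against the fitted
            # self.n_features_in_ and raises a ValueError on mismatch
            X = self._validate_data(X, reset=False)
            return X

    X = np.array([[0., 1.], [2., 3.]])
    trans = IdentityTransformer().fit(X)
    print(trans.n_features_in_)   # 2
    trans.transform(X)            # OK: same number of features as at fit time

    try:
        trans.transform(np.array([[0., 1., 2.]]))   # 3 features: mismatch
    except ValueError as exc:
        print(exc)   # X has 3 features, but ... is expecting 2 features as input.

Estimators with non-rectangular input (such as the vectorizers touched
earlier in the series) simply do not expose the attribute.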