ENH Checks n_features_in_ after fitting in mixture (#19540)

thomasjpfan · web-flow · commit cf296c74ba91 · 2021-03-23T14:08:11.000+01:00
diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py
@@ -15,7 +15,7 @@
 from ..base import BaseEstimator
 from ..base import DensityMixin
 from ..exceptions import ConvergenceWarning
-from ..utils import check_array, check_random_state
+from ..utils import check_random_state
 from ..utils.validation import check_is_fitted
 
 
@@ -36,32 +36,6 @@ def _check_shape(param, param_shape, name):
                          "but got %s" % (name, param_shape, param.shape))
 
 
-def _check_X(X, n_components=None, n_features=None, ensure_min_samples=1):
-    """Check the input data X.
-
-    Parameters
-    ----------
-    X : array-like of shape (n_samples, n_features)
-
-    n_components : int
-
-    Returns
-    -------
-    X : array, shape (n_samples, n_features)
-    """
-    X = check_array(X, dtype=[np.float64, np.float32],
-                    ensure_min_samples=ensure_min_samples)
-    if n_components is not None and X.shape[0] < n_components:
-        raise ValueError('Expected n_samples >= n_components '
-                         'but got n_components = %d, n_samples = %d'
-                         % (n_components, X.shape[0]))
-    if n_features is not None and X.shape[1] != n_features:
-        raise ValueError("Expected the input data X have %d features, "
-                         "but got %d features"
-                         % (n_features, X.shape[1]))
-    return X
-
-
 class BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for mixture models.
 
@@ -217,8 +191,12 @@ def fit_predict(self, X, y=None):
         labels : array, shape (n_samples,)
             Component labels.
         """
-        X = _check_X(X, self.n_components, ensure_min_samples=2)
-        self._check_n_features(X, reset=True)
+        X = self._validate_data(X, dtype=[np.float64, np.float32],
+                                ensure_min_samples=2)
+        if X.shape[0] < self.n_components:
+            raise ValueError("Expected n_samples >= n_components "
+                             f"but got n_components = {self.n_components}, "
+                             f"n_samples = {X.shape[0]}")
         self._check_initial_parameters(X)
 
         # if we enable warm_start, we will have a unique initialisation
@@ -335,7 +313,7 @@ def score_samples(self, X):
             Log probabilities of each data point in X.
         """
         check_is_fitted(self)
-        X = _check_X(X, None, self.means_.shape[1])
+        X = self._validate_data(X, reset=False)
 
         return logsumexp(self._estimate_weighted_log_prob(X), axis=1)
 
@@ -370,7 +348,7 @@ def predict(self, X):
             Component labels.
         """
         check_is_fitted(self)
-        X = _check_X(X, None, self.means_.shape[1])
+        X = self._validate_data(X, reset=False)
         return self._estimate_weighted_log_prob(X).argmax(axis=1)
 
     def predict_proba(self, X):
@@ -389,7 +367,7 @@ def predict_proba(self, X):
             the model given each sample.
         """
         check_is_fitted(self)
-        X = _check_X(X, None, self.means_.shape[1])
+        X = self._validate_data(X, reset=False)
         _, log_resp = self._estimate_log_prob_resp(X)
         return np.exp(log_resp)
 
diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py
@@ -288,6 +288,11 @@ class BayesianGaussianMixture(BaseMixture):
             (n_features)             if 'diag',
             float                    if 'spherical'
 
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     Examples
     --------
     >>> import numpy as np
diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py
@@ -582,6 +582,11 @@ class GaussianMixture(BaseMixture):
         Lower bound value on the log-likelihood (of the training data with
         respect to the model) of the best fit of EM.
 
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     Examples
     --------
     >>> import numpy as np
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -172,30 +172,6 @@ def test_gaussian_mixture_attributes():
     assert gmm.init_params == init_params
 
 
-def test_check_X():
-    from sklearn.mixture._base import _check_X
-    rng = np.random.RandomState(0)
-
-    n_samples, n_components, n_features = 10, 2, 2
-
-    X_bad_dim = rng.rand(n_components - 1, n_features)
-    assert_raise_message(ValueError,
-                         'Expected n_samples >= n_components '
-                         'but got n_components = %d, n_samples = %d'
-                         % (n_components, X_bad_dim.shape[0]),
-                         _check_X, X_bad_dim, n_components)
-
-    X_bad_dim = rng.rand(n_components, n_features + 1)
-    assert_raise_message(ValueError,
-                         'Expected the input data X have %d features, '
-                         'but got %d features'
-                         % (n_features, X_bad_dim.shape[1]),
-                         _check_X, X_bad_dim, n_components, n_features)
-
-    X = rng.rand(n_samples, n_features)
-    assert_array_equal(X, _check_X(X, n_components, n_features))
-
-
 def test_check_weights():
     rng = np.random.RandomState(0)
     rand_data = RandomData(rng)
diff --git a/sklearn/mixture/tests/test_mixture.py b/sklearn/mixture/tests/test_mixture.py
@@ -21,3 +21,19 @@ def test_gaussian_mixture_n_iter(estimator):
     estimator.set_params(max_iter=max_iter)
     estimator.fit(X)
     assert estimator.n_iter_ == max_iter
+
+
+@pytest.mark.parametrize(
+    "estimator",
+    [GaussianMixture(),
+     BayesianGaussianMixture()]
+)
+def test_mixture_n_components_greater_than_n_samples_error(estimator):
+    """Check error when n_components > n_samples"""
+    rng = np.random.RandomState(0)
+    X = rng.rand(10, 5)
+    estimator.set_params(n_components=12)
+
+    msg = "Expected n_samples >= n_components"
+    with pytest.raises(ValueError, match=msg):
+        estimator.fit(X)
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -264,7 +264,6 @@ def test_search_cv(estimator, check, request):
     'calibration',
     'compose',
     'feature_extraction',
-    'mixture',
     'model_selection',
     'multiclass',
     'multioutput',
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
@@ -193,7 +193,6 @@ def _construct_searchcv_instance(SearchCV):
     'kernel_ridge',
     'linear_model',
     'manifold',
-    'mixture',
     'model_selection',
     'multiclass',
     'multioutput',