scikit-learn · thomasjpfan · Aug 15, 2021 · Oct 20, 2016 · Oct 20, 2016 · Feb 7, 2021
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -417,6 +417,11 @@ Changelog
 - |Enhancement| Validate user-supplied gram matrix passed to linear models
   via the `precompute` argument. :pr:`19004` by :user:`Adam Midvidy <amidvidy>`.
 
+- |API| Raise a warning in :class:`linear_model.RANSACRegressor` that from
+  version 1.2, `min_samples` needs to be set explicitly for models other than
+  :class:`linear_model.LinearRegression`.
+  :pr:`19390` by :user:`Shao Yang Hong <hongshaoyang>`.
+
 - |Fix| :meth:`ElasticNet.fit` no longer modifies `sample_weight` in place.
   :pr:`19055` by `Thomas Fan`_.
 

diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py
@@ -86,11 +86,20 @@ class RANSACRegressor(
     min_samples : int (>= 1) or float ([0, 1]), default=None
         Minimum number of samples chosen randomly from original data. Treated
         as an absolute number of samples for `min_samples >= 1`, treated as a
-        relative number `ceil(min_samples * X.shape[0]`) for
+        relative number `ceil(min_samples * X.shape[0])` for
         `min_samples < 1`. This is typically chosen as the minimal number of
         samples necessary to estimate the given `base_estimator`. By default a
         ``sklearn.linear_model.LinearRegression()`` estimator is assumed and
-        `min_samples` is chosen as ``X.shape[1] + 1``.
+        `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly
+        dependent upon the model, so if a `base_estimator` other than
+        :class:`linear_model.LinearRegression` is used, the user is
+        encouraged to provide a value.
+
+        .. deprecated:: 1.0
+           Not setting `min_samples` explicitly will raise an error in version
+           1.2 for models other than
+           :class:`~sklearn.linear_model.LinearRegression`. To keep the old
+           default behavior, set `min_samples=X.shape[1] + 1` explicitly.
 
     residual_threshold : float, default=None
         Maximum residual for a data sample to be classified as an inlier.
@@ -289,7 +298,15 @@ def fit(self, X, y, sample_weight=None):
             base_estimator = LinearRegression()
 
         if self.min_samples is None:
-            # assume linear model by default
+            if not isinstance(base_estimator, LinearRegression):
+                # FIXME: in 1.2, turn this warning into an error
+                warnings.warn(
+                    "From version 1.2, `min_samples` needs to be explicitely "
+                    "set otherwise an error will be raised. To keep the "
+                    "current behavior, you need to set `min_samples` to "
+                    f"`X.shape[1] + 1 that is {X.shape[1] + 1}",
+                    FutureWarning,
+                )
             min_samples = X.shape[1] + 1
         elif 0 < self.min_samples < 1:
             min_samples = np.ceil(self.min_samples * X.shape[0])

diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py
@@ -8,7 +8,7 @@
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_allclose
 from sklearn.datasets import make_regression
-from sklearn.linear_model import LinearRegression, RANSACRegressor
+from sklearn.linear_model import LinearRegression, RANSACRegressor, Ridge
 from sklearn.linear_model import OrthogonalMatchingPursuit
 from sklearn.linear_model._ransac import _dynamic_max_trials
 from sklearn.exceptions import ConvergenceWarning
@@ -358,6 +358,10 @@ def test_ransac_min_n_samples():
     ransac_estimator7 = RANSACRegressor(
         base_estimator, min_samples=X.shape[0] + 1, residual_threshold=5, random_state=0
     )
+    # GH #19390
+    ransac_estimator8 = RANSACRegressor(
+        Ridge(), min_samples=None, residual_threshold=5, random_state=0
+    )
 
     ransac_estimator1.fit(X, y)
     ransac_estimator2.fit(X, y)
@@ -383,6 +387,10 @@ def test_ransac_min_n_samples():
     with pytest.raises(ValueError):
         ransac_estimator7.fit(X, y)
 
+    err_msg = "From version 1.2, `min_samples` needs to be explicitely set"
+    with pytest.warns(FutureWarning, match=err_msg):
+        ransac_estimator8.fit(X, y)
+
 
 def test_ransac_multi_dimensional_targets():
 
@@ -574,9 +582,10 @@ def test_ransac_fit_sample_weight():
     # check that if base_estimator.fit doesn't support
     # sample_weight, raises error
     base_estimator = OrthogonalMatchingPursuit()
-    ransac_estimator = RANSACRegressor(base_estimator)
+    ransac_estimator = RANSACRegressor(base_estimator, min_samples=10)
 
-    with pytest.raises(ValueError):
+    err_msg = f"{base_estimator.__class__.__name__} does not support sample_weight."
+    with pytest.raises(ValueError, match=err_msg):
         ransac_estimator.fit(X, y, weights)
 
 

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -26,7 +26,9 @@
 from ._testing import raises
 from . import is_scalar_nan
 
+from ..linear_model import LinearRegression
 from ..linear_model import LogisticRegression
+from ..linear_model import RANSACRegressor
 from ..linear_model import Ridge
 
 from ..base import (
@@ -354,7 +356,13 @@ def _construct_instance(Estimator):
     required_parameters = getattr(Estimator, "_required_parameters", [])
     if len(required_parameters):
         if required_parameters in (["estimator"], ["base_estimator"]):
-            if issubclass(Estimator, RegressorMixin):
+            # `RANSACRegressor` warns (and from version 1.2 will raise an error)
+            # with any model other than `LinearRegression` if the `min_samples`
+            # parameter is not set explicitly. For the common tests, we therefore
+            # use `LinearRegression`, the default estimator of `RANSACRegressor`,
+            # instead of `Ridge`.
+            if issubclass(Estimator, RANSACRegressor):
+                estimator = Estimator(LinearRegression())
+            elif issubclass(Estimator, RegressorMixin):
                 estimator = Estimator(Ridge())
             else:
                 estimator = Estimator(LogisticRegression(C=1))