FIX check_estimator fails when validating SGDClassifier with log_loss (#24071)

MaxwellLZH · thomasjpfan · web-flow · commit 0e4e418eb2d3 · 2022-10-25T12:22:03.000-04:00
Co-authored-by: Thomas J. Fan &lt;thomasjpfan@gmail.com&gt;
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -3504,7 +3504,21 @@ def check_decision_proba_consistency(name, estimator_orig):
         # inversions in case of machine level differences.
         a = estimator.predict_proba(X_test)[:, 1].round(decimals=10)
         b = estimator.decision_function(X_test).round(decimals=10)
-        assert_array_equal(rankdata(a), rankdata(b))
+
+        rank_proba, rank_score = rankdata(a), rankdata(b)
+        try:
+            assert_array_almost_equal(rank_proba, rank_score)
+        except AssertionError:
+            # Sometimes, the rounding applied on the probabilities will have
+            # ties that are not present in the scores because it is
+            # numerically more precise. In this case, we relax the test by
+            # grouping the decision function scores based on the probability
+            # rank and check that the score is monotonically increasing.
+            grouped_y_score = np.array(
+                [b[rank_proba == group].mean() for group in np.unique(rank_proba)]
+            )
+            sorted_idx = np.argsort(grouped_y_score)
+            assert_array_equal(sorted_idx, np.arange(len(sorted_idx)))
 
 
 def check_outliers_fit_predict(name, estimator_orig):
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -36,6 +36,7 @@
 from sklearn.utils import all_estimators
 from sklearn.exceptions import SkipTestWarning
 from sklearn.utils.metaestimators import available_if
+from sklearn.utils.estimator_checks import check_decision_proba_consistency
 from sklearn.utils._param_validation import Interval, StrOptions
 
 from sklearn.utils.estimator_checks import (
@@ -1159,3 +1160,13 @@ class OutlierDetectorWithConstraint(OutlierDetectorWithoutConstraint):
         detector = OutlierDetectorWithConstraint()
         with raises(AssertionError, match=err_msg):
             check_outlier_contamination(detector.__class__.__name__, detector)
+
+
+def test_decision_proba_tie_ranking():
+    """Check that in case with some probabilities ties, we relax the
+    ranking comparison with the decision function.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/24025
+    """
+    estimator = SGDClassifier(loss="log_loss")
+    check_decision_proba_consistency("SGDClassifier", estimator)