FIX cross_validate with multimetric scoring returns the non-failed scorers results even if some fail (#23101)

simonandras · glemaitre · web-flow · commit d17d0f9f721d · 2022-12-28T15:16:11.000Z
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -48,6 +48,13 @@ Changelog
   :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor`.
   :pr:`25177` by :user:`Tim Head <betatim>`.
 
+:mod:`sklearn.model_selection`
+..............................
+- |Fix| :func:`model_selection.cross_validate` with multimetric scoring now
+  returns the proper scores of the non-failing scorers when some scorers
+  fail, instead of setting every score to `error_score`.
+  :pr:`23101` by :user:`András Simon <simonandras>` and `Thomas Fan`_.
+
 :mod:`sklearn.pipeline`
 .......................
 - |Feature| :class:`pipeline.FeatureUnion` can now use indexing notation (e.g.
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
@@ -252,7 +252,7 @@ def permutation_importance(
         scorer = check_scoring(estimator, scoring=scoring)
     else:
         scorers_dict = _check_multimetric_scoring(estimator, scoring)
-        scorer = _MultimetricScorer(**scorers_dict)
+        scorer = _MultimetricScorer(scorers=scorers_dict)
 
     baseline_score = _weights_scorer(scorer, estimator, X, y, sample_weight)
 
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
@@ -21,6 +21,7 @@
 from collections.abc import Iterable
 from functools import partial
 from collections import Counter
+from traceback import format_exc
 
 import numpy as np
 import copy
@@ -91,10 +92,16 @@ class _MultimetricScorer:
     ----------
     scorers : dict
         Dictionary mapping names to callable scorers.
+
+    raise_exc : bool, default=True
+        Whether to raise the exception in `__call__` or not. If set to `False`
+        a formatted string of the exception details is passed as result of
+        the failing scorer.
     """
 
-    def __init__(self, **scorers):
+    def __init__(self, *, scorers, raise_exc=True):
         self._scorers = scorers
+        self._raise_exc = raise_exc
 
     def __call__(self, estimator, *args, **kwargs):
         """Evaluate predicted target values."""
@@ -103,11 +110,18 @@ def __call__(self, estimator, *args, **kwargs):
         cached_call = partial(_cached_call, cache)
 
         for name, scorer in self._scorers.items():
-            if isinstance(scorer, _BaseScorer):
-                score = scorer._score(cached_call, estimator, *args, **kwargs)
-            else:
-                score = scorer(estimator, *args, **kwargs)
-            scores[name] = score
+            try:
+                if isinstance(scorer, _BaseScorer):
+                    score = scorer._score(cached_call, estimator, *args, **kwargs)
+                else:
+                    score = scorer(estimator, *args, **kwargs)
+                scores[name] = score
+            except Exception as e:
+                if self._raise_exc:
+                    raise e
+                else:
+                    scores[name] = format_exc()
+
         return scores
 
     def _use_cache(self, estimator):
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
@@ -786,7 +786,7 @@ def test_multimetric_scorer_calls_method_once(
     mock_est.classes_ = np.array([0, 1])
 
     scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers)
-    multi_scorer = _MultimetricScorer(**scorer_dict)
+    multi_scorer = _MultimetricScorer(scorers=scorer_dict)
     results = multi_scorer(mock_est, X, y)
 
     assert set(scorers) == set(results)  # compare dict keys
@@ -813,7 +813,7 @@ def predict_proba(self, X):
 
     scorers = ["roc_auc", "neg_log_loss"]
     scorer_dict = _check_multimetric_scoring(clf, scorers)
-    scorer = _MultimetricScorer(**scorer_dict)
+    scorer = _MultimetricScorer(scorers=scorer_dict)
     scorer(clf, X, y)
 
     assert predict_proba_call_cnt == 1
@@ -836,7 +836,7 @@ def predict(self, X):
 
     scorers = {"neg_mse": "neg_mean_squared_error", "r2": "roc_auc"}
     scorer_dict = _check_multimetric_scoring(clf, scorers)
-    scorer = _MultimetricScorer(**scorer_dict)
+    scorer = _MultimetricScorer(scorers=scorer_dict)
     scorer(clf, X, y)
 
     assert predict_called_cnt == 1
@@ -859,7 +859,7 @@ def test_multimetric_scorer_sanity_check():
     clf.fit(X, y)
 
     scorer_dict = _check_multimetric_scoring(clf, scorers)
-    multi_scorer = _MultimetricScorer(**scorer_dict)
+    multi_scorer = _MultimetricScorer(scorers=scorer_dict)
 
     result = multi_scorer(clf, X, y)
 
@@ -873,6 +873,49 @@ def test_multimetric_scorer_sanity_check():
         assert_allclose(value, separate_scores[score_name])
 
 
+@pytest.mark.parametrize("raise_exc", [True, False])
+def test_multimetric_scorer_exception_handling(raise_exc):
+    """Check that the calling of the `_MultimetricScorer` returns
+    the formatted exception message in the result dict for each failing
+    scorer when `raise_exc` is `False`, and that the proper exception is
+    raised when `raise_exc` is `True`.
+    """
+    scorers = {
+        "failing_1": "neg_mean_squared_log_error",
+        "non_failing": "neg_median_absolute_error",
+        "failing_2": "neg_mean_squared_log_error",
+    }
+
+    X, y = make_classification(
+        n_samples=50, n_features=2, n_redundant=0, random_state=0
+    )
+    y *= -1  # neg_mean_squared_log_error fails if y contains negative values
+
+    clf = DecisionTreeClassifier().fit(X, y)
+
+    scorer_dict = _check_multimetric_scoring(clf, scorers)
+    multi_scorer = _MultimetricScorer(scorers=scorer_dict, raise_exc=raise_exc)
+
+    error_msg = (
+        "Mean Squared Logarithmic Error cannot be used when targets contain"
+        " negative values."
+    )
+
+    if raise_exc:
+        with pytest.raises(ValueError, match=error_msg):
+            multi_scorer(clf, X, y)
+    else:
+        result = multi_scorer(clf, X, y)
+
+        exception_message_1 = result["failing_1"]
+        score = result["non_failing"]
+        exception_message_2 = result["failing_2"]
+
+        assert isinstance(exception_message_1, str) and error_msg in exception_message_1
+        assert isinstance(score, float)
+        assert isinstance(exception_message_2, str) and error_msg in exception_message_2
+
+
 @pytest.mark.parametrize(
     "scorer_name, metric",
     [
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
@@ -758,27 +758,45 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"):
     """
     if isinstance(scorer, dict):
         # will cache method calls if needed. scorer() returns a dict
-        scorer = _MultimetricScorer(**scorer)
+        scorer = _MultimetricScorer(scorers=scorer, raise_exc=(error_score == "raise"))
 
     try:
         if y_test is None:
             scores = scorer(estimator, X_test)
         else:
             scores = scorer(estimator, X_test, y_test)
     except Exception:
-        if error_score == "raise":
+        if isinstance(scorer, _MultimetricScorer):
+            # If `_MultimetricScorer` raises an exception, then `error_score`
+            # is necessarily equal to "raise".
             raise
         else:
-            if isinstance(scorer, _MultimetricScorer):
-                scores = {name: error_score for name in scorer._scorers}
+            if error_score == "raise":
+                raise
             else:
                 scores = error_score
-            warnings.warn(
-                "Scoring failed. The score on this train-test partition for "
-                f"these parameters will be set to {error_score}. Details: \n"
-                f"{format_exc()}",
-                UserWarning,
-            )
+                warnings.warn(
+                    "Scoring failed. The score on this train-test partition for "
+                    f"these parameters will be set to {error_score}. Details: \n"
+                    f"{format_exc()}",
+                    UserWarning,
+                )
+
+    # Check non-raised error messages in `_MultimetricScorer`
+    if isinstance(scorer, _MultimetricScorer):
+        exception_messages = [
+            (name, str_e) for name, str_e in scores.items() if isinstance(str_e, str)
+        ]
+        if exception_messages:
+            # error_score != "raise"
+            for name, str_e in exception_messages:
+                scores[name] = error_score
+                warnings.warn(
+                    "Scoring failed. The score on this train-test partition for "
+                    f"these parameters will be set to {error_score}. Details: \n"
+                    f"{str_e}",
+                    UserWarning,
+                )
 
     error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)"
     if isinstance(scores, dict):
diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
@@ -2232,15 +2232,22 @@ def test_cross_val_score_failing_scorer(error_score):
 def test_cross_validate_failing_scorer(
     error_score, return_train_score, with_multimetric
 ):
-    # check that an estimator can fail during scoring in `cross_validate` and
-    # that we can optionally replaced it with `error_score`
+    # Check that an estimator can fail during scoring in `cross_validate` and
+    # that we can optionally replace it with `error_score`. In the multimetric
+    # case also check the result of a non-failing scorer where the other scorers
+    # are failing.
     X, y = load_iris(return_X_y=True)
     clf = LogisticRegression(max_iter=5).fit(X, y)
 
     error_msg = "This scorer is supposed to fail!!!"
     failing_scorer = partial(_failing_scorer, error_msg=error_msg)
     if with_multimetric:
-        scoring = {"score_1": failing_scorer, "score_2": failing_scorer}
+        non_failing_scorer = make_scorer(mean_squared_error)
+        scoring = {
+            "score_1": failing_scorer,
+            "score_2": non_failing_scorer,
+            "score_3": failing_scorer,
+        }
     else:
         scoring = failing_scorer
 
@@ -2272,9 +2279,15 @@ def test_cross_validate_failing_scorer(
             )
             for key in results:
                 if "_score" in key:
-                    # check the test (and optionally train score) for all
-                    # scorers that should be assigned to `error_score`.
-                    assert_allclose(results[key], error_score)
+                    if "_score_2" in key:
+                        # check the test (and optionally train) score for the
+                        # scorer that should be non-failing
+                        for i in results[key]:
+                            assert isinstance(i, float)
+                    else:
+                        # check the test (and optionally train) score for all
+                        # scorers that should be assigned to `error_score`.
+                        assert_allclose(results[key], error_score)
 
 
 def three_params_scorer(i, j, k):