diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 2f73e0a03d68f..ccfec4c9c2bb5 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -368,6 +368,10 @@ Changelog is a predefined metric listed in :func:`metrics.get_scorer_names` and early stopping is enabled. :pr:`26163` by `Thomas Fan`_. +- |Fix| Fixes :class:`ensemble.IsolationForest` when the input is a sparse matrix and + `contamination` is set to a float value. + :pr:`27645` by :user:`Guillaume Lemaitre <glemaitre>`. + - |API| In :class:`ensemble.AdaBoostClassifier`, the `algorithm` argument `SAMME.R` was deprecated and will be removed in 1.6. :pr:`26830` by :user:`Stefanie Senger <StefanieSenger>`. diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 8972e9063427c..c975f121798f0 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -340,7 +340,10 @@ def fit(self, X, y=None, sample_weight=None): # Else, define offset_ wrt contamination parameter # To avoid performing input validation a second time we call - # _score_samples rather than score_samples + # _score_samples rather than score_samples. + # _score_samples expects a CSR matrix, so we convert if necessary. + if issparse(X): + X = X.tocsr() self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination) return self @@ -425,7 +428,7 @@ def score_samples(self, X): The lower, the more abnormal. 
""" # Check data - X = self._validate_data(X, accept_sparse="csr", dtype=np.float32, reset=False) + X = self._validate_data(X, accept_sparse="csr", dtype=tree_dtype, reset=False) return self._score_samples(X) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 7a6dc0e9f95aa..22dcc92906a6b 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -341,3 +341,23 @@ def test_iforest_preserve_feature_names(): with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) model.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse_input_float_contamination(sparse_container): + """Check that `IsolationForest` accepts sparse matrix input and float value for + contamination. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27626 + """ + X, _ = make_classification(n_samples=50, n_features=4, random_state=0) + X = sparse_container(X) + X.sort_indices() + contamination = 0.1 + iforest = IsolationForest( + n_estimators=5, contamination=contamination, random_state=0 + ).fit(X) + + X_decision = iforest.decision_function(X) + assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination)