From e95ee454d5b85d5b11ba1dba6fc72d9acdec5305 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Oct 2023 19:47:21 +0200 Subject: [PATCH 1/4] FIX IsolationForest accepts sparse matrix with float value contamination --- doc/whats_new/v1.4.rst | 4 ++++ sklearn/ensemble/_iforest.py | 6 ++++-- sklearn/ensemble/tests/test_iforest.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index a947dfe225b04..523eed4852177 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -276,6 +276,10 @@ Changelog :class:`ensemble.GradientBoostingRegressor` when trained on sparse data. :pr:`26957` by `Thomas Fan`_. +- |Fix| Fixes :class:`ensemble.IsolationForest` when the input is a sparse matrix and + `contamination` is set to a float value. + :pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`. + - |API| In :class:`ensemble.AdaBoostClassifier`, the `algorithm` argument `SAMME.R` was deprecated and will be removed in 1.6. :pr:`26830` by :user:`Stefanie Senger <StefanieSenger>`. diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 16d5215b7e0a8..eda98bc06371c 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -348,7 +348,9 @@ def fit(self, X, y=None, sample_weight=None): # Else, define offset_ wrt contamination parameter # To avoid performing input validation a second time we call - # _score_samples rather than score_samples + # X to a CSR matrix while it was validated earlier as a CSC matrix. + if issparse(X): + X = X.tocsr() self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination) return self @@ -433,7 +435,7 @@ def score_samples(self, X): The lower, the more abnormal. 
""" # Check data - X = self._validate_data(X, accept_sparse="csr", dtype=np.float32, reset=False) + X = self._validate_data(X, accept_sparse="csr", dtype=tree_dtype, reset=False) return self._score_samples(X) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 9abe93bd8d115..6736be7023e66 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -356,3 +356,23 @@ def test_iforest_preserve_feature_names(): with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) model.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse_input_float_contamination(sparse_container): + """Check that `IsolationForest` accepts sparse matrix input and float value for + contamination. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27626 + """ + X, _ = make_classification(n_samples=50, n_features=4, random_state=0) + X = sparse_container(X) + X.sort_indices() + contamination = 0.1 + iforest = IsolationForest( + n_estimators=5, contamination=contamination, random_state=0 + ).fit(X) + + X_decision = iforest.decision_function(X) + assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination) From 565d924326f0dd9e3f9e000e04f69f1d0e31092a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Oct 2023 20:01:45 +0200 Subject: [PATCH 2/4] trigger linter From cc576c269a7035ed2522111bf2536319bcc3c11a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Oct 2023 20:07:12 +0200 Subject: [PATCH 3/4] add changelog --- doc/whats_new/v1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 523eed4852177..896b018c89620 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -278,7 +278,7 @@ Changelog - |Fix| Fixes :class:`ensemble.IsolationForest` when the input is a sparse matrix and 
`contamination` is set to a float value. - :pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`. + :pr:`27645` by :user:`Guillaume Lemaitre <glemaitre>`. - |API| In :class:`ensemble.AdaBoostClassifier`, the `algorithm` argument `SAMME.R` was deprecated and will be removed in 1.6. :pr:`26830` by :user:`Stefanie Senger From 9bbede4ce9de147a9281e58606616a50c124920f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Nov 2023 09:04:03 +0100 Subject: [PATCH 4/4] address Adam comment --- sklearn/ensemble/_iforest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index eda98bc06371c..80ad2cb3a2dce 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -348,7 +348,8 @@ def fit(self, X, y=None, sample_weight=None): # Else, define offset_ wrt contamination parameter # To avoid performing input validation a second time we call - # X to a CSR matrix while it was validated earlier as a CSC matrix. + # _score_samples rather than score_samples. + # _score_samples expects a CSR matrix, so we convert if necessary. if issparse(X): X = X.tocsr() self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination)