diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
index 547b4f3ed2ddc..585ead9a3be83 100644
--- a/benchmarks/bench_isolation_forest.py
+++ b/benchmarks/bench_isolation_forest.py
@@ -119,7 +119,8 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1, random_state=random_state)
+    model = IsolationForest(behaviour='new', n_jobs=-1,
+                            random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 86c8d7a8ddab7..07acd8c1d4431 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -901,6 +901,17 @@ Outlier Detection models
   the ``fit_predict`` method is available. By :user:`Albert Thomas`.
 
+- A ``behaviour`` parameter has been introduced in
+  :class:`ensemble.IsolationForest` to ensure backward compatibility.
+  In the old behaviour, the ``decision_function`` is independent of the
+  ``contamination`` parameter; a threshold attribute depending on
+  ``contamination`` is used instead. In the new behaviour, the
+  ``decision_function`` depends on the ``contamination`` parameter, in such
+  a way that 0 becomes its natural threshold to detect outliers.
+  Setting ``behaviour`` to "old" is deprecated and will not be possible in
+  version 0.22. In addition, the parameter itself will be removed in 0.24.
+  :issue:`11553` by `Nicolas Goix`_.
+
 Covariance
 ..........
 
 - The :func:`covariance.graph_lasso`, :class:`covariance.GraphLasso` and
diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py
index b43ee95c58206..1b79072dff64f 100644
--- a/examples/ensemble/plot_isolation_forest.py
+++ b/examples/ensemble/plot_isolation_forest.py
@@ -40,7 +40,8 @@
 X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
 
 # fit the model
-clf = IsolationForest(max_samples=100, random_state=rng, contamination='auto')
+clf = IsolationForest(behaviour='new', max_samples=100,
+                      random_state=rng, contamination='auto')
 clf.fit(X_train)
 y_pred_train = clf.predict(X_train)
 y_pred_test = clf.predict(X_test)
diff --git a/examples/plot_anomaly_comparison.py b/examples/plot_anomaly_comparison.py
index 201c466db71dc..f3dc0f1dddff1 100644
--- a/examples/plot_anomaly_comparison.py
+++ b/examples/plot_anomaly_comparison.py
@@ -80,7 +80,8 @@
     ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
     ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                       gamma=0.1)),
-    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
+    ("Isolation Forest", IsolationForest(behaviour='new',
+                                         contamination=outliers_fraction,
                                          random_state=42)),
     ("Local Outlier Factor", LocalOutlierFactor(
         n_neighbors=35, contamination=outliers_fraction))]
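To make the changelog entry above concrete, here is a minimal usage sketch. It is not part of the patch; the toy data and names are illustrative, and it assumes a scikit-learn build with this patch applied:

    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(42)
    X = rng.randn(100, 2)  # toy training data

    # With behaviour='new', decision_function is shifted so that 0
    # separates inliers from outliers, consistent with predict().
    clf = IsolationForest(behaviour='new', contamination=0.1,
                          random_state=rng).fit(X)
    scores = clf.decision_function(X)
    assert np.array_equal(scores < 0, clf.predict(X) == -1)

    # Roughly `contamination` of the training samples score below 0.
    print((scores < 0).mean())  # ~0.1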
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index eafdcbe9de1cf..97e60d755ad3e 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -89,6 +89,26 @@ class IsolationForest(BaseBagging, OutlierMixin):
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
 
+    behaviour : str, default='old'
+        Behaviour of the ``decision_function``, which can be either 'old' or
+        'new'. Passing ``behaviour='new'`` makes the ``decision_function``
+        change to match the API of other anomaly detection algorithms; this
+        will be the default behaviour in the future. As explained in detail
+        in the ``offset_`` attribute documentation, the ``decision_function``
+        becomes dependent on the contamination parameter, in such a way that
+        0 becomes its natural threshold to detect outliers.
+
+        .. versionadded:: 0.20
+           ``behaviour`` is added in 0.20 for back-compatibility purposes.
+
+        .. deprecated:: 0.20
+           ``behaviour='old'`` is deprecated in 0.20 and will not be possible
+           in 0.22.
+
+        .. deprecated:: 0.22
+           The ``behaviour`` parameter will be deprecated in 0.22 and removed
+           in 0.24.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
@@ -114,12 +134,16 @@ class IsolationForest(BaseBagging, OutlierMixin):
     offset_ : float
         Offset used to define the decision function from the raw scores.
         We have the relation: ``decision_function = score_samples - offset_``.
+        Assuming behaviour == 'new', ``offset_`` is defined as follows.
         When the contamination parameter is set to "auto", the offset is equal
         to -0.5 as the scores of inliers are close to 0 and the scores of
         outliers are close to -1. When a contamination parameter different
         than "auto" is provided, the offset is defined in such a way that we
         obtain the expected number of outliers (samples with decision
         function < 0) in training.
+        Assuming the behaviour parameter is set to 'old', we always have
+        ``offset_ = -0.5``, making the decision function independent of the
+        contamination parameter.
 
     References
     ----------
@@ -138,6 +162,7 @@ def __init__(self,
                  max_features=1.,
                  bootstrap=False,
                  n_jobs=1,
+                 behaviour='old',
                  random_state=None,
                  verbose=0):
         super(IsolationForest, self).__init__(
@@ -154,6 +179,8 @@
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose)
+
+        self.behaviour = behaviour
         self.contamination = contamination
 
     def _set_oob_score(self, X, y):
@@ -185,6 +212,13 @@ def fit(self, X, y=None, sample_weight=None):
         else:
             self._contamination = self.contamination
 
+        if self.behaviour == 'old':
+            warnings.warn('behaviour="old" is deprecated and will be removed '
+                          'in version 0.22. Please use behaviour="new", which '
+                          'makes the decision_function change to match the '
+                          'API of other anomaly detection algorithms.',
+                          FutureWarning)
+
         X = check_array(X, accept_sparse=['csc'])
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
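The ``offset_`` documentation above implies an invariant that is easy to check. A short sketch (toy data, not part of the patch; assumes this patch applied) verifying ``decision_function = score_samples - offset_`` and the constant -0.5 offset under ``contamination='auto'``:

    import numpy as np
    from sklearn.ensemble import IsolationForest

    X = np.array([[-1.1], [0.3], [0.5], [100.0]])
    clf = IsolationForest(behaviour='new', contamination='auto',
                          random_state=0).fit(X)

    # decision_function is score_samples shifted by offset_; with
    # contamination='auto' the offset is the constant -0.5.
    np.testing.assert_allclose(clf.decision_function(X),
                               clf.score_samples(X) - clf.offset_)
    print(clf.offset_)  # -0.5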
@@ -226,16 +260,29 @@ def fit(self, X, y=None, sample_weight=None):
                          max_depth=max_depth,
                          sample_weight=sample_weight)
 
+        if self.behaviour == 'old':
+            # in this case, decision_function = 0.5 + self.score_samples(X):
+            if self._contamination == "auto":
+                raise ValueError("contamination parameter cannot be set to "
+                                 "'auto' when behaviour == 'old'.")
+
+            self.offset_ = -0.5
+            self._threshold_ = sp.stats.scoreatpercentile(
+                self.decision_function(X), 100. * self._contamination)
+
+            return self
+
+        # else, self.behaviour == 'new':
         if self._contamination == "auto":
             # 0.5 plays a special role as described in the original paper.
             # we take the opposite as we consider the opposite of their score.
             self.offset_ = -0.5
-            # need to save (depreciated) threshold_ in this case:
-            self._threshold_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * 0.1)
-        else:
-            self.offset_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * self._contamination)
+            return self
+
+        # else, define offset_ wrt contamination parameter, so that the
+        # threshold_ attribute is implicitly 0 and is not needed anymore:
+        self.offset_ = sp.stats.scoreatpercentile(
+            self.score_samples(X), 100. * self._contamination)
 
         return self
 
@@ -258,7 +305,8 @@ def predict(self, X):
         check_is_fitted(self, ["offset_"])
         X = check_array(X, accept_sparse='csr')
         is_inlier = np.ones(X.shape[0], dtype=int)
-        is_inlier[self.decision_function(X) < 0] = -1
+        threshold = self.threshold_ if self.behaviour == 'old' else 0
+        is_inlier[self.decision_function(X) < threshold] = -1
         return is_inlier
 
     def decision_function(self, X):
@@ -359,11 +407,12 @@ def score_samples(self, X):
 
     @property
     def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
         warnings.warn("threshold_ attribute is deprecated in 0.20 and will"
                       " be removed in 0.22.", DeprecationWarning)
-        if self.contamination == 'auto':
-            return self._threshold_
-        return self.offset_
+        return self._threshold_
 
 
 def _average_path_length(n_samples_leaf):
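Taken together, the ``fit``, ``predict`` and ``threshold_`` changes above give the following observable behaviour. A small sketch (not part of the patch; assumes the 0.20 API with this patch applied, printed messages abbreviated):

    from sklearn.ensemble import IsolationForest

    X = [[0.0], [1.0]]

    # Old behaviour: threshold_ still works but warns on access.
    # (Fitting with behaviour='old' itself emits a FutureWarning.)
    old = IsolationForest(behaviour='old', contamination=0.1).fit(X)
    print(old.threshold_)  # DeprecationWarning: threshold_ attribute ...

    # New behaviour: threshold_ no longer exists.
    new = IsolationForest(behaviour='new', contamination=0.1).fit(X)
    try:
        new.threshold_
    except AttributeError as exc:
        print(exc)  # threshold_ attribute does not exist when behaviour != 'old'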
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index bfeb689a78f0a..634f45a25cf4d 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -15,6 +15,7 @@
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
@@ -47,6 +48,7 @@
 boston.target = boston.target[perm]
 
 
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest():
     """Check Isolation Forest for various parameter settings."""
     X_train = np.array([[0, 1], [1, 2]])
@@ -63,6 +65,8 @@ def test_iforest():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_sparse():
     """Check IForest for various parameter settings on sparse input."""
     rng = check_random_state(0)
@@ -91,6 +95,8 @@ def test_iforest_sparse():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_error():
     """Test that it gives proper exception on deficient input."""
     X = iris.data
@@ -128,8 +134,14 @@ def test_iforest_error():
     # test X_test n_features match X_train one:
     assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
 
+    # test threshold_ attribute error when behaviour is not old:
+    msg = "threshold_ attribute does not exist when behaviour != 'old'"
+    assert_raises_regex(AttributeError, msg, getattr,
+                        IsolationForest(behaviour='new'), 'threshold_')
+
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_recalculate_max_depth():
     """Check max_depth recalculation when max_samples is reset to n_samples"""
     X = iris.data
@@ -139,6 +151,7 @@ def test_recalculate_max_depth():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_max_samples_attribute():
     X = iris.data
     clf = IsolationForest().fit(X)
@@ -155,6 +168,8 @@ def test_max_samples_attribute():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_parallel_regression():
     """Check parallel regression."""
     rng = check_random_state(0)
@@ -180,6 +195,7 @@ def test_iforest_parallel_regression():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_performance():
     """Test Isolation Forest performs well"""
 
@@ -204,13 +220,15 @@ def test_iforest_performance():
     assert_greater(roc_auc_score(y_test, y_pred), 0.98)
 
 
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_works():
     # toy sample (the last two samples are outliers)
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3],
          [-4, 7]]
 
     # Test IsolationForest
     for contamination in [0.25, "auto"]:
-        clf = IsolationForest(random_state=rng, contamination=contamination)
+        clf = IsolationForest(behaviour='new', random_state=rng,
+                              contamination=contamination)
         clf.fit(X)
         decision_func = - clf.decision_function(X)
         pred = clf.predict(X)
@@ -220,6 +238,7 @@ def test_iforest_works():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_max_samples_consistency():
     # Make sure validated max_samples in iforest and BaseBagging are identical
     X = iris.data
@@ -228,6 +247,8 @@ def test_max_samples_consistency():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_subsampled_features():
     # It tests non-regression for #5732 which failed at predict.
     rng = check_random_state(0)
@@ -253,6 +274,7 @@ def test_iforest_average_path_length():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_score_samples():
     X_train = [[1, 1], [1, 2], [2, 1]]
     clf1 = IsolationForest(contamination=0.1).fit(X_train)
@@ -265,6 +287,8 @@ def test_score_samples():
                         clf2.score_samples([[2., 2.]]))
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_deprecation():
     X = [[0.0], [1.0]]
     clf = IsolationForest()
@@ -274,8 +298,23 @@ def test_deprecation():
                          'in version 0.22 to "auto"',
                          clf.fit, X)
 
-    clf = IsolationForest(contamination='auto').fit(X)
+    assert_warns_message(FutureWarning,
+                         'behaviour="old" is deprecated and will be removed '
+                         'in version 0.22',
+                         clf.fit, X)
+
+    clf = IsolationForest().fit(X)
     assert_warns_message(DeprecationWarning,
                          "threshold_ attribute is deprecated in 0.20 and will"
                          " be removed in 0.22.",
                          getattr, clf, "threshold_")
+
+
+@pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
+def test_behaviour_param():
+    X_train = [[1, 1], [1, 2], [2, 1]]
+    clf1 = IsolationForest(behaviour='old').fit(X_train)
+    clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
+    assert_array_equal(clf1.decision_function([[2., 2.]]),
+                       clf2.decision_function([[2., 2.]]))
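A note on the ``filterwarnings`` marks added throughout this test file: the text after ``ignore:`` is a regular expression matched against the start of the warning message, which is why a short message prefix is enough to silence each deprecation. A hypothetical standalone test (not part of the patch) illustrating the mechanism:

    import pytest

    from sklearn.ensemble import IsolationForest


    # The mark silences the FutureWarning emitted by fitting with the
    # deprecated default behaviour='old'.
    @pytest.mark.filterwarnings('ignore:behaviour="old"')
    def test_fit_without_warning_noise():
        IsolationForest(contamination=0.1).fit([[0.0], [1.0]])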
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 5149900c9c473..7bfd3ac0f2372 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -368,6 +368,13 @@ def set_checking_parameters(estimator):
     if estimator.__class__.__name__ == "TheilSenRegressor":
         estimator.max_subpopulation = 100
 
+    if estimator.__class__.__name__ == "IsolationForest":
+        # XXX to be removed in 0.22.
+        # this is used because the old IsolationForest does not
+        # respect the outlier detection API and thus does not
+        # pass the outlier detection common tests.
+        estimator.set_params(behaviour='new')
+
     if isinstance(estimator, BaseRandomProjection):
         # Due to the jl lemma and often very few samples, the number
         # of components of the random matrix projection will be probably
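For reference, a small sketch of the effect of the ``set_checking_parameters`` change. It uses the private helper directly, which is not public API; assumes this patch applied:

    from sklearn.ensemble import IsolationForest
    from sklearn.utils.estimator_checks import set_checking_parameters

    # The helper mutates the estimator in place; after this patch it
    # switches IsolationForest to the new behaviour so the estimator
    # follows the outlier detection API expected by the common checks.
    est = IsolationForest()
    set_checking_parameters(est)
    print(est.behaviour)  # 'new'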