From 0cad3f3f16c4888f33ff5790d525bec6e0a085d8 Mon Sep 17 00:00:00 2001 From: ngoix Date: Mon, 16 Jul 2018 15:34:25 +0200 Subject: [PATCH 1/8] iforest behaviour param fix + cosmit cosmit common test update examples cosmit + fix travis attribute error instead of value + mask warning in test whatsnew --- benchmarks/bench_isolation_forest.py | 3 +- doc/whats_new/v0.20.rst | 9 ++ examples/covariance/plot_outlier_detection.py | 129 ++++++++++++++++++ examples/ensemble/plot_isolation_forest.py | 3 +- examples/plot_anomaly_comparison.py | 3 +- sklearn/ensemble/iforest.py | 58 ++++++-- sklearn/ensemble/tests/test_iforest.py | 30 +++- sklearn/utils/estimator_checks.py | 7 + 8 files changed, 227 insertions(+), 15 deletions(-) create mode 100644 examples/covariance/plot_outlier_detection.py diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index 547b4f3ed2ddc..585ead9a3be83 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -119,7 +119,8 @@ def print_outlier_ratio(y): y_test = y[n_samples_train:] print('--- Fitting the IsolationForest estimator...') - model = IsolationForest(n_jobs=-1, random_state=random_state) + model = IsolationForest(behaviour='new', n_jobs=-1, + random_state=random_state) tstart = time() model.fit(X_train) fit_time = time() - tstart diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 86c8d7a8ddab7..932c11c658199 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -901,6 +901,15 @@ Outlier Detection models the ``fit_predict`` method is avaiable. By :user:`Albert Thomas `. + - A ``behaviour`` parameter has been introduced in :class:`ensemble.IsolationForest` + to ensure backward compatibility. + In the old behaviour, the ``decision_function`` is independent of the ``contamination`` + parameter. A threshold attribute depending on the ``contamination`` parameter is thus + used. + In the new behaviour the ``decision_function`` is dependent on the ``contamination`` + parameter, in such a way that 0 becomes its natural threshold to detect outliers. + :issue:`11553` by `Nicolas Goix`_. + Covariance - The :func:`covariance.graph_lasso`, :class:`covariance.GraphLasso` and diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py new file mode 100644 index 0000000000000..f56bba3eef6c5 --- /dev/null +++ b/examples/covariance/plot_outlier_detection.py @@ -0,0 +1,129 @@ +""" +========================================== +Outlier detection with several methods. +========================================== + +When the amount of contamination is known, this example illustrates three +different ways of performing :ref:`outlier_detection`: + +- based on a robust estimator of covariance, which is assuming that the + data are Gaussian distributed and performs better than the One-Class SVM + in that case. + +- using the One-Class SVM and its ability to capture the shape of the + data set, hence performing better when the data is strongly + non-Gaussian, i.e. with two well-separated clusters; + +- using the Isolation Forest algorithm, which is based on random forests and + hence more adapted to large-dimensional settings, even if it performs + quite well in the examples below. + +- using the Local Outlier Factor to measure the local deviation of a given + data point with respect to its neighbors by comparing their local density. 
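(The first three estimators above share the common outlier detection API that this
patch series aligns IsolationForest with: ``fit``, ``predict`` returning +1/-1, and a
``decision_function`` whose natural threshold is 0. A minimal sketch of that contract,
not part of the patch, assuming a scikit-learn 0.20 build with patch 1 applied so that
``behaviour='new'`` is accepted; LocalOutlierFactor is left out because, as noted in the
code below, its decision function is private and it is meant to be used via
``fit_predict``:

    import numpy as np
    from sklearn.covariance import EllipticEnvelope
    from sklearn.ensemble import IsolationForest
    from sklearn.svm import OneClassSVM

    X = np.random.RandomState(0).randn(100, 2)
    for est in (EllipticEnvelope(contamination=0.1),
                OneClassSVM(nu=0.1),
                IsolationForest(behaviour='new', contamination=0.1,
                                random_state=0)):
        y = est.fit(X).predict(X)           # +1 for inliers, -1 for outliers
        scores = est.decision_function(X)   # negative score <=> predicted outlier
        print(np.all((scores < 0) == (y == -1)))  # True for each estimator
)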
+ +The ground truth about inliers and outliers is given by the points colors +while the orange-filled area indicates which points are reported as inliers +by each method. + +Here, we assume that we know the fraction of outliers in the datasets. +Thus rather than using the 'predict' method of the objects, we set the +threshold on the decision_function to separate out the corresponding +fraction. +""" + +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.font_manager + +from sklearn import svm +from sklearn.covariance import EllipticEnvelope +from sklearn.ensemble import IsolationForest +from sklearn.neighbors import LocalOutlierFactor + +print(__doc__) + +SEED = 42 +GRID_PRECISION = 100 + +rng = np.random.RandomState(SEED) + +# Example settings +n_samples = 200 +outliers_fraction = 0.25 +clusters_separation = (0, 1, 2) + +# define two outlier detection tools to be compared +classifiers = { + "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, + kernel="rbf", gamma=0.1), + "Robust covariance": EllipticEnvelope(contamination=outliers_fraction), + "Isolation Forest": IsolationForest(behaviour='new', + max_samples=n_samples, + contamination=outliers_fraction, + random_state=rng), + "Local Outlier Factor": LocalOutlierFactor( + n_neighbors=35, + contamination=outliers_fraction)} + +# Compare given classifiers under given settings +xx, yy = np.meshgrid(np.linspace(-7, 7, GRID_PRECISION), + np.linspace(-7, 7, GRID_PRECISION)) +n_outliers = int(outliers_fraction * n_samples) +n_inliers = n_samples - n_outliers +ground_truth = np.ones(n_samples, dtype=int) +ground_truth[-n_outliers:] = -1 + +# Fit the problem with varying cluster separation +for _, offset in enumerate(clusters_separation): + np.random.seed(SEED) + # Data generation + X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset + X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset + X = np.concatenate([X1, X2], axis=0) + # Add outliers + X = np.concatenate([X, np.random.uniform(low=-6, high=6, + size=(n_outliers, 2))], axis=0) + + # Fit the model + plt.figure(figsize=(9, 7)) + for i, (clf_name, clf) in enumerate(classifiers.items()): + # fit the data and tag outliers + if clf_name == "Local Outlier Factor": + y_pred = clf.fit_predict(X) + scores_pred = clf.negative_outlier_factor_ + else: + clf.fit(X) + scores_pred = clf.decision_function(X) + y_pred = clf.predict(X) + n_errors = (y_pred != ground_truth).sum() + # plot the levels lines and the points + if clf_name == "Local Outlier Factor": + # decision_function is private for LOF + Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()]) + else: + Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + Z = Z.reshape(xx.shape) + subplot = plt.subplot(2, 2, i + 1) + subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), + cmap=plt.cm.Blues_r) + a = subplot.contour(xx, yy, Z, levels=[0], + linewidths=2, colors='red') + subplot.contourf(xx, yy, Z, levels=[0, Z.max()], + colors='orange') + b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', + s=20, edgecolor='k') + c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', + s=20, edgecolor='k') + subplot.axis('tight') + subplot.legend( + [a.collections[0], b, c], + ['learned decision function', 'true inliers', 'true outliers'], + prop=matplotlib.font_manager.FontProperties(size=10), + loc='lower right') + subplot.set_xlabel("%d. 
%s (errors: %d)" % (i + 1, clf_name, n_errors)) + subplot.set_xlim((-7, 7)) + subplot.set_ylim((-7, 7)) + plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26) + plt.suptitle("Outlier detection") + +plt.show() diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py index b43ee95c58206..1b79072dff64f 100644 --- a/examples/ensemble/plot_isolation_forest.py +++ b/examples/ensemble/plot_isolation_forest.py @@ -40,7 +40,8 @@ X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model -clf = IsolationForest(max_samples=100, random_state=rng, contamination='auto') +clf = IsolationForest(behaviour='new', max_samples=100, + random_state=rng, contamination='auto') clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) diff --git a/examples/plot_anomaly_comparison.py b/examples/plot_anomaly_comparison.py index 201c466db71dc..f3dc0f1dddff1 100644 --- a/examples/plot_anomaly_comparison.py +++ b/examples/plot_anomaly_comparison.py @@ -80,7 +80,8 @@ ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)), ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), - ("Isolation Forest", IsolationForest(contamination=outliers_fraction, + ("Isolation Forest", IsolationForest(behaviour='new', + contamination=outliers_fraction, random_state=42)), ("Local Outlier Factor", LocalOutlierFactor( n_neighbors=35, contamination=outliers_fraction))] diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index eafdcbe9de1cf..bf99a2db30464 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -89,6 +89,15 @@ class IsolationForest(BaseBagging, OutlierMixin): The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. + behaviour: str, optional (default='old') + Accepted values are 'old' or 'new'. Behaviour of the decision_function. + Default "behaviour" parameter will change to "new" in version 0.22. + Passing behaviour="new" makes the decision_function change to match + other anomaly detection algorithm API, as explained in details in the + offset_ attribute documentation. Basically, the decision_function + becomes dependent on the contamination parameter, in such a way that + 0 becomes its natural threshold to detect outliers. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -114,12 +123,16 @@ class IsolationForest(BaseBagging, OutlierMixin): offset_ : float Offset used to define the decision function from the raw scores. We have the relation: ``decision_function = score_samples - offset_``. + Assuming behaviour == 'new', offset_ is defined as follows. When the contamination parameter is set to "auto", the offset is equal to -0.5 as the scores of inliers are close to 0 and the scores of outliers are close to -1. When a contamination parameter different than "auto" is provided, the offset is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + Assuming the behaviour parameter is set to 'old', we always have + offset_ = -0.5, making the decision function independent from the + contamination parameter. 
References ---------- @@ -138,6 +151,7 @@ def __init__(self, max_features=1., bootstrap=False, n_jobs=1, + behaviour='old', random_state=None, verbose=0): super(IsolationForest, self).__init__( @@ -154,8 +168,17 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose) + + self.behaviour = behaviour self.contamination = contamination + if behaviour == 'old': + warnings.warn('Default "behaviour" parameter will change to "new" ' + 'in version 0.22. Passing behaviour="new" makes ' + 'IsolationForest decision_function change to match ' + 'other anomaly detection algorithm API.', + FutureWarning) + def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by iforest") @@ -226,16 +249,29 @@ def fit(self, X, y=None, sample_weight=None): max_depth=max_depth, sample_weight=sample_weight) + if self.behaviour == 'old': + # in this case, decision_function = 0.5 + self.score_samples(X): + if self._contamination == "auto": + raise ValueError("contamination parameter cannot be set to " + "'auto' when behaviour == 'old'.") + + self.offset_ = -0.5 + self._threshold_ = sp.stats.scoreatpercentile( + self.decision_function(X), 100. * self._contamination) + + return self + + # else, self.behaviour == 'new': if self._contamination == "auto": # 0.5 plays a special role as described in the original paper. # we take the opposite as we consider the opposite of their score. self.offset_ = -0.5 - # need to save (depreciated) threshold_ in this case: - self._threshold_ = sp.stats.scoreatpercentile( - self.score_samples(X), 100. * 0.1) - else: - self.offset_ = sp.stats.scoreatpercentile( - self.score_samples(X), 100. * self._contamination) + return self + + # else, define offset_ wrt contamination parameter, so that the + # threshold_ attribute is implicitly 0 and is not needed anymore: + self.offset_ = sp.stats.scoreatpercentile( + self.score_samples(X), 100. 
* self._contamination) return self @@ -258,7 +294,8 @@ def predict(self, X): check_is_fitted(self, ["offset_"]) X = check_array(X, accept_sparse='csr') is_inlier = np.ones(X.shape[0], dtype=int) - is_inlier[self.decision_function(X) < 0] = -1 + threshold = self.threshold_ if self.behaviour == 'old' else 0 + is_inlier[self.decision_function(X) < threshold] = -1 return is_inlier def decision_function(self, X): @@ -359,11 +396,12 @@ def score_samples(self, X): @property def threshold_(self): + if self.behaviour != 'old': + raise AttributeError("threshold_ attribute does not exist when " + "behaviour != 'old'") warnings.warn("threshold_ attribute is deprecated in 0.20 and will" " be removed in 0.22.", DeprecationWarning) - if self.contamination == 'auto': - return self._threshold_ - return self.offset_ + return self._threshold_ def _average_path_length(n_samples_leaf): diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index bfeb689a78f0a..8cfa2263a650b 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -15,6 +15,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater @@ -47,6 +48,7 @@ boston.target = boston.target[perm] +@pytest.mark.filterwarnings('ignore:threshold_ attribute') def test_iforest(): """Check Isolation Forest for various parameter settings.""" X_train = np.array([[0, 1], [1, 2]]) @@ -63,6 +65,7 @@ def test_iforest(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:threshold_ attribute') def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) @@ -91,6 +94,7 @@ def test_iforest_sparse(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:threshold_ attribute') def test_iforest_error(): """Test that it gives proper exception on deficient input.""" X = iris.data @@ -128,6 +132,11 @@ def test_iforest_error(): # test X_test n_features match X_train one: assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:]) + # test threshold_ attribute error when behaviour is not old: + msg = "threshold_ attribute does not exist when behaviour != 'old'" + assert_raises_regex(AttributeError, msg, getattr, + IsolationForest(behaviour='new'), 'threshold_') + @pytest.mark.filterwarnings('ignore:default contamination') def test_recalculate_max_depth(): @@ -155,6 +164,7 @@ def test_max_samples_attribute(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:threshold_ attribute') def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) @@ -204,13 +214,15 @@ def test_iforest_performance(): assert_greater(roc_auc_score(y_test, y_pred), 0.98) +@pytest.mark.filterwarnings('ignore:threshold_ attribute') def test_iforest_works(): # toy sample (the last two samples are outliers) X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]] # Test IsolationForest for contamination in [0.25, "auto"]: - clf = IsolationForest(random_state=rng, contamination=contamination) + clf = IsolationForest(behaviour='new', random_state=rng, + 
contamination=contamination)
         clf.fit(X)
         decision_func = - clf.decision_function(X)
         pred = clf.predict(X)
@@ -228,6 +240,7 @@ def test_max_samples_consistency():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_subsampled_features():
     # It tests non-regression for #5732 which failed at predict.
     rng = check_random_state(0)
@@ -274,8 +287,21 @@ def test_deprecation():
                          'in version 0.22 to "auto"',
                          clf.fit, X)
 
-    clf = IsolationForest(contamination='auto').fit(X)
+    assert_warns_message(FutureWarning,
+                         'Default "behaviour" parameter will change to "new" '
+                         'in version 0.22',
+                         IsolationForest, )
+
+    clf = IsolationForest().fit(X)
     assert_warns_message(DeprecationWarning,
                          "threshold_ attribute is deprecated in 0.20 and will"
                          " be removed in 0.22.",
                          getattr, clf, "threshold_")
+
+
+def test_behaviour_param():
+    X_train = [[1, 1], [1, 2], [2, 1]]
+    clf1 = IsolationForest(behaviour='old').fit(X_train)
+    clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
+    assert_array_equal(clf1.decision_function([[2., 2.]]),
+                       clf2.decision_function([[2., 2.]]))
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 5149900c9c473..7bfd3ac0f2372 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -368,6 +368,13 @@ def set_checking_parameters(estimator):
     if estimator.__class__.__name__ == "TheilSenRegressor":
         estimator.max_subpopulation = 100
 
+    if estimator.__class__.__name__ == "IsolationForest":
+        # XXX to be removed in 0.22.
+        # this is used because the old IsolationForest does not
+        # respect the outlier detection API and thus does not
+        # pass the outlier detection common tests.
+        estimator.set_params(behaviour='new')
+
     if isinstance(estimator, BaseRandomProjection):
         # Due to the jl lemma and often very few samples, the number
         # of components of the random matrix projection will be probably

From 0413c361d926fc0c424c806ae7ae5ab3fb8f26e5 Mon Sep 17 00:00:00 2001
From: ngoix
Date: Wed, 18 Jul 2018 00:33:39 +0200
Subject: [PATCH 2/8] warning in fit not init

---
 sklearn/ensemble/iforest.py            | 14 +++++++-------
 sklearn/ensemble/tests/test_iforest.py |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index bf99a2db30464..57f95457b4782 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -172,13 +172,6 @@ def __init__(self,
 
         self.behaviour = behaviour
         self.contamination = contamination
 
-        if behaviour == 'old':
-            warnings.warn('Default "behaviour" parameter will change to "new" '
-                          'in version 0.22. Passing behaviour="new" makes '
-                          'IsolationForest decision_function change to match '
-                          'other anomaly detection algorithm API.',
-                          FutureWarning)
-
     def _set_oob_score(self, X, y):
         raise NotImplementedError("OOB score not supported by iforest")
 
@@ -208,6 +201,13 @@ def fit(self, X, y=None, sample_weight=None):
         else:
             self._contamination = self.contamination
 
+        if self.behaviour == 'old':
+            warnings.warn('Default "behaviour" parameter will change to "new" '
+                          'in version 0.22. 
Passing behaviour="new" makes ' + 'IsolationForest decision_function change to match ' + 'other anomaly detection algorithm API.', + FutureWarning) + X = check_array(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 8cfa2263a650b..afc685f393c6a 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -290,7 +290,7 @@ def test_deprecation(): assert_warns_message(FutureWarning, 'Default "behaviour" parameter will change to "new" ' 'in version 0.22', - IsolationForest, ) + clf.fit, X) clf = IsolationForest().fit(X) assert_warns_message(DeprecationWarning, From 3cc43d5872b7e5fb43f93aef152f65535f2c9d6b Mon Sep 17 00:00:00 2001 From: ngoix Date: Wed, 18 Jul 2018 11:05:01 +0200 Subject: [PATCH 3/8] catch behaviour warnings --- sklearn/ensemble/tests/test_iforest.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index afc685f393c6a..8413ddaa516a6 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -66,6 +66,7 @@ def test_iforest(): @pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:threshold_ attribute') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) @@ -95,6 +96,7 @@ def test_iforest_sparse(): @pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:threshold_ attribute') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_iforest_error(): """Test that it gives proper exception on deficient input.""" X = iris.data @@ -139,6 +141,7 @@ def test_iforest_error(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_recalculate_max_depth(): """Check max_depth recalculation when max_samples is reset to n_samples""" X = iris.data @@ -148,6 +151,7 @@ def test_recalculate_max_depth(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_max_samples_attribute(): X = iris.data clf = IsolationForest().fit(X) @@ -165,6 +169,7 @@ def test_max_samples_attribute(): @pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:threshold_ attribute') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) @@ -190,6 +195,7 @@ def test_iforest_parallel_regression(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_iforest_performance(): """Test Isolation Forest performs well""" @@ -232,6 +238,7 @@ def test_iforest_works(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_max_samples_consistency(): # Make sure validated max_samples in iforest and BaseBagging are identical X = iris.data @@ -241,6 +248,7 @@ def test_max_samples_consistency(): @pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:threshold_ attribute') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') 
def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. rng = check_random_state(0) @@ -266,6 +274,7 @@ def test_iforest_average_path_length(): @pytest.mark.filterwarnings('ignore:default contamination') +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_score_samples(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(contamination=0.1).fit(X_train) @@ -278,6 +287,7 @@ def test_score_samples(): clf2.score_samples([[2., 2.]])) +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_deprecation(): X = [[0.0], [1.0]] clf = IsolationForest() @@ -299,6 +309,7 @@ def test_deprecation(): getattr, clf, "threshold_") +@pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_behaviour_param(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(behaviour='old').fit(X_train) From b7eac94f54939b5309dbbed1a900a43b17b12f27 Mon Sep 17 00:00:00 2001 From: ngoix Date: Wed, 18 Jul 2018 11:26:17 +0200 Subject: [PATCH 4/8] catch contam warning --- sklearn/ensemble/tests/test_iforest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 8413ddaa516a6..1781a8ee62896 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -287,6 +287,7 @@ def test_score_samples(): clf2.score_samples([[2., 2.]])) +@pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_deprecation(): X = [[0.0], [1.0]] @@ -309,6 +310,7 @@ def test_deprecation(): getattr, clf, "threshold_") +@pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:Default "behaviour"') def test_behaviour_param(): X_train = [[1, 1], [1, 2], [2, 1]] From ca6848cfc438c47c1718ad205a0de9f27b3d5b36 Mon Sep 17 00:00:00 2001 From: ngoix Date: Wed, 18 Jul 2018 12:01:37 +0200 Subject: [PATCH 5/8] fix docstring travis error --- sklearn/ensemble/iforest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index 57f95457b4782..0fdce6d3289bc 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -89,7 +89,7 @@ class IsolationForest(BaseBagging, OutlierMixin): The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. - behaviour: str, optional (default='old') + behaviour : str, optional (default='old') Accepted values are 'old' or 'new'. Behaviour of the decision_function. Default "behaviour" parameter will change to "new" in version 0.22. Passing behaviour="new" makes the decision_function change to match From bd2619ab5242132af0fa9eeedecd4d0379fb6587 Mon Sep 17 00:00:00 2001 From: ngoix Date: Fri, 20 Jul 2018 09:26:47 +0200 Subject: [PATCH 6/8] depreciate behaviour value instead of default --- doc/whats_new/v0.20.rst | 2 ++ sklearn/ensemble/iforest.py | 10 ++++++---- sklearn/ensemble/tests/test_iforest.py | 24 ++++++++++++------------ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 932c11c658199..07acd8c1d4431 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -908,6 +908,8 @@ Outlier Detection models used. In the new behaviour the ``decision_function`` is dependent on the ``contamination`` parameter, in such a way that 0 becomes its natural threshold to detect outliers. 
+   Setting behaviour to "old" is deprecated and will not be possible in version 0.22.
+   Besides, the ``behaviour`` parameter will be removed in 0.24.
    :issue:`11553` by `Nicolas Goix`_.
 
 Covariance
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 0fdce6d3289bc..e5284dd16ba5b 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -91,7 +91,9 @@ class IsolationForest(BaseBagging, OutlierMixin):
 
     behaviour : str, optional (default='old')
         Accepted values are 'old' or 'new'. Behaviour of the decision_function.
-        Default "behaviour" parameter will change to "new" in version 0.22.
+        Setting behaviour to "old" is deprecated and will not be possible
+        in version 0.22.
+        Besides, the behaviour parameter will be removed in 0.24.
         Passing behaviour="new" makes the decision_function change to match
         other anomaly detection algorithm API, as explained in details in the
         offset_ attribute documentation. Basically, the decision_function
@@ -202,9 +204,9 @@ def fit(self, X, y=None, sample_weight=None):
             self._contamination = self.contamination
 
         if self.behaviour == 'old':
-            warnings.warn('Default "behaviour" parameter will change to "new" '
-                          'in version 0.22. Passing behaviour="new" makes '
-                          'IsolationForest decision_function change to match '
+            warnings.warn('behaviour="old" is deprecated and will be removed '
+                          'in version 0.22. Please use behaviour="new", which '
+                          'makes the decision_function change to match '
                           'other anomaly detection algorithm API.',
                           FutureWarning)
 
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 1781a8ee62896..634f45a25cf4d 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -66,7 +66,7 @@ def test_iforest():
 
 @pytest.mark.filterwarnings('ignore:default contamination')
 @pytest.mark.filterwarnings('ignore:threshold_ attribute')
-@pytest.mark.filterwarnings('ignore:Default "behaviour"')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_sparse():
     """Check IForest for various parameter settings on sparse input."""
     rng = check_random_state(0)
@@ -96,7 +96,7 @@ def test_iforest_sparse():
 
 @pytest.mark.filterwarnings('ignore:default contamination')
 @pytest.mark.filterwarnings('ignore:threshold_ attribute')
-@pytest.mark.filterwarnings('ignore:Default "behaviour"')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_error():
     """Test that it gives proper exception on deficient input."""
     X = iris.data
@@ -141,7 +141,7 @@ def test_iforest_error():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
-@pytest.mark.filterwarnings('ignore:Default "behaviour"')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_recalculate_max_depth():
     """Check max_depth recalculation when max_samples is reset to n_samples"""
     X = iris.data
@@ -151,7 +151,7 @@ def test_recalculate_max_depth():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
-@pytest.mark.filterwarnings('ignore:Default "behaviour"')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_max_samples_attribute():
     X = iris.data
     clf = IsolationForest().fit(X)
@@ -169,7 +169,7 @@ def test_max_samples_attribute():
 
 @pytest.mark.filterwarnings('ignore:default contamination')
 @pytest.mark.filterwarnings('ignore:threshold_ attribute')
-@pytest.mark.filterwarnings('ignore:Default "behaviour"')
+@pytest.mark.filterwarnings('ignore:behaviour="old"')
 def test_iforest_parallel_regression():
     """Check parallel regression."""
     rng = check_random_state(0)
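(Note that these filter strings have to track the exact wording of the warning:
pytest's filterwarnings mark uses the same "action:message" syntax as Python's -W
option, and the message part is matched against the beginning of the warning text,
which is why every 'ignore:Default "behaviour"' above must become
'ignore:behaviour="old"' when the message is reworded. A standalone sketch of the
mechanism, not part of the patch:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        # roughly what @pytest.mark.filterwarnings('ignore:behaviour="old"')
        # installs for the duration of a test:
        warnings.filterwarnings('ignore', message='behaviour="old"')
        warnings.warn('behaviour="old" is deprecated and will be removed '
                      'in version 0.22.', FutureWarning)   # silenced
        warnings.warn('an unrelated warning', UserWarning)  # still recorded
    print([str(w.message) for w in caught])  # ['an unrelated warning']
)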
@@ -195,7 +195,7 @@ def test_iforest_parallel_regression(): @pytest.mark.filterwarnings('ignore:default contamination') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_iforest_performance(): """Test Isolation Forest performs well""" @@ -238,7 +238,7 @@ def test_iforest_works(): @pytest.mark.filterwarnings('ignore:default contamination') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_max_samples_consistency(): # Make sure validated max_samples in iforest and BaseBagging are identical X = iris.data @@ -248,7 +248,7 @@ def test_max_samples_consistency(): @pytest.mark.filterwarnings('ignore:default contamination') @pytest.mark.filterwarnings('ignore:threshold_ attribute') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. rng = check_random_state(0) @@ -274,7 +274,7 @@ def test_iforest_average_path_length(): @pytest.mark.filterwarnings('ignore:default contamination') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_score_samples(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(contamination=0.1).fit(X_train) @@ -288,7 +288,7 @@ def test_score_samples(): @pytest.mark.filterwarnings('ignore:default contamination') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_deprecation(): X = [[0.0], [1.0]] clf = IsolationForest() @@ -299,7 +299,7 @@ def test_deprecation(): clf.fit, X) assert_warns_message(FutureWarning, - 'Default "behaviour" parameter will change to "new" ' + 'behaviour="old" is deprecated and will be removed ' 'in version 0.22', clf.fit, X) @@ -311,7 +311,7 @@ def test_deprecation(): @pytest.mark.filterwarnings('ignore:default contamination') -@pytest.mark.filterwarnings('ignore:Default "behaviour"') +@pytest.mark.filterwarnings('ignore:behaviour="old"') def test_behaviour_param(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(behaviour='old').fit(X_train) From 7afe223fa4cfd798cdab5d34615d68d856fe6df2 Mon Sep 17 00:00:00 2001 From: ngoix Date: Mon, 23 Jul 2018 11:35:14 +0200 Subject: [PATCH 7/8] rm ex + change behaviour doc --- examples/covariance/plot_outlier_detection.py | 129 ------------------ sklearn/ensemble/iforest.py | 20 +++ 2 files changed, 20 insertions(+), 129 deletions(-) delete mode 100644 examples/covariance/plot_outlier_detection.py diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py deleted file mode 100644 index f56bba3eef6c5..0000000000000 --- a/examples/covariance/plot_outlier_detection.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -========================================== -Outlier detection with several methods. -========================================== - -When the amount of contamination is known, this example illustrates three -different ways of performing :ref:`outlier_detection`: - -- based on a robust estimator of covariance, which is assuming that the - data are Gaussian distributed and performs better than the One-Class SVM - in that case. - -- using the One-Class SVM and its ability to capture the shape of the - data set, hence performing better when the data is strongly - non-Gaussian, i.e. 
with two well-separated clusters; - -- using the Isolation Forest algorithm, which is based on random forests and - hence more adapted to large-dimensional settings, even if it performs - quite well in the examples below. - -- using the Local Outlier Factor to measure the local deviation of a given - data point with respect to its neighbors by comparing their local density. - -The ground truth about inliers and outliers is given by the points colors -while the orange-filled area indicates which points are reported as inliers -by each method. - -Here, we assume that we know the fraction of outliers in the datasets. -Thus rather than using the 'predict' method of the objects, we set the -threshold on the decision_function to separate out the corresponding -fraction. -""" - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager - -from sklearn import svm -from sklearn.covariance import EllipticEnvelope -from sklearn.ensemble import IsolationForest -from sklearn.neighbors import LocalOutlierFactor - -print(__doc__) - -SEED = 42 -GRID_PRECISION = 100 - -rng = np.random.RandomState(SEED) - -# Example settings -n_samples = 200 -outliers_fraction = 0.25 -clusters_separation = (0, 1, 2) - -# define two outlier detection tools to be compared -classifiers = { - "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, - kernel="rbf", gamma=0.1), - "Robust covariance": EllipticEnvelope(contamination=outliers_fraction), - "Isolation Forest": IsolationForest(behaviour='new', - max_samples=n_samples, - contamination=outliers_fraction, - random_state=rng), - "Local Outlier Factor": LocalOutlierFactor( - n_neighbors=35, - contamination=outliers_fraction)} - -# Compare given classifiers under given settings -xx, yy = np.meshgrid(np.linspace(-7, 7, GRID_PRECISION), - np.linspace(-7, 7, GRID_PRECISION)) -n_outliers = int(outliers_fraction * n_samples) -n_inliers = n_samples - n_outliers -ground_truth = np.ones(n_samples, dtype=int) -ground_truth[-n_outliers:] = -1 - -# Fit the problem with varying cluster separation -for _, offset in enumerate(clusters_separation): - np.random.seed(SEED) - # Data generation - X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset - X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset - X = np.concatenate([X1, X2], axis=0) - # Add outliers - X = np.concatenate([X, np.random.uniform(low=-6, high=6, - size=(n_outliers, 2))], axis=0) - - # Fit the model - plt.figure(figsize=(9, 7)) - for i, (clf_name, clf) in enumerate(classifiers.items()): - # fit the data and tag outliers - if clf_name == "Local Outlier Factor": - y_pred = clf.fit_predict(X) - scores_pred = clf.negative_outlier_factor_ - else: - clf.fit(X) - scores_pred = clf.decision_function(X) - y_pred = clf.predict(X) - n_errors = (y_pred != ground_truth).sum() - # plot the levels lines and the points - if clf_name == "Local Outlier Factor": - # decision_function is private for LOF - Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()]) - else: - Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) - Z = Z.reshape(xx.shape) - subplot = plt.subplot(2, 2, i + 1) - subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), - cmap=plt.cm.Blues_r) - a = subplot.contour(xx, yy, Z, levels=[0], - linewidths=2, colors='red') - subplot.contourf(xx, yy, Z, levels=[0, Z.max()], - colors='orange') - b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', - s=20, edgecolor='k') - c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', - s=20, 
edgecolor='k')
-        subplot.axis('tight')
-        subplot.legend(
-            [a.collections[0], b, c],
-            ['learned decision function', 'true inliers', 'true outliers'],
-            prop=matplotlib.font_manager.FontProperties(size=10),
-            loc='lower right')
-        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
-        subplot.set_xlim((-7, 7))
-        subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
-    plt.suptitle("Outlier detection")
-
-plt.show()
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index e5284dd16ba5b..56f6c53369131 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -89,6 +89,26 @@ class IsolationForest(BaseBagging, OutlierMixin):
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
 
+    behaviour : str, default='old'
+        Behaviour of the ``decision_function``, which can be either 'old' or
+        'new'. Passing ``behaviour='new'`` makes the ``decision_function``
+        change to match other anomaly detection algorithm API, which will be
+        the default behaviour in the future. As explained in detail in the
+        ``offset_`` attribute documentation, the ``decision_function`` becomes
+        dependent on the contamination parameter, in such a way that 0 becomes
+        its natural threshold to detect outliers.
+
+        .. versionadded:: 0.20
+           ``behaviour`` is added in 0.20 for backward compatibility.
+
+        .. deprecated:: 0.20
+           ``behaviour='old'`` is deprecated in 0.20 and will not be possible
+           in 0.22.
+
+        .. deprecated:: 0.22
+           ``behaviour`` parameter will be deprecated in 0.22 and removed in
+           0.24.
+
     behaviour : str, optional (default='old')
         Accepted values are 'old' or 'new'. Behaviour of the decision_function.
         Setting behaviour to "old" is deprecated and will not be possible

From 05c905d3756558f0dbec7eb2605dbd94b8a088b6 Mon Sep 17 00:00:00 2001
From: ngoix
Date: Mon, 23 Jul 2018 12:07:49 +0200
Subject: [PATCH 8/8] rm dupl doc

---
 sklearn/ensemble/iforest.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 56f6c53369131..97e60d755ad3e 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -109,17 +109,6 @@ class IsolationForest(BaseBagging, OutlierMixin):
            ``behaviour`` parameter will be deprecated in 0.22 and removed in
            0.24.
 
-    behaviour : str, optional (default='old')
-        Accepted values are 'old' or 'new'. Behaviour of the decision_function.
-        Setting behaviour to "old" is deprecated and will not be possible
-        in version 0.22.
-        Besides, the behaviour parameter will be removed in 0.24.
-        Passing behaviour="new" makes the decision_function change to match
-        other anomaly detection algorithm API, as explained in details in the
-        offset_ attribute documentation. Basically, the decision_function
-        becomes dependent on the contamination parameter, in such a way that
-        0 becomes its natural threshold to detect outliers.
-
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
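End state of the series, as a hypothetical usage sketch (assuming a scikit-learn
0.20 build with all eight patches applied; the toy data and the printed predictions
are illustrative only):

    import numpy as np
    from sklearn.ensemble import IsolationForest

    X = np.array([[-1.1], [0.3], [0.5], [100.0]])

    # behaviour='new': contamination='auto' is allowed, offset_ is -0.5,
    # and predict() flags exactly the points with decision_function < 0.
    clf = IsolationForest(behaviour='new', contamination='auto',
                          random_state=0).fit(X)
    print(clf.offset_)                   # -0.5
    print(clf.predict([[0.1], [90.0]]))  # e.g. [ 1 -1]

    # In both modes, decision_function(X) == score_samples(X) - offset_:
    np.testing.assert_allclose(clf.decision_function(X),
                               clf.score_samples(X) - clf.offset_)

    # behaviour='old' (the default): fit() emits the FutureWarning introduced
    # above, contamination='auto' raises a ValueError, and the deprecated
    # threshold_ attribute is still exposed with a DeprecationWarning;
    # with behaviour='new', accessing it raises an AttributeError instead.
    old = IsolationForest(behaviour='old', contamination=0.25,
                          random_state=0).fit(X)  # FutureWarning
    print(old.threshold_)                # DeprecationWarning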