From 0f7247312eab04a722bce331e9f8a3c4df8c35f8 Mon Sep 17 00:00:00 2001
From: Dowon
Date: Fri, 21 Dec 2018 09:32:37 +0900
Subject: [PATCH 1/9] DOC improving an import convenience

---
 sklearn/compose/_column_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 1e020cb95068c..2e091fb18887b 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -144,6 +144,7 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
 
     Examples
     --------
+    >>> import numpy as np
     >>> from sklearn.compose import ColumnTransformer
     >>> from sklearn.preprocessing import Normalizer
     >>> ct = ColumnTransformer(

From c3516ed7bbdef0cf6838abe404f8ed003d6f1a44 Mon Sep 17 00:00:00 2001
From: Dowon
Date: Fri, 21 Dec 2018 13:49:01 +0900
Subject: [PATCH 2/9] adding a missing example file

---
 examples/covariance/plot_outlier_detection.py | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 examples/covariance/plot_outlier_detection.py

diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py
new file mode 100644
index 0000000000000..4c6ea43418b88
--- /dev/null
+++ b/examples/covariance/plot_outlier_detection.py
@@ -0,0 +1,129 @@
+"""
+==========================================
+Outlier detection with several methods
+==========================================
+
+When the amount of contamination is known, this example illustrates four
+different ways of performing :ref:`outlier_detection`:
+
+- based on a robust estimator of covariance, which assumes that the
+  data are Gaussian distributed and performs better than the One-Class SVM
+  in that case;
+
+- using the One-Class SVM and its ability to capture the shape of the
+  data set, hence performing better when the data is strongly
+  non-Gaussian, i.e. with two well-separated clusters;
+
+- using the Isolation Forest algorithm, which is based on random forests and
+  hence better suited to high-dimensional settings, even though it performs
+  quite well in the low-dimensional examples below;
+
+- using the Local Outlier Factor to measure the local deviation of a given
+  data point with respect to its neighbors by comparing their local density.
+
+The ground truth about inliers and outliers is given by the point colors,
+while the orange-filled area indicates which points are reported as inliers
+by each method.
+
+Here, we assume that we know the fraction of outliers in the datasets,
+and pass it to each estimator (via 'contamination', or 'nu' for the
+One-Class SVM) so that its 'predict' method sets the threshold on the
+decision_function to separate out the corresponding fraction.
+"""
+
+import numpy as np
+from scipy import stats
+import matplotlib.pyplot as plt
+import matplotlib.font_manager
+
+from sklearn import svm
+from sklearn.covariance import EllipticEnvelope
+from sklearn.ensemble import IsolationForest
+from sklearn.neighbors import LocalOutlierFactor
+
+print(__doc__)
+
+SEED = 42
+GRID_PRECISION = 100
+
+rng = np.random.RandomState(SEED)
+
+# Example settings
+n_samples = 200
+outliers_fraction = 0.25
+clusters_separation = (0, 1, 2)
+
+# define four outlier detection tools to be compared
+classifiers = {
+    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
+                                     kernel="rbf", gamma=0.1),
+    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
+    "Isolation Forest": IsolationForest(max_samples=n_samples,
+                                        contamination=outliers_fraction,
+                                        random_state=rng),
+    "Local Outlier Factor": LocalOutlierFactor(
+        n_neighbors=35,
+        contamination=outliers_fraction)}
+
+# Compare given classifiers under given settings
+xx, yy = np.meshgrid(np.linspace(-7, 7, GRID_PRECISION),
+                     np.linspace(-7, 7, GRID_PRECISION))
+n_outliers = int(outliers_fraction * n_samples)
+n_inliers = n_samples - n_outliers
+ground_truth = np.ones(n_samples, dtype=int)
+ground_truth[-n_outliers:] = -1
+
+# Fit the estimators with varying cluster separation
+for offset in clusters_separation:
+    np.random.seed(SEED)
+    # Data generation
+    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
+    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
+    X = np.concatenate([X1, X2], axis=0)
+    # Add outliers
+    X = np.concatenate([X, np.random.uniform(low=-6, high=6,
+                                             size=(n_outliers, 2))], axis=0)
+
+    # Fit the model
+    plt.figure(figsize=(9, 7))
+    for i, (clf_name, clf) in enumerate(classifiers.items()):
+        # fit the data and tag outliers
+        if clf_name == "Local Outlier Factor":
+            y_pred = clf.fit_predict(X)
+            scores_pred = clf.negative_outlier_factor_
+        else:
+            clf.fit(X)
+            scores_pred = clf.decision_function(X)
+            y_pred = clf.predict(X)
+        n_errors = (y_pred != ground_truth).sum()
+        # plot the level lines and the points
+        if clf_name == "Local Outlier Factor":
+            # decision_function is private for LOF
+            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        Z = Z.reshape(xx.shape)
+        subplot = plt.subplot(2, 2, i + 1)
+        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7),
+                         cmap=plt.cm.Blues_r)
+        a = subplot.contour(xx, yy, Z, levels=[0],
+                            linewidths=2, colors='red')
+        subplot.contourf(xx, yy, Z, levels=[0, Z.max()],
+                         colors='orange')
+        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
+                            s=20, edgecolor='k')
+        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
+                            s=20, edgecolor='k')
+        subplot.axis('tight')
+        subplot.legend(
+            [a.collections[0], b, c],
+            ['learned decision function', 'true inliers', 'true outliers'],
+            prop=matplotlib.font_manager.FontProperties(size=10),
+            loc='lower right')
+        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
+        subplot.set_xlim((-7, 7))
+        subplot.set_ylim((-7, 7))
+    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
+    plt.suptitle("Outlier detection")
+
+plt.show()

From f865c152cbc5b07c1e42741a3229f91bb9a1a70a Mon Sep 17 00:00:00 2001
From: unknown
Date: Fri, 21 Dec 2018 14:03:53 +0900
Subject: [PATCH 3/9] Revert "DOC improving an import convenience"

This reverts commit 0f7247312eab04a722bce331e9f8a3c4df8c35f8.
---
 sklearn/compose/_column_transformer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 2e091fb18887b..1e020cb95068c 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -144,7 +144,6 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
 
     Examples
     --------
-    >>> import numpy as np
     >>> from sklearn.compose import ColumnTransformer
     >>> from sklearn.preprocessing import Normalizer
     >>> ct = ColumnTransformer(

From f7ff6d11efb81eb229fb0d87dbf69875d99c1bf0 Mon Sep 17 00:00:00 2001
From: Dowon
Date: Fri, 21 Dec 2018 14:09:05 +0900
Subject: [PATCH 4/9] unused imported library remove

---
 examples/covariance/plot_outlier_detection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py
index 4c6ea43418b88..74dcf6db67fcc 100644
--- a/examples/covariance/plot_outlier_detection.py
+++ b/examples/covariance/plot_outlier_detection.py
@@ -32,7 +32,6 @@
 """
 
 import numpy as np
-from scipy import stats
 import matplotlib.pyplot as plt
 import matplotlib.font_manager
 

From ea8d85cd2bd2043b05a02c5d061e0a92ba6db4df Mon Sep 17 00:00:00 2001
From: Dowon
Date: Tue, 25 Dec 2018 02:35:17 +0900
Subject: [PATCH 5/9] roll back

---
 examples/covariance/plot_outlier_detection.py | 128 ------------------
 1 file changed, 128 deletions(-)
 delete mode 100644 examples/covariance/plot_outlier_detection.py

diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py
deleted file mode 100644
index 74dcf6db67fcc..0000000000000
--- a/examples/covariance/plot_outlier_detection.py
+++ /dev/null
@@ -1,128 +0,0 @@
-"""
-==========================================
-Outlier detection with several methods
-==========================================
-
-When the amount of contamination is known, this example illustrates four
-different ways of performing :ref:`outlier_detection`:
-
-- based on a robust estimator of covariance, which assumes that the
-  data are Gaussian distributed and performs better than the One-Class SVM
-  in that case;
-
-- using the One-Class SVM and its ability to capture the shape of the
-  data set, hence performing better when the data is strongly
-  non-Gaussian, i.e. with two well-separated clusters;
-
-- using the Isolation Forest algorithm, which is based on random forests and
-  hence better suited to high-dimensional settings, even though it performs
-  quite well in the low-dimensional examples below;
-
-- using the Local Outlier Factor to measure the local deviation of a given
-  data point with respect to its neighbors by comparing their local density.
-
-The ground truth about inliers and outliers is given by the point colors,
-while the orange-filled area indicates which points are reported as inliers
-by each method.
-
-Here, we assume that we know the fraction of outliers in the datasets,
-and pass it to each estimator (via 'contamination', or 'nu' for the
-One-Class SVM) so that its 'predict' method sets the threshold on the
-decision_function to separate out the corresponding fraction.
-"""
-
-import numpy as np
-import matplotlib.pyplot as plt
-import matplotlib.font_manager
-
-from sklearn import svm
-from sklearn.covariance import EllipticEnvelope
-from sklearn.ensemble import IsolationForest
-from sklearn.neighbors import LocalOutlierFactor
-
-print(__doc__)
-
-SEED = 42
-GRID_PRECISION = 100
-
-rng = np.random.RandomState(SEED)
-
-# Example settings
-n_samples = 200
-outliers_fraction = 0.25
-clusters_separation = (0, 1, 2)
-
-# define four outlier detection tools to be compared
-classifiers = {
-    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
-                                     kernel="rbf", gamma=0.1),
-    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
-    "Isolation Forest": IsolationForest(max_samples=n_samples,
-                                        contamination=outliers_fraction,
-                                        random_state=rng),
-    "Local Outlier Factor": LocalOutlierFactor(
-        n_neighbors=35,
-        contamination=outliers_fraction)}
-
-# Compare given classifiers under given settings
-xx, yy = np.meshgrid(np.linspace(-7, 7, GRID_PRECISION),
-                     np.linspace(-7, 7, GRID_PRECISION))
-n_outliers = int(outliers_fraction * n_samples)
-n_inliers = n_samples - n_outliers
-ground_truth = np.ones(n_samples, dtype=int)
-ground_truth[-n_outliers:] = -1
-
-# Fit the estimators with varying cluster separation
-for offset in clusters_separation:
-    np.random.seed(SEED)
-    # Data generation
-    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
-    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
-    X = np.concatenate([X1, X2], axis=0)
-    # Add outliers
-    X = np.concatenate([X, np.random.uniform(low=-6, high=6,
-                                             size=(n_outliers, 2))], axis=0)
-
-    # Fit the model
-    plt.figure(figsize=(9, 7))
-    for i, (clf_name, clf) in enumerate(classifiers.items()):
-        # fit the data and tag outliers
-        if clf_name == "Local Outlier Factor":
-            y_pred = clf.fit_predict(X)
-            scores_pred = clf.negative_outlier_factor_
-        else:
-            clf.fit(X)
-            scores_pred = clf.decision_function(X)
-            y_pred = clf.predict(X)
-        n_errors = (y_pred != ground_truth).sum()
-        # plot the level lines and the points
-        if clf_name == "Local Outlier Factor":
-            # decision_function is private for LOF
-            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
-        else:
-            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
-        Z = Z.reshape(xx.shape)
-        subplot = plt.subplot(2, 2, i + 1)
-        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7),
-                         cmap=plt.cm.Blues_r)
-        a = subplot.contour(xx, yy, Z, levels=[0],
-                            linewidths=2, colors='red')
-        subplot.contourf(xx, yy, Z, levels=[0, Z.max()],
-                         colors='orange')
-        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
-                            s=20, edgecolor='k')
-        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
-                            s=20, edgecolor='k')
-        subplot.axis('tight')
-        subplot.legend(
-            [a.collections[0], b, c],
-            ['learned decision function', 'true inliers', 'true outliers'],
-            prop=matplotlib.font_manager.FontProperties(size=10),
-            loc='lower right')
-        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
-        subplot.set_xlim((-7, 7))
-        subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
-    plt.suptitle("Outlier detection")
-
-plt.show()

From d0c229d508396590c15d75a53302269feb60f0d2 Mon Sep 17 00:00:00 2001
From: Dowon
Date: Tue, 25 Dec 2018 02:38:13 +0900
Subject: [PATCH 6/9] FIX: thumbs image update

---
 doc/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/conf.py b/doc/conf.py
index e829a429a4b7b..683f15aced681 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -253,7 +253,7 @@
 # key: first image in set
 # values: (number of plot in set, height of thumbnail)
 carousel_thumbs = {'sphx_glr_plot_classifier_comparison_001.png': 600,
-                   'sphx_glr_plot_outlier_detection_003.png': 372,
+                   'sphx_glr_plot_lof_novelty_detection_001.png': 372,
                    'sphx_glr_plot_gpr_co2_001.png': 350,
                    'sphx_glr_plot_adaboost_twoclass_001.png': 372,
                    'sphx_glr_plot_compare_methods_001.png': 349}

From e7e97600e2775dbf132b061d89b4b4da64c36ff4 Mon Sep 17 00:00:00 2001
From: Dowon
Date: Tue, 25 Dec 2018 02:39:09 +0900
Subject: [PATCH 7/9] FIX: missing link & new thumbs image updates

---
 doc/themes/scikit-learn/layout.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/themes/scikit-learn/layout.html b/doc/themes/scikit-learn/layout.html
index 91c9b7336e095..3c3c4875463f3 100644
--- a/doc/themes/scikit-learn/layout.html
+++ b/doc/themes/scikit-learn/layout.html
@@ -149,8 +149,8 @@
-            <img src="_images/sphx_glr_plot_outlier_detection_0031.png"
-                 style="max-height: 200px; max-width: 629px; margin-left: -21px;">
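A side note on the example removed in [PATCH 5/9]: it still computes scores_pred even though nothing uses the value, a leftover of the original strategy of thresholding the scores by the known outlier fraction instead of calling 'predict'. For reference, that strategy reduces to percentile thresholding. The sketch below assumes only that lower scores mean more anomalous, as with decision_function; label_by_fraction is a hypothetical helper, not part of any patch in this series.

import numpy as np


def label_by_fraction(scores, outliers_fraction):
    # Hypothetical helper: label the lowest-scoring fraction of samples
    # as outliers (-1) and the rest as inliers (+1). This roughly mimics
    # what the 'contamination' parameter does for the detectors' predict.
    threshold = np.percentile(scores, 100 * outliers_fraction)
    return np.where(scores > threshold, 1, -1)


# Usage: with outliers_fraction=0.25, roughly the lowest quarter of the
# scores is labeled -1.
scores = np.array([0.9, 0.8, -0.5, 0.7, 0.6, -0.1, 0.5, 0.4, 0.3, -0.9])
print(label_by_fraction(scores, 0.25))  # [ 1  1 -1  1  1 -1  1  1  1 -1]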