diff --git a/doc/conf.py b/doc/conf.py index aea5d52b53da4..1113d4b2c100a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -491,6 +491,9 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/ensemble/plot_forest_importances_faces": ( "auto_examples/ensemble/plot_forest_importances" ), + "auto_examples/ensemble/plot_voting_probas": ( + "auto_examples/ensemble/plot_voting_decision_regions" + ), "auto_examples/datasets/plot_iris_dataset": ( "auto_examples/decomposition/plot_pca_iris" ), diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 35ef9f6d7bbfc..b336a25d8048d 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1410,40 +1410,17 @@ classifier 3 w3 * 0.3 w3 * 0.4 w3 * 0.3 weighted average 0.37 0.4 0.23 ================ ========== ========== ========== -Here, the predicted class label is 2, since it has the highest average probability. See -this example on :ref:`Visualising class probabilities in a Voting Classifier -` for a detailed illustration of -class probabilities averaged by soft voting. +Here, the predicted class label is 2, since it has the highest average +predicted probability. See the example on +:ref:`sphx_glr_auto_examples_ensemble_plot_voting_decision_regions.py` for a +demonstration of how the predicted class label can be obtained from the weighted +average of predicted probabilities. 
-Also, the following example illustrates how the decision regions may change -when a soft :class:`VotingClassifier` is used based on a linear Support -Vector Machine, a Decision Tree, and a K-nearest neighbor classifier:: +The following figure illustrates how the decision regions may change when +a soft :class:`VotingClassifier` is trained with weights on three linear +models: - >>> from sklearn import datasets - >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.neighbors import KNeighborsClassifier - >>> from sklearn.svm import SVC - >>> from itertools import product - >>> from sklearn.ensemble import VotingClassifier - - >>> # Loading some example data - >>> iris = datasets.load_iris() - >>> X = iris.data[:, [0, 2]] - >>> y = iris.target - - >>> # Training classifiers - >>> clf1 = DecisionTreeClassifier(max_depth=4) - >>> clf2 = KNeighborsClassifier(n_neighbors=7) - >>> clf3 = SVC(kernel='rbf', probability=True) - >>> eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], - ... voting='soft', weights=[2, 1, 2]) - - >>> clf1 = clf1.fit(X, y) - >>> clf2 = clf2.fit(X, y) - >>> clf3 = clf3.fit(X, y) - >>> eclf = eclf.fit(X, y) - -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_voting_decision_regions_001.png +.. 
figure:: ../auto_examples/ensemble/images/sphx_glr_plot_voting_decision_regions_002.png :target: ../auto_examples/ensemble/plot_voting_decision_regions.html :align: center :scale: 75% diff --git a/examples/ensemble/plot_voting_decision_regions.py b/examples/ensemble/plot_voting_decision_regions.py index d40d831fb911f..57f3f4b22b947 100644 --- a/examples/ensemble/plot_voting_decision_regions.py +++ b/examples/ensemble/plot_voting_decision_regions.py @@ -1,55 +1,111 @@ """ -================================================== -Plot the decision boundaries of a VotingClassifier -================================================== +=============================================================== +Visualizing the probabilistic predictions of a VotingClassifier +=============================================================== .. currentmodule:: sklearn -Plot the decision boundaries of a :class:`~ensemble.VotingClassifier` for two -features of the Iris dataset. +Plot the predicted class probabilities in a toy dataset predicted by three +different classifiers and averaged by the :class:`~ensemble.VotingClassifier`. -Plot the class probabilities of the first sample in a toy dataset predicted by -three different classifiers and averaged by the -:class:`~ensemble.VotingClassifier`. +First, three linear classifiers are initialized. Two are spline models with +interaction terms, one using constant extrapolation and the other using periodic +extrapolation. The third classifier is a :class:`~kernel_approximation.Nystroem` +with the default "rbf" kernel. 
-First, three exemplary classifiers are initialized -(:class:`~tree.DecisionTreeClassifier`, -:class:`~neighbors.KNeighborsClassifier`, and :class:`~svm.SVC`) and used to -initialize a soft-voting :class:`~ensemble.VotingClassifier` with weights `[2, -1, 2]`, which means that the predicted probabilities of the -:class:`~tree.DecisionTreeClassifier` and :class:`~svm.SVC` each count 2 times -as much as the weights of the :class:`~neighbors.KNeighborsClassifier` -classifier when the averaged probability is calculated. +In the first part of this example, these three classifiers are used to +demonstrate soft-voting using :class:`~ensemble.VotingClassifier` with weighted +average. We set `weights=[2, 1, 3]`, meaning the constant extrapolation spline +model's predictions are weighted twice as much as the periodic spline model's, +and the Nystroem model's predictions are weighted three times as much as the +periodic spline. + +The second part demonstrates how soft predictions can be converted into hard +predictions. """ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from itertools import product +# %% +# We first generate a noisy XOR dataset, which is a binary classification task. 
import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from matplotlib.colors import ListedColormap + +n_samples = 500 +rng = np.random.default_rng(0) +feature_names = ["Feature #0", "Feature #1"] +common_scatter_plot_params = dict( + cmap=ListedColormap(["tab:red", "tab:blue"]), + edgecolor="white", + linewidth=1, +) + +xor = pd.DataFrame( + np.random.RandomState(0).uniform(low=-1, high=1, size=(n_samples, 2)), + columns=feature_names, +) +noise = rng.normal(loc=0, scale=0.1, size=(n_samples, 2)) +target_xor = np.logical_xor( + xor["Feature #0"] + noise[:, 0] > 0, xor["Feature #1"] + noise[:, 1] > 0 +) + +X = xor[feature_names] +y = target_xor.astype(np.int32) + +fig, ax = plt.subplots() +ax.scatter(X["Feature #0"], X["Feature #1"], c=y, **common_scatter_plot_params) +ax.set_title("The XOR dataset") +plt.show() + +# %% +# Because the XOR dataset is inherently not linearly separable, tree-based +# models would often be preferred. However, appropriate feature engineering +# combined with a linear model can yield effective results, with the added +# benefit of producing better-calibrated probabilities for samples located in +# the transition regions affected by noise. +# +# We define and fit the models on the whole dataset. 
-from sklearn import datasets from sklearn.ensemble import VotingClassifier -from sklearn.inspection import DecisionBoundaryDisplay -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier - -# Loading some example data -iris = datasets.load_iris() -X = iris.data[:, [0, 2]] -y = iris.target - -# Training classifiers -clf1 = DecisionTreeClassifier(max_depth=4) -clf2 = KNeighborsClassifier(n_neighbors=7) -clf3 = SVC(gamma=0.1, kernel="rbf", probability=True) +from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler + +clf1 = make_pipeline( + SplineTransformer(degree=2, n_knots=2), + PolynomialFeatures(interaction_only=True), + LogisticRegression(C=10), +) +clf2 = make_pipeline( + SplineTransformer( + degree=2, + n_knots=4, + extrapolation="periodic", + include_bias=True, + ), + PolynomialFeatures(interaction_only=True), + LogisticRegression(C=10), +) +clf3 = make_pipeline( + StandardScaler(), + Nystroem(gamma=2, random_state=0), + LogisticRegression(C=10), +) +weights = [2, 1, 3] eclf = VotingClassifier( - estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)], + estimators=[ + ("constant splines model", clf1), + ("periodic splines model", clf2), + ("nystroem model", clf3), + ], voting="soft", - weights=[2, 1, 2], + weights=weights, ) clf1.fit(X, y) @@ -57,17 +113,106 @@ clf3.fit(X, y) eclf.fit(X, y) -# Plotting decision regions -f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8)) -for idx, clf, tt in zip( +# %% +# Finally we use :class:`~inspection.DecisionBoundaryDisplay` to plot the +# predicted probabilities. 
By using a diverging colormap (such as `"RdBu"`), we +# can ensure that darker colors correspond to `predict_proba` close to either 0 +# or 1, and white corresponds to `predict_proba` of 0.5. + +from itertools import product + +from sklearn.inspection import DecisionBoundaryDisplay + +fig, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8)) +for idx, clf, title in zip( product([0, 1], [0, 1]), [clf1, clf2, clf3, eclf], - ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"], + [ + "Splines with\nconstant extrapolation", + "Splines with\nperiodic extrapolation", + "RBF Nystroem", + "Soft Voting", + ], ): - DecisionBoundaryDisplay.from_estimator( - clf, X, alpha=0.4, ax=axarr[idx[0], idx[1]], response_method="predict" + disp = DecisionBoundaryDisplay.from_estimator( + clf, + X, + response_method="predict_proba", + plot_method="pcolormesh", + cmap="RdBu", + alpha=0.8, + ax=axarr[idx[0], idx[1]], + ) + axarr[idx[0], idx[1]].scatter( + X["Feature #0"], + X["Feature #1"], + c=y, + **common_scatter_plot_params, ) - axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k") - axarr[idx[0], idx[1]].set_title(tt) + axarr[idx[0], idx[1]].set_title(title) + fig.colorbar(disp.surface_, ax=axarr[idx[0], idx[1]], label="Probability estimate") plt.show() + +# %% +# As a sanity check, we can verify for a given sample that the probability +# predicted by the :class:`~ensemble.VotingClassifier` is indeed the weighted +# average of the individual classifiers' soft-predictions. +# +# In the case of binary classification such as in the present example, the +# :term:`predict_proba` arrays contain the probability of belonging to class 0 +# (here in red) as the first entry, and the probability of belonging to class 1 +# (here in blue) as the second entry. 
+ +test_sample = pd.DataFrame({"Feature #0": [-0.5], "Feature #1": [1.5]}) +predict_probas = [est.predict_proba(test_sample).ravel() for est in eclf.estimators_] +for (est_name, _), est_probas in zip(eclf.estimators, predict_probas): + print(f"{est_name}'s predicted probabilities: {est_probas}") + +# %% +print( + "Weighted average of soft-predictions: " + f"{np.dot(weights, predict_probas) / np.sum(weights)}" +) + +# %% +# We can see that manual calculation of predicted probabilities above is +# equivalent to that produced by the `VotingClassifier`: + +print( + "Predicted probability of VotingClassifier: " + f"{eclf.predict_proba(test_sample).ravel()}" +) + +# %% +# To convert soft predictions into hard predictions when weights are provided, +# the weighted average predicted probabilities are computed for each class. +# Then, the final class label is derived from the class with the +# highest weighted average probability, which corresponds to the default threshold at +# `predict_proba=0.5` in the case of binary classification. + +print( + "Class with the highest weighted average of soft-predictions: " + f"{np.argmax(np.dot(weights, predict_probas) / np.sum(weights))}" +) + +# %% +# This is equivalent to the output of `VotingClassifier`'s `predict` method: + +print(f"Predicted class of VotingClassifier: {eclf.predict(test_sample).ravel()}") + +# %% +# Soft votes can be thresholded as for any other probabilistic classifier. This +# allows you to set a threshold probability at which the positive class will be +# predicted, instead of simply selecting the class with the highest predicted +# probability. 
+ +from sklearn.model_selection import FixedThresholdClassifier + +eclf_other_threshold = FixedThresholdClassifier( + eclf, threshold=0.7, response_method="predict_proba" +).fit(X, y) +print( + "Predicted class of thresholded VotingClassifier: " + f"{eclf_other_threshold.predict(test_sample)}" +) diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py deleted file mode 100644 index 848358ca1d208..0000000000000 --- a/examples/ensemble/plot_voting_probas.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -=========================================================== -Plot class probabilities calculated by the VotingClassifier -=========================================================== - -.. currentmodule:: sklearn - -Plot the class probabilities of the first sample in a toy dataset predicted by -three different classifiers and averaged by the -:class:`~ensemble.VotingClassifier`. - -First, three exemplary classifiers are initialized -(:class:`~linear_model.LogisticRegression`, :class:`~naive_bayes.GaussianNB`, -and :class:`~ensemble.RandomForestClassifier`) and used to initialize a -soft-voting :class:`~ensemble.VotingClassifier` with weights `[1, 1, 5]`, which -means that the predicted probabilities of the -:class:`~ensemble.RandomForestClassifier` count 5 times as much as the weights -of the other classifiers when the averaged probability is calculated. - -To visualize the probability weighting, we fit each classifier on the training -set and plot the predicted class probabilities for the first sample in this -example dataset. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn.ensemble import RandomForestClassifier, VotingClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.naive_bayes import GaussianNB - -clf1 = LogisticRegression(max_iter=1000, random_state=123) -clf2 = RandomForestClassifier(n_estimators=100, random_state=123) -clf3 = GaussianNB() -X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) -y = np.array([1, 1, 2, 2]) - -eclf = VotingClassifier( - estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], - voting="soft", - weights=[1, 1, 5], -) - -# predict class probabilities for all classifiers -probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf2, clf3, eclf)] - -# get class probabilities for the first sample in the dataset -class1_1 = [pr[0, 0] for pr in probas] -class2_1 = [pr[0, 1] for pr in probas] - - -# plotting - -N = 4 # number of groups -ind = np.arange(N) # group positions -width = 0.35 # bar width - -fig, ax = plt.subplots() - -# bars for classifier 1-3 -p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width, color="green", edgecolor="k") -p2 = ax.bar( - ind + width, - np.hstack(([class2_1[:-1], [0]])), - width, - color="lightgreen", - edgecolor="k", -) - -# bars for VotingClassifier -p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k") -p4 = ax.bar( - ind + width, [0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k" -) - -# plot annotations -plt.axvline(2.8, color="k", linestyle="dashed") -ax.set_xticks(ind + width) -ax.set_xticklabels( - [ - "LogisticRegression\nweight 1", - "GaussianNB\nweight 1", - "RandomForestClassifier\nweight 5", - "VotingClassifier\n(average probabilities)", - ], - rotation=40, - ha="right", -) -plt.ylim([0, 1]) -plt.title("Class probabilities for sample 1 by different classifiers") -plt.legend([p1[0], p2[0]], ["class 1", "class 2"], 
loc="upper left") -plt.tight_layout() -plt.show()