DOC remove redundant example multiclass logistic regression #29966

Merged · 4 commits · Oct 22, 2024
3 changes: 3 additions & 0 deletions doc/conf.py
@@ -488,6 +488,9 @@ def add_js_css_files(app, pagename, templatename, context, doctree):
"auto_examples/datasets/plot_iris_dataset": (
"auto_examples/decomposition/plot_pca_iris"
),
"auto_examples/linear_model/plot_iris_logistic": (
"auto_examples/linear_model/plot_logistic_multinomial"
),
"auto_examples/linear_model/plot_ols_3d": ("auto_examples/linear_model/plot_ols"),
}
html_context["redirects"] = redirects
52 changes: 0 additions & 52 deletions examples/linear_model/plot_iris_logistic.py

This file was deleted.

209 changes: 166 additions & 43 deletions examples/linear_model/plot_logistic_multinomial.py
@@ -1,70 +1,193 @@
"""
======================================================================
Decision Boundaries of Multinomial and One-vs-Rest Logistic Regression
======================================================================

This example compares decision boundaries of multinomial and one-vs-rest
logistic regression on a 2D dataset with three classes.

We compare the decision boundaries of both methods, which is equivalent to
calling the method `predict`. In addition, we plot the hyperplanes that
correspond to the lines where the probability estimate for a class is 0.5.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# Dataset Generation
# ------------------
#
# We generate a synthetic dataset using the :func:`~sklearn.datasets.make_blobs` function.
# The dataset consists of 1,000 samples from three different classes,
# centered around [-5, 0], [0, 1.5], and [5, -1]. After generation, we apply a linear
# transformation to introduce some correlation between features and make the problem
# more challenging. This results in a 2D dataset with three overlapping classes,
# suitable for demonstrating the differences between multinomial and one-vs-rest
# logistic regression.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_blobs

centers = [[-5, 0], [0, 1.5], [5, -1]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=40)
transformation = [[0.4, 0.2], [-0.4, 1.2]]
X = np.dot(X, transformation)

fig, ax = plt.subplots(figsize=(6, 4))

scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="black")
ax.set(title="Synthetic Dataset", xlabel="Feature 1", ylabel="Feature 2")
_ = ax.legend(*scatter.legend_elements(), title="Classes")


# %%
# Classifier Training
# -------------------
#
# We train two different logistic regression classifiers: multinomial and one-vs-rest.
# The multinomial classifier handles all classes simultaneously, while the one-vs-rest
# approach trains a binary classifier for each class against all others.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logistic_regression_multinomial = LogisticRegression().fit(X, y)
logistic_regression_ovr = OneVsRestClassifier(LogisticRegression()).fit(X, y)

accuracy_multinomial = logistic_regression_multinomial.score(X, y)
accuracy_ovr = logistic_regression_ovr.score(X, y)
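
# %%
# As a quick, illustrative check (not part of the original example), we can inspect
# the fitted models: the multinomial classifier stores one coefficient row per class
# in `coef_`, while the one-vs-rest wrapper stores one binary classifier per class in
# `estimators_`.
print("multinomial coef_ shape:", logistic_regression_multinomial.coef_.shape)
print(
    "number of one-vs-rest binary classifiers:",
    len(logistic_regression_ovr.estimators_),
)
for class_label, estimator in zip(
    logistic_regression_ovr.classes_, logistic_regression_ovr.estimators_
):
    print(f"class {class_label} vs rest -> coef_ shape: {estimator.coef_.shape}")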

# %%
# Decision Boundaries Visualization
# ---------------------------------
#
# Let's visualize the decision boundaries of both models, as given by the
# `predict` method of the classifiers.
from sklearn.inspection import DecisionBoundaryDisplay

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

for model, title, ax in [
    (
        logistic_regression_multinomial,
        f"Multinomial Logistic Regression\n(Accuracy: {accuracy_multinomial:.3f})",
        ax1,
    ),
    (
        logistic_regression_ovr,
        f"One-vs-Rest Logistic Regression\n(Accuracy: {accuracy_ovr:.3f})",
        ax2,
    ),
]:
    DecisionBoundaryDisplay.from_estimator(
        model,
        X,
        ax=ax,
        response_method="predict",
        alpha=0.8,
    )
    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
    legend = ax.legend(*scatter.legend_elements(), title="Classes")
    ax.add_artist(legend)
    ax.set_title(title)


# %%
# We see that the decision boundaries are different. This difference stems from their
# approaches:
#
# - Multinomial logistic regression considers all classes simultaneously during
# optimization.
# - One-vs-rest logistic regression fits each class independently against all others.
#
# These distinct strategies can lead to varying decision boundaries, especially in
# complex multi-class problems.
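#
# As a small, illustrative check (not part of the original example), we can compare
# the probability estimates of the two models on a single point placed at the
# (transformed) center of class 1; the exact values depend on the fitted models and
# need not agree between the two approaches.
sample = np.dot(np.array([centers[1]]), transformation)
print("multinomial:", logistic_regression_multinomial.predict_proba(sample).round(3))
print("one-vs-rest:", logistic_regression_ovr.predict_proba(sample).round(3))
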
# %%
# Hyperplanes Visualization
# -------------------------
#
# We also visualize the hyperplanes, which correspond to the lines where the
# probability estimate for a class is 0.5.
def plot_hyperplanes(classifier, X, ax):
    xmin, xmax = X[:, 0].min(), X[:, 0].max()
    ymin, ymax = X[:, 1].min(), X[:, 1].max()
    ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

    if isinstance(classifier, OneVsRestClassifier):
        coef = np.concatenate([est.coef_ for est in classifier.estimators_])
        intercept = np.concatenate([est.intercept_ for est in classifier.estimators_])
    else:
        coef = classifier.coef_
        intercept = classifier.intercept_

    for i in range(coef.shape[0]):
        w = coef[i]
        a = -w[0] / w[1]
        xx = np.linspace(xmin, xmax)
        # the hyperplane of class i is the line w[0] * x + w[1] * y + intercept[i] = 0
        yy = a * xx - (intercept[i]) / w[1]
        ax.plot(xx, yy, "--", linewidth=3, label=f"Class {i}")

    return ax.get_legend_handles_labels()

# %%
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

for model, title, ax in [
(
logistic_regression_multinomial,
"Multinomial Logistic Regression Hyperplanes",
ax1,
),
(logistic_regression_ovr, "One-vs-Rest Logistic Regression Hyperplanes", ax2),
]:
hyperplane_handles, hyperplane_labels = plot_hyperplanes(model, X, ax)
scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
scatter_handles, scatter_labels = scatter.legend_elements()

all_handles = hyperplane_handles + scatter_handles
all_labels = hyperplane_labels + scatter_labels

ax.legend(all_handles, all_labels, title="Classes")
ax.set_title(title)

plt.show()

# %%
# While the hyperplanes for classes 0 and 2 are quite similar between the two methods,
# we observe that the hyperplane for class 1 is notably different. This difference
# stems from the fundamentally different approaches of one-vs-rest and multinomial
# logistic regression:
#
# For one-vs-rest logistic regression:
#
# - Each hyperplane is determined independently by considering one class against all
# others.
# - For class 1, the hyperplane represents the decision boundary that best separates
# class 1 from the combined classes 0 and 2.
# - This binary approach can lead to simpler decision boundaries but may not capture
# complex relationships between all classes simultaneously.
# - The per-class scores therefore cannot be interpreted as conditional class
#   probabilities.
#
# For multinomial logistic regression:
#
# - All hyperplanes are determined simultaneously, considering the relationships between
# all classes at once.
# - The loss minimized by the model is a proper scoring rule, which means that the
#   model is optimized to estimate the conditional class probabilities, which are,
#   therefore, meaningful (see the log-loss check below).
# - Each hyperplane represents the decision boundary where the probability of one class
# becomes higher than the others, based on the overall probability distribution.
# - This approach can capture more nuanced relationships between classes, potentially
# leading to more accurate classification in multi-class problems.
#
# The difference in hyperplanes, especially for class 1, highlights how these methods
# can produce different decision boundaries despite similar overall accuracy.
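#
# As an illustrative check of the "proper scoring rule" point above (not part of the
# original example), we can compare the log loss of the probability estimates of both
# models on the training data. Since the multinomial model directly minimizes this
# loss, we would typically expect it to obtain a lower (training) log loss; the exact
# values depend on the data and the fitted models.
from sklearn.metrics import log_loss

print(
    "log loss (multinomial):",
    log_loss(y, logistic_regression_multinomial.predict_proba(X)),
)
print(
    "log loss (one-vs-rest):",
    log_loss(y, logistic_regression_ovr.predict_proba(X)),
)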

# %%
# In practice, using multinomial logistic regression is recommended since it minimizes a
# well-formulated loss function, leading to better-calibrated class probabilities and
# thus more interpretable results. When it comes to decision boundaries, one should
# formulate a utility function to transform the class probabilities into a meaningful
# quantity for the problem at hand. One-vs-rest allows for different decision boundaries
# but does not allow for fine-grained control over the trade-off between the classes as
# a utility function would.
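
# %%
# As a minimal, illustrative sketch of this last point (not part of the original
# example), assume a hypothetical cost matrix in which errors on class 1 are much
# more costly than other mistakes. The class probabilities of the multinomial model
# can then be turned into decisions that minimize the expected cost instead of simply
# taking the most probable class.

# cost_matrix[i, j] is the assumed cost of predicting class j when the true class is i
cost_matrix = np.array(
    [
        [0.0, 1.0, 1.0],
        [5.0, 0.0, 5.0],  # assumed: errors on true class 1 are five times as costly
        [1.0, 1.0, 0.0],
    ]
)

proba = logistic_regression_multinomial.predict_proba(X)
expected_cost = proba @ cost_matrix  # shape (n_samples, n_classes)
y_pred_cost_sensitive = expected_cost.argmin(axis=1)

print(
    "predictions changed by the cost-sensitive rule:",
    (y_pred_cost_sensitive != logistic_regression_multinomial.predict(X)).sum(),
)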