From 330fad1b241df2fcc9e1b58806d25ebf88e046c6 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Sun, 29 Sep 2024 20:38:48 +0200
Subject: [PATCH 1/3] DOC remove redundant example multiclass logistic regression

---
 doc/conf.py                                 |  3 ++
 examples/linear_model/plot_iris_logistic.py | 52 ---------------------
 2 files changed, 3 insertions(+), 52 deletions(-)
 delete mode 100644 examples/linear_model/plot_iris_logistic.py

diff --git a/doc/conf.py b/doc/conf.py
index 9ab1966b70e73..e14205bb0ba0f 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -475,6 +475,9 @@ def add_js_css_files(app, pagename, templatename, context, doctree):
     "auto_examples/linear_model/plot_lasso_coordinate_descent_path.py": (
         "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py"
     ),
+    "auto_examples/linear_model/plot_iris_logistic": (
+        "auto_examples/linear_model/plot_logistic_multinomial"
+    ),
 }
 html_context["redirects"] = redirects
 for old_link in redirects:
diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py
deleted file mode 100644
index 481312c94c789..0000000000000
--- a/examples/linear_model/plot_iris_logistic.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-=========================================================
-Logistic Regression 3-class Classifier
-=========================================================
-
-Show below is a logistic-regression classifiers decision boundaries on the
-first two dimensions (sepal length and width) of the `iris
-<https://en.wikipedia.org/wiki/Iris_flower_data_set>`_ dataset. The datapoints
-are colored according to their labels.
-
-"""
-
-# Authors: The scikit-learn developers
-# SPDX-License-Identifier: BSD-3-Clause
-
-import matplotlib.pyplot as plt
-
-from sklearn import datasets
-from sklearn.inspection import DecisionBoundaryDisplay
-from sklearn.linear_model import LogisticRegression
-
-# import some data to play with
-iris = datasets.load_iris()
-X = iris.data[:, :2]  # we only take the first two features.
-Y = iris.target
-
-# Create an instance of Logistic Regression Classifier and fit the data.
-logreg = LogisticRegression(C=1e5)
-logreg.fit(X, Y)
-
-_, ax = plt.subplots(figsize=(4, 3))
-DecisionBoundaryDisplay.from_estimator(
-    logreg,
-    X,
-    cmap=plt.cm.Paired,
-    ax=ax,
-    response_method="predict",
-    plot_method="pcolormesh",
-    shading="auto",
-    xlabel="Sepal length",
-    ylabel="Sepal width",
-    eps=0.5,
-)
-
-# Plot also the training points
-plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors="k", cmap=plt.cm.Paired)
-
-
-plt.xticks(())
-plt.yticks(())
-
-plt.show()

From 30dda90641f1c109e2c61e0e57c145fe05292190 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Sun, 29 Sep 2024 23:01:23 +0200
Subject: [PATCH 2/3] improve multiclass example

---
 .../linear_model/plot_logistic_multinomial.py | 199 ++++++++++++++----
 1 file changed, 156 insertions(+), 43 deletions(-)

diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py
index ca9f1717fe346..60679999a59cf 100644
--- a/examples/linear_model/plot_logistic_multinomial.py
+++ b/examples/linear_model/plot_logistic_multinomial.py
@@ -1,70 +1,183 @@
 """
-====================================================
-Plot multinomial and One-vs-Rest Logistic Regression
-====================================================
+======================================================================
+Decision Boundaries of Multinomial and One-vs-Rest Logistic Regression
+======================================================================
 
-Plot decision surface of multinomial and One-vs-Rest Logistic Regression.
-The hyperplanes corresponding to the three One-vs-Rest (OVR) classifiers
-are represented by the dashed lines.
+This example compares decision boundaries of multinomial and one-vs-rest
+logistic regression on a 2D dataset with three classes.
 
+We compare the decision boundaries of both methods, which is equivalent to
+calling the `predict` method. In addition, we plot the hyperplanes that
+correspond to the lines where the probability estimate for a class is 0.5.
 """
 
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
+# %%
+# Dataset Generation
+# ------------------
+#
+# We generate a synthetic dataset using :func:`~sklearn.datasets.make_blobs`.
+# The dataset consists of 1,000 samples from three different classes,
+# centered around [-5, 0], [0, 1.5], and [5, -1]. After generation, we apply a linear
+# transformation to introduce some correlation between features and make the problem
+# more challenging. This results in a 2D dataset with three overlapping classes,
+# suitable for demonstrating the differences between multinomial and one-vs-rest
+# logistic regression.
 import matplotlib.pyplot as plt
 import numpy as np
 
 from sklearn.datasets import make_blobs
-from sklearn.inspection import DecisionBoundaryDisplay
-from sklearn.linear_model import LogisticRegression
-from sklearn.multiclass import OneVsRestClassifier
 
-# make 3-class dataset for classification
 centers = [[-5, 0], [0, 1.5], [5, -1]]
-X, y = make_blobs(n_samples=1000, centers=centers, random_state=40)
+X, y = make_blobs(n_samples=1_000, centers=centers, random_state=40)
 transformation = [[0.4, 0.2], [-0.4, 1.2]]
 X = np.dot(X, transformation)
 
-for multi_class in ("multinomial", "ovr"):
-    clf = LogisticRegression(solver="sag", max_iter=100, random_state=42)
-    if multi_class == "ovr":
-        clf = OneVsRestClassifier(clf)
-    clf.fit(X, y)
+fig, ax = plt.subplots(figsize=(6, 4))
+
+scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="black")
+ax.set(title="Synthetic Dataset", xlabel="Feature 1", ylabel="Feature 2")
+_ = ax.legend(*scatter.legend_elements(), title="Classes")
+
+
+# %%
+# Classifier Training
+# -------------------
+#
+# We train two different logistic regression classifiers: multinomial and one-vs-rest.
+# The multinomial classifier handles all classes simultaneously, while the one-vs-rest
+# approach trains a binary classifier for each class against all others.
+from sklearn.linear_model import LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+
+logistic_regression_multinomial = LogisticRegression().fit(X, y)
+logistic_regression_ovr = OneVsRestClassifier(LogisticRegression()).fit(X, y)
+
+accuracy_multinomial = logistic_regression_multinomial.score(X, y)
+accuracy_ovr = logistic_regression_ovr.score(X, y)
 
-    # print the training scores
-    print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))
+# %%
+# Decision Boundaries Visualization
+# ---------------------------------
+#
+# Let's visualize the decision boundaries of both models, as provided by the
+# `predict` method of the classifiers.
+from sklearn.inspection import DecisionBoundaryDisplay
+
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
 
-    _, ax = plt.subplots()
+for model, title, ax in [
+    (
+        logistic_regression_multinomial,
+        f"Multinomial Logistic Regression\n(Accuracy: {accuracy_multinomial:.3f})",
+        ax1,
+    ),
+    (
+        logistic_regression_ovr,
+        f"One-vs-Rest Logistic Regression\n(Accuracy: {accuracy_ovr:.3f})",
+        ax2,
+    ),
+]:
     DecisionBoundaryDisplay.from_estimator(
-        clf, X, response_method="predict", cmap=plt.cm.Paired, ax=ax
+        model,
+        X,
+        ax=ax,
+        response_method="predict",
+        alpha=0.8,
     )
-    plt.title("Decision surface of LogisticRegression (%s)" % multi_class)
-    plt.axis("tight")
-
-    # Plot also the training points
-    colors = "bry"
-    for i, color in zip(clf.classes_, colors):
-        idx = np.where(y == i)
-        plt.scatter(X[idx, 0], X[idx, 1], c=color, edgecolor="black", s=20)
-
-    # Plot the three one-against-all classifiers
-    xmin, xmax = plt.xlim()
-    ymin, ymax = plt.ylim()
-    if multi_class == "ovr":
-        coef = np.concatenate([est.coef_ for est in clf.estimators_])
-        intercept = np.concatenate([est.intercept_ for est in clf.estimators_])
+    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
+    legend = ax.legend(*scatter.legend_elements(), title="Classes")
+    ax.add_artist(legend)
+    ax.set_title(title)
+
+
+# %%
+# We see that the decision boundaries are different. This difference stems from their
+# approaches:
+#
+# - Multinomial logistic regression considers all classes simultaneously during
+#   optimization.
+# - One-vs-rest logistic regression fits each class independently against all others.
+#
+# These distinct strategies can lead to varying decision boundaries, especially in
+# complex multi-class problems.
+#
+# Hyperplanes Visualization
+# --------------------------
+#
+# We also visualize the hyperplanes that correspond to the lines where the
+# probability estimate for a class is 0.5.
+def plot_hyperplanes(classifier, X, ax):
+    xmin, xmax = X[:, 0].min(), X[:, 0].max()
+    ymin, ymax = X[:, 1].min(), X[:, 1].max()
+    ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
+
+    if isinstance(classifier, OneVsRestClassifier):
+        coef = np.concatenate([est.coef_ for est in classifier.estimators_])
+        intercept = np.concatenate([est.intercept_ for est in classifier.estimators_])
     else:
-        coef = clf.coef_
-        intercept = clf.intercept_
+        coef = classifier.coef_
+        intercept = classifier.intercept_
 
-    def plot_hyperplane(c, color):
-        def line(x0):
-            return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
+    for i in range(coef.shape[0]):
+        w = coef[i]
+        a = -w[0] / w[1]
+        xx = np.linspace(xmin, xmax)
+        yy = a * xx - (intercept[i]) / w[1]
+        ax.plot(xx, yy, "--", linewidth=3, label=f"Class {i}")
 
-        plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color)
+    return ax.get_legend_handles_labels()
 
-    for i, color in zip(clf.classes_, colors):
-        plot_hyperplane(i, color)
+
+# %%
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
+
+for model, title, ax in [
+    (
+        logistic_regression_multinomial,
+        "Multinomial Logistic Regression Hyperplanes",
+        ax1,
+    ),
+    (logistic_regression_ovr, "One-vs-Rest Logistic Regression Hyperplanes", ax2),
+]:
+    hyperplane_handles, hyperplane_labels = plot_hyperplanes(model, X, ax)
+    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
+    scatter_handles, scatter_labels = scatter.legend_elements()
+
+    all_handles = hyperplane_handles + scatter_handles
+    all_labels = hyperplane_labels + scatter_labels
+
+    ax.legend(all_handles, all_labels, title="Classes")
+    ax.set_title(title)
 
 plt.show()
+
+# %%
+# While the hyperplanes for classes 0 and 2 are quite similar between the two methods,
+# we observe that the hyperplane for class 1 is notably different. This difference stems
+# from the fundamental approaches of one-vs-rest and multinomial logistic regression:
+#
+# For one-vs-rest logistic regression:
+#
+# - Each hyperplane is determined independently by considering one class against all
+#   others.
+# - For class 1, the hyperplane represents the decision boundary that best separates
+#   class 1 from the combined classes 0 and 2.
+# - This binary approach can lead to simpler decision boundaries but may not capture
+#   complex relationships between all classes simultaneously.
+#
+# For multinomial logistic regression:
+#
+# - All hyperplanes are determined simultaneously, considering the relationships between
+#   all classes at once.
+# - Each hyperplane represents the decision boundary where the probability of one class
+#   becomes higher than the others, based on the overall probability distribution.
+# - This approach can capture more nuanced relationships between classes, potentially
+#   leading to more accurate classification in multi-class problems.
+#
+# The difference in hyperplanes, especially for class 1, highlights how these methods
+# can produce different decision boundaries despite similar overall accuracy. The choice
+# between one-vs-rest and multinomial logistic regression can depend on the specific
+# dataset and the nature of the classification problem.

From ea0a96db7b8dfcbb1a5a7b09af4ed59421a335c2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 16 Oct 2024 11:16:55 +0200
Subject: [PATCH 3/3] reformulate recommendations

---
 .../linear_model/plot_logistic_multinomial.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py
index 60679999a59cf..c12229c81c7f1 100644
--- a/examples/linear_model/plot_logistic_multinomial.py
+++ b/examples/linear_model/plot_logistic_multinomial.py
@@ -167,17 +167,27 @@ def plot_hyperplanes(classifier, X, ax):
 #   class 1 from the combined classes 0 and 2.
 # - This binary approach can lead to simpler decision boundaries but may not capture
 #   complex relationships between all classes simultaneously.
+# - There is no meaningful interpretation of the conditional class probabilities.
 #
 # For multinomial logistic regression:
 #
 # - All hyperplanes are determined simultaneously, considering the relationships between
 #   all classes at once.
+# - The loss minimized by the model is a proper scoring rule, which means that the model
+#   is optimized to estimate the conditional class probabilities that are, therefore,
+#   meaningful.
 # - Each hyperplane represents the decision boundary where the probability of one class
 #   becomes higher than the others, based on the overall probability distribution.
 # - This approach can capture more nuanced relationships between classes, potentially
 #   leading to more accurate classification in multi-class problems.
 #
 # The difference in hyperplanes, especially for class 1, highlights how these methods
-# can produce different decision boundaries despite similar overall accuracy. The choice
-# between one-vs-rest and multinomial logistic regression can depend on the specific
-# dataset and the nature of the classification problem.
+# can produce different decision boundaries despite similar overall accuracy.
+#
+# In practice, using multinomial logistic regression is recommended since it minimizes a
+# well-formulated loss function, leading to better-calibrated class probabilities and
+# thus more interpretable results. When it comes to decision boundaries, one should
+# formulate a utility function to transform the class probabilities into a meaningful
+# quantity for the problem at hand. One-vs-rest allows for different decision boundaries
+# but does not allow for fine-grained control over the trade-off between the classes as
+# a utility function would.
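
As a side note to the recommendation above, the following minimal sketch (separate from the patches themselves; variable names are illustrative) shows what the probability discussion means in code: the multinomial model estimates the conditional class probabilities jointly through a single softmax model, whereas `OneVsRestClassifier` fits one binary `LogisticRegression` per class and renormalizes the independent estimates so that each row sums to one.

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier

    # Same synthetic dataset as in the reworked example.
    X, y = make_blobs(
        n_samples=1_000, centers=[[-5, 0], [0, 1.5], [5, -1]], random_state=40
    )
    X = np.dot(X, [[0.4, 0.2], [-0.4, 1.2]])

    multinomial = LogisticRegression().fit(X, y)
    ovr = OneVsRestClassifier(LogisticRegression()).fit(X, y)

    # Both arrays have shape (n_samples, 3) with rows summing to 1, but only the
    # multinomial ones are the direct output of a jointly optimized model; the
    # one-vs-rest values come from independent binary fits that are renormalized.
    print(multinomial.predict_proba(X[:5]).round(3))
    print(ovr.predict_proba(X[:5]).round(3))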