DOC Rework plot_adjusted_for_chance_measures.py example #23708
Merged: thomasjpfan merged 27 commits into scikit-learn:main from ArturoAmorQ:chance_in_clustering (Aug 30, 2022)
Changes from all commits (27 commits):
14404a7  First step to improve notebook style
2d5c5fe  Add text
7ebccba  Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
234a5b9  Apply suggestions from code review
47d1f77  Apply suggestions from code review
6a04cb5  Move paragraphs to improve narrative
6323bc7  Split function into two simpler functions
3adc2d6  Expand list of scoring methods
f7b001a  Refactoring checkpoint
ef92bff  Refactorize exercise to ease narrative
e31e783  Apply suggestions from code review
f4dcefb  Apply suggestions from code review
2ec8d0a  Apply suggestions from code review
1d4da97  Introduce AMI after NMI
eac4b0f  Reduce verbosity
968b2a1  Add raw mutual info interpretation
9640ca6  Add author
aa20bbf  Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
7a5356f  Convert remaining strings to f-strings
166a9d0  tweak
bea1785  Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
37e9607  Make plots colorblind-friendly
662b9df  Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
2052b29  Drop homogeneity and completeness scores
cc49376  Apply suggestions from code review
146d2f9  Apply suggestions from code review
e83d519  Apply suggestions from code review
@@ -3,135 +3,231 @@
 Adjustment for chance in clustering performance evaluation
 ==========================================================
 
-The following plots demonstrate the impact of the number of clusters and
-number of samples on various clustering performance evaluation metrics.
-
-Non-adjusted measures such as the V-Measure show a dependency between
-the number of clusters and the number of samples: the mean V-Measure
-of random labeling increases significantly as the number of clusters is
-closer to the total number of samples used to compute the measure.
-
-Adjusted for chance measure such as ARI display some random variations
-centered around a mean score of 0.0 for any number of samples and
-clusters.
-
-Only adjusted measures can hence safely be used as a consensus index
-to evaluate the average stability of clustering algorithms for a given
-value of k on various overlapping sub-samples of the dataset.
+This notebook explores the impact of uniformly-distributed random labeling on
+the behavior of some clustering evaluation metrics. For such purpose, the
+metrics are computed with a fixed number of samples and as a function of the
+number of clusters assigned by the estimator. The example is divided into two
+experiments:
+
+- a first experiment with fixed "ground truth labels" (and therefore a fixed
+  number of classes) and randomly "predicted labels";
+- a second experiment with varying "ground truth labels" and randomly
+  "predicted labels". The "predicted labels" have the same number of classes
+  and clusters as the "ground truth labels".
 
 """
 
 # Author: Olivier Grisel <[email protected]>
+# Arturo Amor <[email protected]>
 # License: BSD 3 clause
 
-import numpy as np
-import matplotlib.pyplot as plt
-from time import time
+# %%
+# Defining the list of metrics to evaluate
+# ----------------------------------------
+#
+# Clustering algorithms are fundamentally unsupervised learning methods.
+# However, since we assign class labels to the synthetic clusters in this
+# example, it is possible to use evaluation metrics that leverage this
+# "supervised" ground truth information to quantify the quality of the
+# resulting clusters. Examples of such metrics are the following:
+#
+# - V-measure, the harmonic mean of completeness and homogeneity;
+#
+# - Rand index, which measures how frequently pairs of data points are grouped
+#   consistently according to the result of the clustering algorithm and the
+#   ground truth class assignment;
+#
+# - Adjusted Rand index (ARI), a chance-adjusted Rand index such that a random
+#   cluster assignment has an ARI of 0.0 in expectation;
+#
+# - Mutual Information (MI), an information theoretic measure that quantifies
+#   how dependent the two labelings are. Note that the maximum value of MI for
+#   perfect labelings depends on the number of clusters and samples;
+#
+# - Normalized Mutual Information (NMI), a Mutual Information defined between 0
+#   (no mutual information) in the limit of a large number of data points and 1
+#   (perfectly matching label assignments, up to a permutation of the labels).
+#   It is not adjusted for chance: when the number of clustered data points is
+#   not large enough, the expected values of MI or NMI for random labelings can
+#   be significantly non-zero;
+#
+# - Adjusted Mutual Information (AMI), a chance-adjusted Mutual Information.
+#   Similarly to ARI, a random cluster assignment has an AMI of 0.0 in
+#   expectation.
+#
+# For more information, see the :ref:`clustering_evaluation` module.
 
 from sklearn import metrics
 
+score_funcs = [
+    ("V-measure", metrics.v_measure_score),
+    ("Rand index", metrics.rand_score),
+    ("ARI", metrics.adjusted_rand_score),
+    ("MI", metrics.mutual_info_score),
+    ("NMI", metrics.normalized_mutual_info_score),
+    ("AMI", metrics.adjusted_mutual_info_score),
+]
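As a quick aside (a standalone sketch, not part of the diff): every metric collected in `score_funcs` takes two label assignments and is invariant to permutations of the label names, so a relabeled-but-identical clustering scores perfectly on the bounded metrics, while raw MI reaches its data-dependent maximum.

```python
from sklearn import metrics

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]  # same grouping, label names permuted

# Permutation-invariant: each of these scores a perfect 1.0 here.
for name, score_func in [
    ("V-measure", metrics.v_measure_score),
    ("Rand index", metrics.rand_score),
    ("ARI", metrics.adjusted_rand_score),
    ("NMI", metrics.normalized_mutual_info_score),
    ("AMI", metrics.adjusted_mutual_info_score),
]:
    print(f"{name}: {score_func(labels_true, labels_pred):.3f}")

# Raw MI is also permutation-invariant but is not bounded by 1: for this
# perfect match over 3 balanced clusters it equals ln(3) ~= 1.099, which is
# why the example also tracks the normalized and adjusted variants.
print(f"MI: {metrics.mutual_info_score(labels_true, labels_pred):.3f}")
```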
 
-def uniform_labelings_scores(
-    score_func, n_samples, n_clusters_range, fixed_n_classes=None, n_runs=5, seed=42
-):
-    """Compute score for 2 random uniform cluster labelings.
-
-    Both random labelings have the same number of clusters for each value
-    possible value in ``n_clusters_range``.
-
-    When fixed_n_classes is not None the first labeling is considered a ground
-    truth class assignment with fixed number of classes.
-    """
-    random_labels = np.random.RandomState(seed).randint
-    scores = np.zeros((len(n_clusters_range), n_runs))
-
-    if fixed_n_classes is not None:
-        labels_a = random_labels(low=0, high=fixed_n_classes, size=n_samples)
-
-    for i, k in enumerate(n_clusters_range):
-        for j in range(n_runs):
-            if fixed_n_classes is None:
-                labels_a = random_labels(low=0, high=k, size=n_samples)
-            labels_b = random_labels(low=0, high=k, size=n_samples)
-            scores[i, j] = score_func(labels_a, labels_b)
-    return scores
+# %%
+# First experiment: fixed ground truth labels and growing number of clusters
+# --------------------------------------------------------------------------
+#
+# We first define a function that creates uniformly-distributed random
+# labelings.
+
+import numpy as np
+
+rng = np.random.RandomState(0)
+
+
+def random_labels(n_samples, n_classes):
+    return rng.randint(low=0, high=n_classes, size=n_samples)
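For illustration (a standalone sketch duplicating the helper above, not part of the diff), the generated labels are plain integer arrays. Because `rng` is shared module-level state, successive calls intentionally produce fresh, independent labelings rather than repeating the same one:

```python
import numpy as np

rng = np.random.RandomState(0)


def random_labels(n_samples, n_classes):
    # Uniformly-distributed integer labels in [0, n_classes)
    return rng.randint(low=0, high=n_classes, size=n_samples)


labels = random_labels(n_samples=10, n_classes=3)
print(labels)  # ten values drawn uniformly from {0, 1, 2}
```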
 
-def ami_score(U, V):
-    return metrics.adjusted_mutual_info_score(U, V)
-
-
-score_funcs = [
-    metrics.adjusted_rand_score,
-    metrics.v_measure_score,
-    ami_score,
-    metrics.mutual_info_score,
-]
-
-# 2 independent random clusterings with equal cluster number
+# %%
+# Another function will use the `random_labels` function to create a fixed set
+# of ground truth labels (`labels_a`) distributed in `n_classes` and then score
+# several sets of randomly "predicted" labels (`labels_b`) to assess the
+# variability of a given metric at a given `n_clusters`.
+
+
+def fixed_classes_uniform_labelings_scores(
+    score_func, n_samples, n_clusters_range, n_classes, n_runs=5
+):
+    scores = np.zeros((len(n_clusters_range), n_runs))
+    labels_a = random_labels(n_samples=n_samples, n_classes=n_classes)
+
+    for i, n_clusters in enumerate(n_clusters_range):
+        for j in range(n_runs):
+            labels_b = random_labels(n_samples=n_samples, n_classes=n_clusters)
+            scores[i, j] = score_func(labels_a, labels_b)
+    return scores
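To make the bookkeeping concrete, here is a standalone sketch that duplicates the two helpers so it runs on its own (the parameter values are illustrative, not from the PR): the returned array has one row per candidate `n_clusters` and one column per random repetition, and for a chance-adjusted metric like ARI its entries hover around zero.

```python
import numpy as np
from sklearn import metrics

rng = np.random.RandomState(0)


def random_labels(n_samples, n_classes):
    return rng.randint(low=0, high=n_classes, size=n_samples)


def fixed_classes_uniform_labelings_scores(
    score_func, n_samples, n_clusters_range, n_classes, n_runs=5
):
    scores = np.zeros((len(n_clusters_range), n_runs))
    # Ground truth labels are drawn once and kept fixed across all runs.
    labels_a = random_labels(n_samples=n_samples, n_classes=n_classes)
    for i, n_clusters in enumerate(n_clusters_range):
        for j in range(n_runs):
            labels_b = random_labels(n_samples=n_samples, n_classes=n_clusters)
            scores[i, j] = score_func(labels_a, labels_b)
    return scores


# One row per candidate n_clusters, one column per repetition.
scores = fixed_classes_uniform_labelings_scores(
    metrics.adjusted_rand_score,
    n_samples=200,
    n_clusters_range=[2, 10, 50],
    n_classes=10,
)
print(scores.shape)  # (3, 5)
print(scores.mean())  # close to 0: ARI is adjusted for chance
```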
 
-n_samples = 100
-n_clusters_range = np.linspace(2, n_samples, 10).astype(int)
-
-plt.figure(1)
+# %%
+# In this first example we set the number of classes (true number of clusters)
+# to `n_classes=10`. The number of clusters varies over the values provided by
+# `n_clusters_range`.
+
+import matplotlib.pyplot as plt
+import matplotlib.style as style
+
+n_samples = 1000
+n_classes = 10
+n_clusters_range = np.linspace(2, 100, 10).astype(int)
+
+style.use("seaborn-colorblind")
+plt.figure(1)
 
 plots = []
 names = []
-for score_func in score_funcs:
-    print(
-        "Computing %s for %d values of n_clusters and n_samples=%d"
-        % (score_func.__name__, len(n_clusters_range), n_samples)
-    )
-
-    t0 = time()
-    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
-    print("done in %0.3fs" % (time() - t0))
+for marker, (score_name, score_func) in zip("d^vx.,", score_funcs):
+    scores = fixed_classes_uniform_labelings_scores(
+        score_func, n_samples, n_clusters_range, n_classes=n_classes
+    )
     plots.append(
-        plt.errorbar(n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0]
+        plt.errorbar(
+            n_clusters_range,
+            scores.mean(axis=1),
+            scores.std(axis=1),
+            alpha=0.8,
+            linewidth=1,
+            marker=marker,
+        )[0]
     )
-    names.append(score_func.__name__)
+    names.append(score_name)
 
 plt.title(
-    "Clustering measures for 2 random uniform labelings\nwith equal number of clusters"
+    "Clustering measures for random uniform labeling\n"
+    f"against reference assignment with {n_classes} classes"
 )
-plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples)
+plt.xlabel(f"Number of clusters (Number of samples is fixed to {n_samples})")
 plt.ylabel("Score value")
+plt.ylim(bottom=-0.05, top=1.05)
 plt.legend(plots, names)
-plt.ylim(bottom=-0.05, top=1.05)
+plt.show()
 
-# Random labeling with varying n_clusters against ground class labels
-# with fixed number of clusters
-
-n_samples = 1000
-n_clusters_range = np.linspace(2, 100, 10).astype(int)
-n_classes = 10
+# %%
+# The Rand index saturates for `n_clusters` > `n_classes`. Other non-adjusted
+# measures such as the V-Measure show a linear dependency between the number of
+# clusters and the number of samples.
+#
+# Adjusted for chance measures, such as ARI and AMI, display some random
+# variations centered around a mean score of 0.0, independently of the number
+# of samples and clusters.
+#
+# Second experiment: varying number of classes and clusters
+# ---------------------------------------------------------
+#
+# In this section we define a similar function that uses several metrics to
+# score 2 uniformly-distributed random labelings. In this case, the number of
+# classes and assigned number of clusters are matched for each possible value
+# in `n_clusters_range`.
+
+
+def uniform_labelings_scores(score_func, n_samples, n_clusters_range, n_runs=5):
+    scores = np.zeros((len(n_clusters_range), n_runs))
+
+    for i, n_clusters in enumerate(n_clusters_range):
+        for j in range(n_runs):
+            labels_a = random_labels(n_samples=n_samples, n_classes=n_clusters)
+            labels_b = random_labels(n_samples=n_samples, n_classes=n_clusters)
+            scores[i, j] = score_func(labels_a, labels_b)
+    return scores
 
+# %%
+# In this case, we use `n_samples=100` to show the effect of having a number of
+# clusters similar or equal to the number of samples.
+
+n_samples = 100
+n_clusters_range = np.linspace(2, n_samples, 10).astype(int)
 
 plt.figure(2)
 
 plots = []
 names = []
-for score_func in score_funcs:
-    print(
-        "Computing %s for %d values of n_clusters and n_samples=%d"
-        % (score_func.__name__, len(n_clusters_range), n_samples)
-    )
-
-    t0 = time()
-    scores = uniform_labelings_scores(
-        score_func, n_samples, n_clusters_range, fixed_n_classes=n_classes
-    )
-    print("done in %0.3fs" % (time() - t0))
+for marker, (score_name, score_func) in zip("d^vx.,", score_funcs):
+    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
     plots.append(
-        plt.errorbar(n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0]
+        plt.errorbar(
+            n_clusters_range,
+            np.median(scores, axis=1),
+            scores.std(axis=1),
+            alpha=0.8,
+            linewidth=2,
+            marker=marker,
+        )[0]
     )
-    names.append(score_func.__name__)
+    names.append(score_name)
 
 plt.title(
-    "Clustering measures for random uniform labeling\n"
-    "against reference assignment with %d classes" % n_classes
+    "Clustering measures for 2 random uniform labelings\nwith equal number of clusters"
 )
-plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples)
+plt.xlabel(f"Number of clusters (Number of samples is fixed to {n_samples})")
 plt.ylabel("Score value")
-plt.ylim(bottom=-0.05, top=1.05)
 plt.legend(plots, names)
+plt.ylim(bottom=-0.05, top=1.05)
 plt.show()
 
+# %%
+# We observe similar results as for the first experiment: adjusted for chance
+# metrics stay constantly near zero while other metrics tend to get larger with
+# finer-grained labelings. The mean V-measure of random labeling increases
+# significantly as the number of clusters gets closer to the total number of
+# samples used to compute the measure. Furthermore, raw Mutual Information is
+# unbounded from above and its scale depends on the dimensions of the
+# clustering problem and the cardinality of the ground truth classes.
+#
+# Only adjusted measures can hence be safely used as a consensus index to
+# evaluate the average stability of clustering algorithms for a given value of
+# k on various overlapping sub-samples of the dataset.
+#
+# Non-adjusted clustering evaluation metrics can therefore be misleading: since
+# they output large values for fine-grained labelings, one could be led to
+# think that the labeling has captured meaningful groups while it can be
+# totally random. In particular, such non-adjusted metrics should not be used
+# to compare the results of different clustering algorithms that output a
+# different number of clusters.
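The warning above can be checked numerically (a standalone sketch, not part of the diff): two completely unrelated fine-grained labelings get a high V-measure while the chance-adjusted ARI stays near zero.

```python
import numpy as np
from sklearn import metrics

rng = np.random.RandomState(0)
n_samples = 100

# Two independent random labelings with many clusters relative to n_samples.
labels_a = rng.randint(0, 50, size=n_samples)
labels_b = rng.randint(0, 50, size=n_samples)

v = metrics.v_measure_score(labels_a, labels_b)
ari = metrics.adjusted_rand_score(labels_a, labels_b)
print(f"V-measure: {v:.2f}")  # large despite the labelings being pure noise
print(f"ARI: {ari:.2f}")      # close to 0, as expected for random labels
```

The high V-measure here is exactly the artifact the example warns about: with `n_clusters` close to `n_samples`, almost any labeling looks "homogeneous" and "complete" by accident.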