diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 1b81ffd30d7af..b4535fe2cde23 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -868,7 +868,9 @@ details.
    metrics.adjusted_mutual_info_score
    metrics.adjusted_rand_score
+   metrics.calinski_harabaz_score
    metrics.completeness_score
+   metrics.fowlkes_mallows_score
    metrics.homogeneity_completeness_v_measure
    metrics.homogeneity_score
    metrics.mutual_info_score
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 70b5b1f879160..7ef9a464295d7 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1339,6 +1339,93 @@ mean of homogeneity and completeness**:
   `_, Hila Becker, PhD Thesis.

+.. _fowlkes_mallows_scores:
+
+Fowlkes-Mallows scores
+----------------------
+
+The Fowlkes-Mallows index (:func:`sklearn.metrics.fowlkes_mallows_score`) can
+be used when the ground truth class assignments of the samples are known. The
+Fowlkes-Mallows score (FMI) is defined as the geometric mean of the
+pairwise precision and recall:
+
+.. math:: \text{FMI} = \frac{\text{TP}}{\sqrt{(\text{TP} + \text{FP}) (\text{TP} + \text{FN})}}
+
+where ``TP`` is the number of **True Positives** (i.e. the number of pairs
+of points that belong to the same cluster in both the true and the predicted
+labels), ``FP`` is the number of **False Positives** (i.e. the number of pairs
+of points that belong to the same cluster in the predicted labels but not in
+the true labels) and ``FN`` is the number of **False Negatives** (i.e. the
+number of pairs of points that belong to the same cluster in the true labels
+but not in the predicted labels). A brute-force pair-counting check of this
+definition is shown after the advantages list below.
+
+The score ranges from 0 to 1. A high value indicates a good similarity
+between two clusterings.
+
+  >>> from sklearn import metrics
+  >>> labels_true = [0, 0, 0, 1, 1, 1]
+  >>> labels_pred = [0, 0, 1, 1, 2, 2]
+
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.47140...
+
+One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get
+the same score::
+
+  >>> labels_pred = [1, 1, 0, 0, 3, 3]
+
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.47140...
+
+Perfect labeling is scored 1.0::
+
+  >>> labels_pred = labels_true[:]
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  1.0
+
+Bad (e.g. independent) labelings have zero scores::
+
+  >>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1]
+  >>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2]
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.0
+
+Advantages
+~~~~~~~~~~
+
+- **Random (uniform) label assignments have an FMI score close to 0.0**
+  for any value of ``n_clusters`` and ``n_samples`` (which is not the
+  case for raw Mutual Information or the V-measure for instance).
+
+- **Bounded range [0, 1]**: Values close to zero indicate two label
+  assignments that are largely independent, while values close to one
+  indicate significant agreement. Further, a value of exactly 0 indicates
+  **purely** independent label assignments and an FMI of exactly 1 indicates
+  that the two label assignments are equal (with or without permutation).
+
+- **No assumption is made on the cluster structure**: the score can be used
+  to compare clustering algorithms such as k-means, which assumes isotropic
+  blob shapes, with results of spectral clustering algorithms, which can
+  find clusters with "folded" shapes.
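+As a check of the pair-counting definition above, the same value can be
+recovered by enumerating all pairs of samples explicitly. The helper below
+is an illustrative sketch only (it is not part of the scikit-learn API)::
+
+  >>> import numpy as np
+  >>> from itertools import combinations
+  >>> def fmi_by_pair_counting(labels_true, labels_pred):
+  ...     tp = fp = fn = 0
+  ...     for i, j in combinations(range(len(labels_true)), 2):
+  ...         same_true = labels_true[i] == labels_true[j]
+  ...         same_pred = labels_pred[i] == labels_pred[j]
+  ...         tp += same_true and same_pred       # together in both labelings
+  ...         fp += same_pred and not same_true   # together only in prediction
+  ...         fn += same_true and not same_pred   # together only in truth
+  ...     return tp / np.sqrt((tp + fp) * (tp + fn)) if tp else 0.
+  >>> fmi_by_pair_counting([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])  # doctest: +ELLIPSIS
+  0.47140...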
+
+
+Drawbacks
+~~~~~~~~~
+
+- Contrary to inertia, **FMI-based measures require the knowledge
+  of the ground truth classes**, which is almost never available in practice
+  or requires manual assignment by human annotators (as in the supervised
+  learning setting).
+
+.. topic:: References
+
+  * E. B. Fowlkes and C. L. Mallows, 1983. "A method for comparing two
+    hierarchical clusterings". Journal of the American Statistical Association.
+    http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf
+
+  * `Wikipedia entry for the Fowlkes-Mallows Index
+    `_
+
 .. _silhouette_coefficient:

 Silhouette Coefficient
@@ -1413,3 +1500,73 @@ Drawbacks
 * :ref:`example_cluster_plot_kmeans_silhouette_analysis.py` : In this example
   the silhouette analysis is used to choose an optimal value for n_clusters.
+
+.. _calinski_harabaz_index:
+
+Calinski-Harabaz Index
+----------------------
+
+If the ground truth labels are not known, the Calinski-Harabaz index
+(:func:`sklearn.metrics.calinski_harabaz_score`) can be used to evaluate the
+model, where a higher Calinski-Harabaz score relates to a model with better
+defined clusters.
+
+For :math:`k` clusters, the Calinski-Harabaz score :math:`s` is given as the
+ratio of the between-cluster dispersion and the within-cluster
+dispersion:
+
+.. math::
+  s(k) = \frac{\mathrm{Tr}(B_k)}{\mathrm{Tr}(W_k)} \times \frac{N - k}{k - 1}
+
+where :math:`B_k` is the between-group dispersion matrix and :math:`W_k`
+is the within-cluster dispersion matrix defined by:
+
+.. math:: W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T
+
+.. math:: B_k = \sum_q n_q (c_q - c) (c_q - c)^T
+
+where :math:`N` is the number of points in our data, :math:`C_q` the set of
+points in cluster :math:`q`, :math:`c_q` the center of cluster
+:math:`q`, :math:`c` the center of the data, and :math:`n_q` the number of
+points in cluster :math:`q`.
+
+
+  >>> from sklearn import metrics
+  >>> from sklearn import datasets
+  >>> dataset = datasets.load_iris()
+  >>> X = dataset.data
+  >>> y = dataset.target
+
+In normal usage, the Calinski-Harabaz index is applied to the results of a
+cluster analysis.
+
+  >>> import numpy as np
+  >>> from sklearn.cluster import KMeans
+  >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+  >>> labels = kmeans_model.labels_
+  >>> metrics.calinski_harabaz_score(X, labels)  # doctest: +ELLIPSIS
+  560.39...
+
+
+Advantages
+~~~~~~~~~~
+
+- The score is higher when clusters are dense and well separated, which
+  relates to a standard concept of a cluster.
+
+- The score is fast to compute.
+
+
+Drawbacks
+~~~~~~~~~
+
+- The Calinski-Harabaz index is generally higher for convex clusters than
+  other concepts of clusters, such as density-based clusters like those
+  obtained through DBSCAN.
+
+.. topic:: References
+
+ * Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster
+   analysis". Communications in Statistics - Theory and Methods 3: 1-27.
+   `doi:10.1080/03610927408827101 `_.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 1497aa3807441..3d0dab18bb3ee 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -109,6 +109,14 @@ New features
    One can pass method names such as `predict_proba` to be used in the cross
    validation framework instead of the default `predict`. By `Ori Ziv`_ and
    `Sears Merritt`_.
+   - Added :func:`metrics.cluster.fowlkes_mallows_score`, the Fowlkes-Mallows
+     index, which measures the similarity of two clusterings of a set of
+     points. By `Arnaud Fouchet`_ and `Thierry Guillemot`_.
+
+   - Added :func:`metrics.calinski_harabaz_score`, which computes the Calinski
+     and Harabaz score to evaluate the resulting clustering of a set of points.
+     By `Arnaud Fouchet`_ and `Thierry Guillemot`_.
+
 Enhancements
 ............

@@ -4257,3 +4265,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Sears Merritt: https://github.com/merritts

 .. _Wenhua Yang: https://github.com/geekoala
+
+.. _Arnaud Fouchet: https://github.com/afouchet
\ No newline at end of file
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 501599c46442e..413831939fbbc 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -39,8 +39,10 @@
 from .cluster import homogeneity_score
 from .cluster import mutual_info_score
 from .cluster import normalized_mutual_info_score
+from .cluster import fowlkes_mallows_score
 from .cluster import silhouette_samples
 from .cluster import silhouette_score
+from .cluster import calinski_harabaz_score
 from .cluster import v_measure_score

 from .pairwise import euclidean_distances
diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py
index 8a58dc2acb669..4cda1108ece32 100644
--- a/sklearn/metrics/cluster/__init__.py
+++ b/sklearn/metrics/cluster/__init__.py
@@ -15,14 +15,16 @@
 from .supervised import homogeneity_score
 from .supervised import mutual_info_score
 from .supervised import v_measure_score
+from .supervised import fowlkes_mallows_score
 from .supervised import entropy
 from .unsupervised import silhouette_samples
 from .unsupervised import silhouette_score
+from .unsupervised import calinski_harabaz_score
 from .bicluster import consensus_score

 __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
            "adjusted_rand_score", "completeness_score", "contingency_matrix",
            "expected_mutual_information", "homogeneity_completeness_v_measure",
            "homogeneity_score", "mutual_info_score", "v_measure_score",
-           "entropy", "silhouette_samples", "silhouette_score",
-           "consensus_score"]
+           "fowlkes_mallows_score", "entropy", "silhouette_samples",
+           "silhouette_score", "calinski_harabaz_score", "consensus_score"]
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 77c9c50436061..131c14b5078ca 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -1,4 +1,4 @@
-"""Utilities to evaluate the clustering performance of models
+"""Utilities to evaluate the clustering performance of models.

 Functions named as *_score return a scalar value to maximize: the higher
 the better.
@@ -7,6 +7,8 @@
 # Authors: Olivier Grisel
 #          Wei LI
 #          Diego Molla
+#          Arnaud Fouchet
+#          Thierry Guillemot
 # License: BSD 3 clause

 from math import log
@@ -26,7 +28,7 @@ def comb2(n):

 def check_clusterings(labels_true, labels_pred):
-    """Check that the two clusterings matching 1D integer arrays"""
+    """Check that the two clusterings are 1D integer arrays of matching shape."""
     labels_true = np.asarray(labels_true)
     labels_pred = np.asarray(labels_pred)

@@ -101,7 +103,7 @@
 # clustering measures

 def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
-    """Rand index adjusted for chance
+    """Rand index adjusted for chance.
The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and counting pairs that are @@ -193,9 +195,9 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): # Special limit cases: no clustering since the data is not split; # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. - if (classes.shape[0] == clusters.shape[0] == 1 - or classes.shape[0] == clusters.shape[0] == 0 - or classes.shape[0] == clusters.shape[0] == len(labels_true)): + if (classes.shape[0] == clusters.shape[0] == 1 or + classes.shape[0] == clusters.shape[0] == 0 or + classes.shape[0] == clusters.shape[0] == len(labels_true)): return 1.0 contingency = contingency_matrix(labels_true, labels_pred, @@ -213,7 +215,7 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5000): - """Compute the homogeneity and completeness and V-Measure scores at once + """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of the clustering labeling to evaluate given the knowledge of a Ground @@ -285,14 +287,14 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, if homogeneity + completeness == 0.0: v_measure_score = 0.0 else: - v_measure_score = (2.0 * homogeneity * completeness - / (homogeneity + completeness)) + v_measure_score = (2.0 * homogeneity * completeness / + (homogeneity + completeness)) return homogeneity, completeness, v_measure_score def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): - """Homogeneity metric of a cluster labeling given a ground truth + """Homogeneity metric of a cluster labeling given a ground truth. A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class. @@ -372,7 +374,7 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): def completeness_score(labels_true, labels_pred, max_n_classes=5000): - """Completeness metric of a cluster labeling given a ground truth + """Completeness metric of a cluster labeling given a ground truth. A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. @@ -550,7 +552,7 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000): def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes=5000): - """Mutual Information between two clusterings + """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of the same data. Where :math:`P(i)` is the probability of a random sample @@ -621,13 +623,13 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum()) - mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) - + contingency_nm * log_outer) + mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) + + contingency_nm * log_outer) return mi.sum() def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): - """Adjusted Mutual Information between two clusterings + """Adjusted Mutual Information between two clusterings. 
 Adjusted Mutual Information (AMI) is an adjustment of the Mutual
 Information (MI) score to account for chance. It accounts for the fact that
@@ -711,8 +713,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     clusters = np.unique(labels_pred)
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
-    if (classes.shape[0] == clusters.shape[0] == 1
-            or classes.shape[0] == clusters.shape[0] == 0):
+    if (classes.shape[0] == clusters.shape[0] == 1 or
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -729,7 +731,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):

 def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
-    """Normalized Mutual Information between two clusterings
+    """Normalized Mutual Information between two clusterings.

     Normalized Mutual Information (NMI) is a normalization of the Mutual
     Information (MI) score to scale the results between 0 (no mutual
@@ -798,8 +800,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     clusters = np.unique(labels_pred)
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
-    if (classes.shape[0] == clusters.shape[0] == 1
-            or classes.shape[0] == clusters.shape[0] == 0):
+    if (classes.shape[0] == clusters.shape[0] == 1 or
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -814,6 +816,85 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     return nmi


+def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000):
+    """Measure the similarity of two clusterings of a set of points.
+
+    The Fowlkes-Mallows index (FMI) is defined as the geometric mean of
+    the pairwise precision and recall::
+
+        FMI = TP / sqrt((TP + FP) * (TP + FN))
+
+    where ``TP`` is the number of **True Positives** (i.e. the number of pairs
+    of points that belong to the same cluster in both ``labels_true`` and
+    ``labels_pred``), ``FP`` is the number of **False Positives** (i.e. the
+    number of pairs of points that belong to the same cluster in
+    ``labels_pred`` but not in ``labels_true``) and ``FN`` is the number of
+    **False Negatives** (i.e. the number of pairs of points that belong to the
+    same cluster in ``labels_true`` but not in ``labels_pred``).
+
+    The score ranges from 0 to 1. A high value indicates a good similarity
+    between two clusterings.
+
+    Read more in the :ref:`User Guide <fowlkes_mallows_scores>`.
+
+    Parameters
+    ----------
+    labels_true : int array, shape = (``n_samples``,)
+        A clustering of the data into disjoint subsets.
+
+    labels_pred : array, shape = (``n_samples``,)
+        A clustering of the data into disjoint subsets.
+
+    max_n_classes : int, optional (default=5000)
+        Maximal number of classes handled by the Fowlkes-Mallows
+        metric. Setting it too high can lead to a MemoryError or an OS
+        freeze.
+
+    Returns
+    -------
+    score : float
+        The resulting Fowlkes-Mallows score.
+
+    Examples
+    --------
+
+    Perfect labelings have a score of 1.0, whatever the actual label
+    values::
+
+      >>> from sklearn.metrics.cluster import fowlkes_mallows_score
+      >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])
+      1.0
+      >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])
+      1.0
+
+    If class members are completely split across different clusters, the
+    assignment is totally random, hence the FMI is zero::
+
+      >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])
+      0.0
+
+    References
+    ----------
+    .. [1] `E. B. Fowlkes and C. L. Mallows, 1983. "A method for comparing two
+       hierarchical clusterings". Journal of the American Statistical
+       Association
+       `_
+
+    .. [2] `Wikipedia entry for the Fowlkes-Mallows Index
+       `_
+    """
+    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
+    n_samples, = labels_true.shape
+
+    # tk, pk and qk count ordered pairs, i.e. twice TP, TP + FP and TP + FN
+    # respectively; the factors of two cancel out in the final ratio.
+    c = contingency_matrix(labels_true, labels_pred,
+                           max_n_classes=max_n_classes)
+    tk = np.dot(c.ravel(), c.ravel()) - n_samples
+    pk = np.sum(np.sum(c, axis=0) ** 2) - n_samples
+    qk = np.sum(np.sum(c, axis=1) ** 2) - n_samples
+
+    return tk / np.sqrt(pk * qk) if tk != 0. else 0.
+
+
 def entropy(labels):
     """Calculates the entropy for a labeling."""
     if len(labels) == 0:
diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index cd12158605bbf..828c2c544574c 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -10,6 +10,7 @@
 from sklearn.metrics.cluster import mutual_info_score
 from sklearn.metrics.cluster import expected_mutual_information
 from sklearn.metrics.cluster import contingency_matrix
+from sklearn.metrics.cluster import fowlkes_mallows_score
 from sklearn.metrics.cluster import entropy

 from sklearn.utils.testing import assert_raise_message
@@ -229,3 +230,20 @@ def test_max_n_classes():
         assert_raise_message(ValueError, expected, score_func,
                              labels_zero, labels_pred,
                              max_n_classes=50)
+
+
+def test_fowlkes_mallows_score():
+    # General case
+    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
+                                  [0, 0, 1, 1, 2, 2])
+    assert_almost_equal(score, 4. / np.sqrt(12. * 6.))
+
+    # Perfect match but where the label names changed
+    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
+                                          [1, 1, 1, 0, 0, 0])
+    assert_almost_equal(perfect_score, 1.)
+
+    # Worst case
+    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0],
+                                        [0, 1, 2, 3, 4, 5])
+    assert_almost_equal(worst_score, 0.)
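A note on the implementation above: ``tk``, ``pk`` and ``qk`` are computed
from the contingency matrix and count *ordered* pairs, so each is twice the
corresponding unordered pair count (``tk = 2 * TP``, ``pk = 2 * (TP + FP)``,
``qk = 2 * (TP + FN)``); the factors of two cancel in the ratio. The
following sketch is illustrative only (it relies on the existing
``contingency_matrix`` helper) and makes the correspondence explicit on the
user guide example::

    >>> import numpy as np
    >>> from sklearn.metrics.cluster import contingency_matrix
    >>> labels_true = [0, 0, 0, 1, 1, 1]
    >>> labels_pred = [0, 0, 1, 1, 2, 2]
    >>> c = contingency_matrix(labels_true, labels_pred)
    >>> n = len(labels_true)
    >>> np.dot(c.ravel(), c.ravel()) - n    # tk = 2 * TP, here TP = 2
    4
    >>> np.sum(np.sum(c, axis=0) ** 2) - n  # pk = 2 * (TP + FP), here FP = 1
    6
    >>> np.sum(np.sum(c, axis=1) ** 2) - n  # qk = 2 * (TP + FN), here FN = 4
    12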
diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py
index b346c7e54632e..e1718e604d29a 100644
--- a/sklearn/metrics/cluster/tests/test_unsupervised.py
+++ b/sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -2,12 +2,14 @@
 from scipy.sparse import csr_matrix

 from sklearn import datasets
-from sklearn.metrics.cluster.unsupervised import silhouette_score
-from sklearn.metrics import pairwise_distances
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_raises_regexp
+from sklearn.utils.testing import assert_raise_message
+from sklearn.metrics.cluster import silhouette_score
+from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics import pairwise_distances


 def test_silhouette():
@@ -86,3 +88,32 @@ def test_non_numpy_labels():
     y = dataset.target
     assert_equal(
         silhouette_score(list(X), list(y)), silhouette_score(X, y))
+
+
+def test_calinski_harabaz_score():
+    rng = np.random.RandomState(seed=0)
+
+    # Assert message when there is only one label
+    assert_raise_message(ValueError, "Number of labels is",
+                         calinski_harabaz_score,
+                         rng.rand(10, 2), np.zeros(10))
+
+    # Assert message when all points are in different clusters
+    assert_raise_message(ValueError, "Number of labels is",
+                         calinski_harabaz_score,
+                         rng.rand(10, 2), np.arange(10))
+
+    # Assert the value is 1. when all samples are equal
+    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
+                                            [0] * 5 + [1] * 5))
+
+    # Assert the value is 0. when all cluster means are equal
+    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
+                                            [0] * 10 + [1] * 10))
+
+    # General case (with non-numpy arrays)
+    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
+         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
+    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
+    assert_almost_equal(calinski_harabaz_score(X, labels),
+                        45 * (40 - 4) / (5 * (4 - 1)))
diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 4a8ff450e7cea..ce9e35487b1b7 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -1,7 +1,8 @@
-""" Unsupervised evaluation metrics. """
+"""Unsupervised evaluation metrics."""

 # Authors: Robert Layton
-#
+#          Arnaud Fouchet
+#          Thierry Guillemot
 # License: BSD 3 clause

 import numpy as np
@@ -12,6 +13,12 @@
 from ...preprocessing import LabelEncoder


+def check_number_of_labels(n_labels, n_samples):
+    """Check that the number of labels is in the valid range [2, n_samples - 1]."""
+    if not 1 < n_labels < n_samples:
+        raise ValueError("Number of labels is %d. Valid values are 2 "
+                         "to n_samples - 1 (inclusive)" % n_labels)
+
+
 def silhouette_score(X, labels, metric='euclidean', sample_size=None,
                      random_state=None, **kwds):
     """Compute the mean Silhouette Coefficient of all samples.
@@ -50,12 +57,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
         array itself, use ``metric="precomputed"``.

     sample_size : int or None
-        The size of the sample to use when computing the Silhouette Coefficient
-        on a random subset of the data.
+        The size of the sample to use when computing the Silhouette Coefficient
+        on a random subset of the data.
         If ``sample_size is None``, no sampling is used.

     random_state : integer or numpy.RandomState, optional
-        The generator used to randomly select a subset of samples if
+        The generator used to randomly select a subset of samples if
         ``sample_size is not None``.
 If an integer is given, it fixes the seed. Defaults to the global numpy
 random number generator.
@@ -87,9 +94,7 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
     n_labels = len(le.classes_)
     n_samples = X.shape[0]

-    if not 1 < n_labels < n_samples:
-        raise ValueError("Number of labels is %d. Valid values are 2 "
-                         "to n_samples - 1 (inclusive)" % n_labels)
+    check_number_of_labels(n_labels, n_samples)

     if sample_size is not None:
         random_state = check_random_state(random_state)
@@ -201,3 +206,53 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
     sil_samples = inter_clust_dists - intra_clust_dists
     sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
     return sil_samples
+
+
+def calinski_harabaz_score(X, labels):
+    """Compute the Calinski and Harabaz score.
+
+    The score is defined as the ratio of the between-cluster dispersion to
+    the within-cluster dispersion; a higher score relates to a model with
+    better defined clusters.
+
+    Read more in the :ref:`User Guide <calinski_harabaz_index>`.
+
+    Parameters
+    ----------
+    X : array-like, shape (``n_samples``, ``n_features``)
+        List of ``n_features``-dimensional data points. Each row corresponds
+        to a single data point.
+
+    labels : array-like, shape (``n_samples``,)
+        Predicted labels for each sample.
+
+    Returns
+    -------
+    score : float
+        The resulting Calinski-Harabaz score.
+
+    References
+    ----------
+    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
+       analysis". Communications in Statistics
+       `_
+    """
+    X, labels = check_X_y(X, labels)
+    le = LabelEncoder()
+    labels = le.fit_transform(labels)
+
+    n_samples, _ = X.shape
+    n_labels = len(le.classes_)
+
+    check_number_of_labels(n_labels, n_samples)
+
+    extra_disp, intra_disp = 0., 0.
+    mean = np.mean(X, axis=0)  # centroid of the full data set
+    for k in range(n_labels):
+        cluster_k = X[labels == k]
+        mean_k = np.mean(cluster_k, axis=0)
+        # between-cluster dispersion: squared distance from the cluster
+        # centroid to the overall centroid, weighted by the cluster size
+        extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
+        # within-cluster dispersion: squared distances of the cluster's
+        # points to their own centroid
+        intra_disp += np.sum((cluster_k - mean_k) ** 2)
+
+    return (1. if intra_disp == 0. else
+            extra_disp * (n_samples - n_labels) /
+            (intra_disp * (n_labels - 1.)))
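As a final sanity check, the implementation can be compared against a direct
computation of :math:`\mathrm{Tr}(B_k)/\mathrm{Tr}(W_k) \times (N - k)/(k - 1)`
from the user guide. The sketch below is illustrative only, assumes a
scikit-learn build that includes this patch, and reuses the iris/KMeans setup
from the documentation example above::

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    from sklearn.metrics import calinski_harabaz_score

    X = load_iris().data
    labels = KMeans(n_clusters=3, random_state=1).fit_predict(X)

    n_samples, n_labels = X.shape[0], len(np.unique(labels))
    mean = X.mean(axis=0)
    # Tr(B_k): size-weighted squared distances of cluster centroids to the
    # overall centroid
    tr_b = sum(len(X[labels == q]) *
               ((X[labels == q].mean(axis=0) - mean) ** 2).sum()
               for q in range(n_labels))
    # Tr(W_k): squared distances of points to their cluster centroid
    tr_w = sum(((X[labels == q] - X[labels == q].mean(axis=0)) ** 2).sum()
               for q in range(n_labels))
    manual = tr_b / tr_w * (n_samples - n_labels) / (n_labels - 1.)

    # should agree with the library function (~560.39 for this seed)
    assert np.isclose(manual, calinski_harabaz_score(X, labels))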