diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 1b81ffd30d7af..b4535fe2cde23 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -868,7 +868,9 @@ details.
    metrics.adjusted_mutual_info_score
    metrics.adjusted_rand_score
+   metrics.calinski_harabaz_score
    metrics.completeness_score
+   metrics.fowlkes_mallows_score
    metrics.homogeneity_completeness_v_measure
    metrics.homogeneity_score
    metrics.mutual_info_score
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 70b5b1f879160..7ef9a464295d7 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1339,6 +1339,93 @@ mean of homogeneity and completeness**:
   `_, Hila Becker, PhD Thesis.

+.. _fowlkes_mallows_scores:
+
+Fowlkes-Mallows scores
+----------------------
+
+The Fowlkes-Mallows index (:func:`sklearn.metrics.fowlkes_mallows_score`) can
+be used when the ground truth class assignments of the samples are known. The
+Fowlkes-Mallows score (FMI) is defined as the geometric mean of the
+pairwise precision and recall:
+
+.. math:: \text{FMI} = \frac{\text{TP}}{\sqrt{(\text{TP} + \text{FP}) (\text{TP} + \text{FN})}}
+
+where ``TP`` is the number of **True Positives** (i.e. the number of pairs
+of points that belong to the same cluster in both the true and the predicted
+labels), ``FP`` is the number of **False Positives** (i.e. the number of pairs
+of points that belong to the same cluster in the predicted labels but not in
+the true labels) and ``FN`` is the number of **False Negatives** (i.e. the
+number of pairs of points that belong to the same cluster in the true labels
+but not in the predicted labels). A brute-force pair-counting check of this
+definition is shown after the advantages list below.
+
+The score ranges from 0 to 1. A high value indicates a good similarity
+between two clusterings.
+
+  >>> from sklearn import metrics
+  >>> labels_true = [0, 0, 0, 1, 1, 1]
+  >>> labels_pred = [0, 0, 1, 1, 2, 2]
+
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.47140...
+
+One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get
+the same score::
+
+  >>> labels_pred = [1, 1, 0, 0, 3, 3]
+
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.47140...
+
+Perfect labeling is scored 1.0::
+
+  >>> labels_pred = labels_true[:]
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  1.0
+
+Bad (e.g. independent) labelings have zero scores::
+
+  >>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1]
+  >>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2]
+  >>> metrics.fowlkes_mallows_score(labels_true, labels_pred)  # doctest: +ELLIPSIS
+  0.0
+
+Advantages
+~~~~~~~~~~
+
+- **Random (uniform) label assignments have an FMI score close to 0.0**
+  for any value of ``n_clusters`` and ``n_samples`` (which is not the
+  case for raw Mutual Information or the V-measure for instance).
+
+- **Bounded range [0, 1]**: Values close to zero indicate two label
+  assignments that are largely independent, while values close to one
+  indicate significant agreement. Further, a value of exactly 0 indicates
+  **purely** independent label assignments and an FMI of exactly 1 indicates
+  that the two label assignments are equal (with or without permutation).
+
+- **No assumption is made on the cluster structure**: the score can be used
+  to compare clustering algorithms such as k-means, which assumes isotropic
+  blob shapes, with results of spectral clustering algorithms, which can
+  find clusters with "folded" shapes.
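+As a check of the pair-counting definition above, the same value can be
+recovered by enumerating all pairs of samples explicitly. The helper below
+is an illustrative sketch only (it is not part of the scikit-learn API)::
+
+  >>> import numpy as np
+  >>> from itertools import combinations
+  >>> def fmi_by_pair_counting(labels_true, labels_pred):
+  ...     tp = fp = fn = 0
+  ...     for i, j in combinations(range(len(labels_true)), 2):
+  ...         same_true = labels_true[i] == labels_true[j]
+  ...         same_pred = labels_pred[i] == labels_pred[j]
+  ...         tp += same_true and same_pred       # together in both labelings
+  ...         fp += same_pred and not same_true   # together only in prediction
+  ...         fn += same_true and not same_pred   # together only in truth
+  ...     return tp / np.sqrt((tp + fp) * (tp + fn)) if tp else 0.
+  >>> fmi_by_pair_counting([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])  # doctest: +ELLIPSIS
+  0.47140...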
+
+
+Drawbacks
+~~~~~~~~~
+
+- Contrary to inertia, **FMI-based measures require the knowledge
+  of the ground truth classes**, which is almost never available in practice
+  or requires manual assignment by human annotators (as in the supervised
+  learning setting).
+
+.. topic:: References
+
+  * E. B. Fowlkes and C. L. Mallows, 1983. "A method for comparing two
+    hierarchical clusterings". Journal of the American Statistical Association.
+    http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf
+
+  * `Wikipedia entry for the Fowlkes-Mallows Index
+    `_
+
 .. _silhouette_coefficient:

 Silhouette Coefficient
@@ -1413,3 +1500,73 @@ Drawbacks
 * :ref:`example_cluster_plot_kmeans_silhouette_analysis.py` : In this example
   the silhouette analysis is used to choose an optimal value for n_clusters.
+
+.. _calinski_harabaz_index:
+
+Calinski-Harabaz Index
+----------------------
+
+If the ground truth labels are not known, the Calinski-Harabaz index
+(:func:`sklearn.metrics.calinski_harabaz_score`) can be used to evaluate the
+model, where a higher Calinski-Harabaz score relates to a model with better
+defined clusters.
+
+For :math:`k` clusters, the Calinski-Harabaz score :math:`s` is given as the
+ratio of the between-cluster dispersion and the within-cluster
+dispersion:
+
+.. math::
+  s(k) = \frac{\mathrm{Tr}(B_k)}{\mathrm{Tr}(W_k)} \times \frac{N - k}{k - 1}
+
+where :math:`B_k` is the between-group dispersion matrix and :math:`W_k`
+is the within-cluster dispersion matrix defined by:
+
+.. math:: W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T
+
+.. math:: B_k = \sum_q n_q (c_q - c) (c_q - c)^T
+
+where :math:`N` is the number of points in our data, :math:`C_q` the set of
+points in cluster :math:`q`, :math:`c_q` the center of cluster
+:math:`q`, :math:`c` the center of the data, and :math:`n_q` the number of
+points in cluster :math:`q`.
+
+
+  >>> from sklearn import metrics
+  >>> from sklearn import datasets
+  >>> dataset = datasets.load_iris()
+  >>> X = dataset.data
+  >>> y = dataset.target
+
+In normal usage, the Calinski-Harabaz index is applied to the results of a
+cluster analysis.
+
+  >>> import numpy as np
+  >>> from sklearn.cluster import KMeans
+  >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+  >>> labels = kmeans_model.labels_
+  >>> metrics.calinski_harabaz_score(X, labels)  # doctest: +ELLIPSIS
+  560.39...
+
+
+Advantages
+~~~~~~~~~~
+
+- The score is higher when clusters are dense and well separated, which
+  relates to a standard concept of a cluster.
+
+- The score is fast to compute.
+
+
+Drawbacks
+~~~~~~~~~
+
+- The Calinski-Harabaz index is generally higher for convex clusters than
+  other concepts of clusters, such as density-based clusters like those
+  obtained through DBSCAN.
+
+.. topic:: References
+
+ * Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster
+   analysis". Communications in Statistics - Theory and Methods 3: 1-27.
+   `doi:10.1080/03610927408827101 `_.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 1497aa3807441..3d0dab18bb3ee 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -109,6 +109,14 @@ New features
    One can pass method names such as `predict_proba` to be used in the cross
    validation framework instead of the default `predict`. By `Ori Ziv`_ and
    `Sears Merritt`_.
+   - Added :func:`metrics.cluster.fowlkes_mallows_score`, the Fowlkes-Mallows
+     index, which measures the similarity of two clusterings of a set of
+     points. By `Arnaud Fouchet`_ and `Thierry Guillemot`_.
+
+   - Added :func:`metrics.calinski_harabaz_score`, which computes the Calinski
+     and Harabaz score to evaluate the resulting clustering of a set of points.
+     By `Arnaud Fouchet`_ and `Thierry Guillemot`_.
+
 Enhancements
 ............

@@ -4257,3 +4265,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Sears Merritt: https://github.com/merritts

 .. _Wenhua Yang: https://github.com/geekoala
+
+.. _Arnaud Fouchet: https://github.com/afouchet
\ No newline at end of file
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 501599c46442e..413831939fbbc 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -39,8 +39,10 @@
 from .cluster import homogeneity_score
 from .cluster import mutual_info_score
 from .cluster import normalized_mutual_info_score
+from .cluster import fowlkes_mallows_score
 from .cluster import silhouette_samples
 from .cluster import silhouette_score
+from .cluster import calinski_harabaz_score
 from .cluster import v_measure_score

 from .pairwise import euclidean_distances
diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py
index 8a58dc2acb669..4cda1108ece32 100644
--- a/sklearn/metrics/cluster/__init__.py
+++ b/sklearn/metrics/cluster/__init__.py
@@ -15,14 +15,16 @@
 from .supervised import homogeneity_score
 from .supervised import mutual_info_score
 from .supervised import v_measure_score
+from .supervised import fowlkes_mallows_score
 from .supervised import entropy
 from .unsupervised import silhouette_samples
 from .unsupervised import silhouette_score
+from .unsupervised import calinski_harabaz_score
 from .bicluster import consensus_score

 __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
            "adjusted_rand_score", "completeness_score", "contingency_matrix",
            "expected_mutual_information", "homogeneity_completeness_v_measure",
            "homogeneity_score", "mutual_info_score", "v_measure_score",
-           "entropy", "silhouette_samples", "silhouette_score",
-           "consensus_score"]
+           "fowlkes_mallows_score", "entropy", "silhouette_samples",
+           "silhouette_score", "calinski_harabaz_score", "consensus_score"]
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 77c9c50436061..131c14b5078ca 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -1,4 +1,4 @@
-"""Utilities to evaluate the clustering performance of models
+"""Utilities to evaluate the clustering performance of models.

 Functions named as *_score return a scalar value to maximize: the higher
 the better.
@@ -7,6 +7,8 @@
 # Authors: Olivier Grisel
 #          Wei LI
 #          Diego Molla
+#          Arnaud Fouchet
+#          Thierry Guillemot
 # License: BSD 3 clause

 from math import log
@@ -26,7 +28,7 @@ def comb2(n):

 def check_clusterings(labels_true, labels_pred):
-    """Check that the two clusterings matching 1D integer arrays"""
+    """Check that the two clusterings are 1D integer arrays of matching shape."""
     labels_true = np.asarray(labels_true)
     labels_pred = np.asarray(labels_pred)

@@ -101,7 +103,7 @@
 # clustering measures

 def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
-    """Rand index adjusted for chance
+    """Rand index adjusted for chance.
The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and counting pairs that are @@ -193,9 +195,9 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): # Special limit cases: no clustering since the data is not split; # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. - if (classes.shape[0] == clusters.shape[0] == 1 - or classes.shape[0] == clusters.shape[0] == 0 - or classes.shape[0] == clusters.shape[0] == len(labels_true)): + if (classes.shape[0] == clusters.shape[0] == 1 or + classes.shape[0] == clusters.shape[0] == 0 or + classes.shape[0] == clusters.shape[0] == len(labels_true)): return 1.0 contingency = contingency_matrix(labels_true, labels_pred, @@ -213,7 +215,7 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5000): - """Compute the homogeneity and completeness and V-Measure scores at once + """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of the clustering labeling to evaluate given the knowledge of a Ground @@ -285,14 +287,14 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, if homogeneity + completeness == 0.0: v_measure_score = 0.0 else: - v_measure_score = (2.0 * homogeneity * completeness - / (homogeneity + completeness)) + v_measure_score = (2.0 * homogeneity * completeness / + (homogeneity + completeness)) return homogeneity, completeness, v_measure_score def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): - """Homogeneity metric of a cluster labeling given a ground truth + """Homogeneity metric of a cluster labeling given a ground truth. A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class. @@ -372,7 +374,7 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): def completeness_score(labels_true, labels_pred, max_n_classes=5000): - """Completeness metric of a cluster labeling given a ground truth + """Completeness metric of a cluster labeling given a ground truth. A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. @@ -550,7 +552,7 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000): def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes=5000): - """Mutual Information between two clusterings + """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of the same data. Where :math:`P(i)` is the probability of a random sample @@ -621,13 +623,13 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum()) - mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) - + contingency_nm * log_outer) + mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) + + contingency_nm * log_outer) return mi.sum() def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): - """Adjusted Mutual Information between two clusterings + """Adjusted Mutual Information between two clusterings. 
 Adjusted Mutual Information (AMI) is an adjustment of the Mutual
 Information (MI) score to account for chance. It accounts for the fact that
@@ -711,8 +713,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     clusters = np.unique(labels_pred)
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
-    if (classes.shape[0] == clusters.shape[0] == 1
-            or classes.shape[0] == clusters.shape[0] == 0):
+    if (classes.shape[0] == clusters.shape[0] == 1 or
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -729,7 +731,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):

 def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
-    """Normalized Mutual Information between two clusterings
+    """Normalized Mutual Information between two clusterings.

     Normalized Mutual Information (NMI) is a normalization of the Mutual
     Information (MI) score to scale the results between 0 (no mutual
@@ -798,8 +800,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     clusters = np.unique(labels_pred)
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
-    if (classes.shape[0] == clusters.shape[0] == 1
-            or classes.shape[0] == clusters.shape[0] == 0):
+    if (classes.shape[0] == clusters.shape[0] == 1 or
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -814,6 +816,85 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     return nmi


+def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000):
+    """Measure the similarity of two clusterings of a set of points.
+
+    The Fowlkes-Mallows index (FMI) is defined as the geometric mean of
+    the pairwise precision and recall::
+
+        FMI = TP / sqrt((TP + FP) * (TP + FN))
+
+    where ``TP`` is the number of **True Positives** (i.e. the number of pairs
+    of points that belong to the same cluster in both ``labels_true`` and
+    ``labels_pred``), ``FP`` is the number of **False Positives** (i.e. the
+    number of pairs of points that belong to the same cluster in
+    ``labels_pred`` but not in ``labels_true``) and ``FN`` is the number of
+    **False Negatives** (i.e. the number of pairs of points that belong to the
+    same cluster in ``labels_true`` but not in ``labels_pred``).
+
+    The score ranges from 0 to 1. A high value indicates a good similarity
+    between two clusterings.
+
+    Read more in the :ref:`User Guide <fowlkes_mallows_scores>`.
+
+    Parameters
+    ----------
+    labels_true : int array, shape = (``n_samples``,)
+        A clustering of the data into disjoint subsets.
+
+    labels_pred : array, shape = (``n_samples``,)
+        A clustering of the data into disjoint subsets.
+
+    max_n_classes : int, optional (default=5000)
+        Maximal number of classes handled by the Fowlkes-Mallows
+        metric. Setting it too high can lead to a MemoryError or an OS
+        freeze.
+
+    Returns
+    -------
+    score : float
+        The resulting Fowlkes-Mallows score.
+
+    Examples
+    --------
+
+    Perfect labelings have a score of 1.0, whatever the actual label
+    values::
+
+      >>> from sklearn.metrics.cluster import fowlkes_mallows_score
+      >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])
+      1.0
+      >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])
+      1.0
+
+    If class members are completely split across different clusters, the
+    assignment is totally random, hence the FMI is zero::
+
+      >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])
+      0.0
+
+    References
+    ----------
+    .. [1] `E. B. Fowlkes and C. L. Mallows, 1983. "A method for comparing two
+       hierarchical clusterings". Journal of the American Statistical
+       Association
+       `_
+
+    .. [2] `Wikipedia entry for the Fowlkes-Mallows Index
+       `_
+    """
+    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
+    n_samples, = labels_true.shape
+
+    # tk, pk and qk count ordered pairs, i.e. twice TP, TP + FP and TP + FN
+    # respectively; the factors of two cancel out in the final ratio.
+    c = contingency_matrix(labels_true, labels_pred,
+                           max_n_classes=max_n_classes)
+    tk = np.dot(c.ravel(), c.ravel()) - n_samples
+    pk = np.sum(np.sum(c, axis=0) ** 2) - n_samples
+    qk = np.sum(np.sum(c, axis=1) ** 2) - n_samples
+
+    return tk / np.sqrt(pk * qk) if tk != 0. else 0.
+
+
 def entropy(labels):
     """Calculates the entropy for a labeling."""
     if len(labels) == 0:
diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index cd12158605bbf..828c2c544574c 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -10,6 +10,7 @@
 from sklearn.metrics.cluster import mutual_info_score
 from sklearn.metrics.cluster import expected_mutual_information
 from sklearn.metrics.cluster import contingency_matrix
+from sklearn.metrics.cluster import fowlkes_mallows_score
 from sklearn.metrics.cluster import entropy

 from sklearn.utils.testing import assert_raise_message
@@ -229,3 +230,20 @@ def test_max_n_classes():
         assert_raise_message(ValueError, expected, score_func,
                              labels_zero, labels_pred,
                              max_n_classes=50)
+
+
+def test_fowlkes_mallows_score():
+    # General case
+    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
+                                  [0, 0, 1, 1, 2, 2])
+    assert_almost_equal(score, 4. / np.sqrt(12. * 6.))
+
+    # Perfect match but where the label names changed
+    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1],
+                                          [1, 1, 1, 0, 0, 0])
+    assert_almost_equal(perfect_score, 1.)
+
+    # Worst case
+    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0],
+                                        [0, 1, 2, 3, 4, 5])
+    assert_almost_equal(worst_score, 0.)
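A note on the implementation above: ``tk``, ``pk`` and ``qk`` are computed
from the contingency matrix and count *ordered* pairs, so each is twice the
corresponding unordered pair count (``tk = 2 * TP``, ``pk = 2 * (TP + FP)``,
``qk = 2 * (TP + FN)``); the factors of two cancel in the ratio. The
following sketch is illustrative only (it relies on the existing
``contingency_matrix`` helper) and makes the correspondence explicit on the
user guide example::

    >>> import numpy as np
    >>> from sklearn.metrics.cluster import contingency_matrix
    >>> labels_true = [0, 0, 0, 1, 1, 1]
    >>> labels_pred = [0, 0, 1, 1, 2, 2]
    >>> c = contingency_matrix(labels_true, labels_pred)
    >>> n = len(labels_true)
    >>> np.dot(c.ravel(), c.ravel()) - n    # tk = 2 * TP, here TP = 2
    4
    >>> np.sum(np.sum(c, axis=0) ** 2) - n  # pk = 2 * (TP + FP), here FP = 1
    6
    >>> np.sum(np.sum(c, axis=1) ** 2) - n  # qk = 2 * (TP + FN), here FN = 4
    12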
diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py
index b346c7e54632e..e1718e604d29a 100644
--- a/sklearn/metrics/cluster/tests/test_unsupervised.py
+++ b/sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -2,12 +2,14 @@
 from scipy.sparse import csr_matrix

 from sklearn import datasets
-from sklearn.metrics.cluster.unsupervised import silhouette_score
-from sklearn.metrics import pairwise_distances
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_raises_regexp
+from sklearn.utils.testing import assert_raise_message
+from sklearn.metrics.cluster import silhouette_score
+from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics import pairwise_distances


 def test_silhouette():
@@ -86,3 +88,32 @@ def test_non_numpy_labels():
     y = dataset.target
     assert_equal(
         silhouette_score(list(X), list(y)), silhouette_score(X, y))
+
+
+def test_calinski_harabaz_score():
+    rng = np.random.RandomState(seed=0)
+
+    # Assert message when there is only one label
+    assert_raise_message(ValueError, "Number of labels is",
+                         calinski_harabaz_score,
+                         rng.rand(10, 2), np.zeros(10))
+
+    # Assert message when all points are in different clusters
+    assert_raise_message(ValueError, "Number of labels is",
+                         calinski_harabaz_score,
+                         rng.rand(10, 2), np.arange(10))
+
+    # Assert the value is 1. when all samples are equal
+    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
+                                            [0] * 5 + [1] * 5))
+
+    # Assert the value is 0. when all cluster means are equal
+    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
+                                            [0] * 10 + [1] * 10))
+
+    # General case (with non-numpy arrays)
+    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
+         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
+    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
+    assert_almost_equal(calinski_harabaz_score(X, labels),
+                        45 * (40 - 4) / (5 * (4 - 1)))
diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 4a8ff450e7cea..ce9e35487b1b7 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -1,7 +1,8 @@
-""" Unsupervised evaluation metrics. """
+"""Unsupervised evaluation metrics."""

 # Authors: Robert Layton
-#
+#          Arnaud Fouchet
+#          Thierry Guillemot
 # License: BSD 3 clause

 import numpy as np
@@ -12,6 +13,12 @@
 from ...preprocessing import LabelEncoder


+def check_number_of_labels(n_labels, n_samples):
+    """Check that the number of labels is in the valid range [2, n_samples - 1]."""
+    if not 1 < n_labels < n_samples:
+        raise ValueError("Number of labels is %d. Valid values are 2 "
+                         "to n_samples - 1 (inclusive)" % n_labels)
+
+
 def silhouette_score(X, labels, metric='euclidean', sample_size=None,
                      random_state=None, **kwds):
     """Compute the mean Silhouette Coefficient of all samples.
@@ -50,12 +57,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
         array itself, use ``metric="precomputed"``.

     sample_size : int or None
-        The size of the sample to use when computing the Silhouette Coefficient
-        on a random subset of the data.
+        The size of the sample to use when computing the Silhouette Coefficient
+        on a random subset of the data.
         If ``sample_size is None``, no sampling is used.

     random_state : integer or numpy.RandomState, optional
-        The generator used to randomly select a subset of samples if
+        The generator used to randomly select a subset of samples if
         ``sample_size is not None``.
 If an integer is given, it fixes the seed. Defaults to the global numpy
 random number generator.
@@ -87,9 +94,7 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
     n_labels = len(le.classes_)
     n_samples = X.shape[0]

-    if not 1 < n_labels < n_samples:
-        raise ValueError("Number of labels is %d. Valid values are 2 "
-                         "to n_samples - 1 (inclusive)" % n_labels)
+    check_number_of_labels(n_labels, n_samples)

     if sample_size is not None:
         random_state = check_random_state(random_state)
@@ -201,3 +206,53 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
     sil_samples = inter_clust_dists - intra_clust_dists
     sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
     return sil_samples
+
+
+def calinski_harabaz_score(X, labels):
+    """Compute the Calinski and Harabaz score.
+
+    The score is defined as the ratio of the between-cluster dispersion to
+    the within-cluster dispersion; a higher score relates to a model with
+    better defined clusters.
+
+    Read more in the :ref:`User Guide <calinski_harabaz_index>`.
+
+    Parameters
+    ----------
+    X : array-like, shape (``n_samples``, ``n_features``)
+        List of ``n_features``-dimensional data points. Each row corresponds
+        to a single data point.
+
+    labels : array-like, shape (``n_samples``,)
+        Predicted labels for each sample.
+
+    Returns
+    -------
+    score : float
+        The resulting Calinski-Harabaz score.
+
+    References
+    ----------
+    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
+       analysis". Communications in Statistics
+       `_
+    """
+    X, labels = check_X_y(X, labels)
+    le = LabelEncoder()
+    labels = le.fit_transform(labels)
+
+    n_samples, _ = X.shape
+    n_labels = len(le.classes_)
+
+    check_number_of_labels(n_labels, n_samples)
+
+    extra_disp, intra_disp = 0., 0.
+    mean = np.mean(X, axis=0)  # centroid of the full data set
+    for k in range(n_labels):
+        cluster_k = X[labels == k]
+        mean_k = np.mean(cluster_k, axis=0)
+        # between-cluster dispersion: squared distance from the cluster
+        # centroid to the overall centroid, weighted by the cluster size
+        extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
+        # within-cluster dispersion: squared distances of the cluster's
+        # points to their own centroid
+        intra_disp += np.sum((cluster_k - mean_k) ** 2)
+
+    return (1. if intra_disp == 0. else
+            extra_disp * (n_samples - n_labels) /
+            (intra_disp * (n_labels - 1.)))
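As a final sanity check, the implementation can be compared against a direct
computation of :math:`\mathrm{Tr}(B_k)/\mathrm{Tr}(W_k) \times (N - k)/(k - 1)`
from the user guide. The sketch below is illustrative only, assumes a
scikit-learn build that includes this patch, and reuses the iris/KMeans setup
from the documentation example above::

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    from sklearn.metrics import calinski_harabaz_score

    X = load_iris().data
    labels = KMeans(n_clusters=3, random_state=1).fit_predict(X)

    n_samples, n_labels = X.shape[0], len(np.unique(labels))
    mean = X.mean(axis=0)
    # Tr(B_k): size-weighted squared distances of cluster centroids to the
    # overall centroid
    tr_b = sum(len(X[labels == q]) *
               ((X[labels == q].mean(axis=0) - mean) ** 2).sum()
               for q in range(n_labels))
    # Tr(W_k): squared distances of points to their cluster centroid
    tr_w = sum(((X[labels == q] - X[labels == q].mean(axis=0)) ** 2).sum()
               for q in range(n_labels))
    manual = tr_b / tr_w * (n_samples - n_labels) / (n_labels - 1.)

    # should agree with the library function (~560.39 for this seed)
    assert np.isclose(manual, calinski_harabaz_score(X, labels))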