diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index f04ae4cc41eca..199bbb95f2d88 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -270,6 +270,11 @@ Enhancements
      (`#6913 <https://github.com/scikit-learn/scikit-learn/pull/6913>`_)
      By `YenChen Lin`_.
 
+   - Added ``labels`` argument to :func:`metrics.log_loss` to explicitly provide
+     the labels when the number of classes in ``y_true`` and ``y_pred`` differ.
+     (`#7239 <https://github.com/scikit-learn/scikit-learn/pull/7239>`_)
+     By `Hong Guangguo`_ with help from `Mads Jensen`_ and `Nelson Liu`_.
+
 Bug fixes
 .........
 
@@ -4376,3 +4381,7 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 
 .. _Konstantin Podshumok: https://github.com/podshumok
 .. _David Staub: https://github.com/staubda
+
+.. _Hong Guangguo: https://github.com/hongguangguo
+
+.. _Mads Jensen: https://github.com/indianajensen
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 2b8a4289b6a41..608d7486da0e3 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -1544,13 +1544,15 @@ def hamming_loss(y_true, y_pred, classes=None, sample_weight=None):
     raise ValueError("{0} is not supported".format(y_type))
 
 
-def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
+def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None,
+             labels=None):
     """Log loss, aka logistic loss or cross-entropy loss.
 
     This is the loss function used in (multinomial) logistic regression
     and extensions of it such as neural networks, defined as the negative
     log-likelihood of the true labels given a probabilistic classifier's
-    predictions. For a single sample with true label yt in {0,1} and
+    predictions. The log loss is only defined for two or more labels.
+    For a single sample with true label yt in {0,1} and
     estimated probability yp that yt = 1, the log loss is
 
         -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
@@ -1562,9 +1564,13 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
     y_true : array-like or label indicator matrix
         Ground truth (correct) labels for n_samples samples.
 
-    y_pred : array-like of float, shape = (n_samples, n_classes)
+    y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,)
         Predicted probabilities, as returned by a classifier's
-        predict_proba method.
+        predict_proba method. If ``y_pred.shape = (n_samples,)``
+        the probabilities provided are assumed to be those of the
+        positive class. The labels in ``y_pred`` are assumed to be
+        ordered alphabetically, as done by
+        :class:`preprocessing.LabelBinarizer`.
 
     eps : float
         Log loss is undefined for p=0 or p=1, so probabilities are
@@ -1577,6 +1583,12 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
     sample_weight : array-like of shape = [n_samples], optional
         Sample weights.
 
+    labels : array-like, optional (default=None)
+        If not provided, labels will be inferred from ``y_true``. If ``labels``
+        is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+        .. versionadded:: 0.18
+
     Returns
     -------
     loss : float
@@ -1596,37 +1608,58 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
     -----
     The logarithm used is the natural logarithm (base-e).
""" + y_pred = check_array(y_pred, ensure_2d=False) + check_consistent_length(y_pred, y_true) + lb = LabelBinarizer() - T = lb.fit_transform(y_true) - if T.shape[1] == 1: - T = np.append(1 - T, T, axis=1) - y_pred = check_array(y_pred, ensure_2d=False) - # Clipping - Y = np.clip(y_pred, eps, 1 - eps) + if labels is not None: + lb.fit(labels) + else: + lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError('y_true contains only one label ({0}). Please provide ' + 'the true labels explicitly through the labels ' + 'argument.'.format(lb.classes_[0])) + else: + raise ValueError('The labels array needs to contain at least two labels' + 'for log_loss, got {0}.'.format(lb.classes_)) - # This happens in cases when elements in y_pred have type "str". - if not isinstance(Y, np.ndarray): - raise ValueError("y_pred should be an array of floats.") + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = np.append(1 - transformed_labels, + transformed_labels, axis=1) + + # Clipping + y_pred = np.clip(y_pred, eps, 1 - eps) # If y_pred is of single dimension, assume y_true to be binary # and then check. - if Y.ndim == 1: - Y = Y[:, np.newaxis] - if Y.shape[1] == 1: - Y = np.append(1 - Y, Y, axis=1) + if y_pred.ndim == 1: + y_pred = y_pred[:, np.newaxis] + if y_pred.shape[1] == 1: + y_pred = np.append(1 - y_pred, y_pred, axis=1) # Check if dimensions are consistent. - check_consistent_length(T, Y) - T = check_array(T) - Y = check_array(Y) - if T.shape[1] != Y.shape[1]: - raise ValueError("y_true and y_pred have different number of classes " - "%d, %d" % (T.shape[1], Y.shape[1])) + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_pred.shape[1]: + if labels is None: + raise ValueError("y_true and y_pred contain different number of classes " + "{0}, {1}. Please provide the true labels explicitly " + "through the labels argument. Classes found in" + "y_true: {2}".format(transformed_labels.shape[1], + y_pred.shape[1], lb.classes_)) + else: + raise ValueError('The number of classes in labels is different ' + 'from that in y_pred. Classes found in ' + 'labels: {0}'.format(lb.classes_)) # Renormalize - Y /= Y.sum(axis=1)[:, np.newaxis] - loss = -(T * np.log(Y)).sum(axis=1) + y_pred /= y_pred.sum(axis=1)[:, np.newaxis] + loss = -(transformed_labels * np.log(y_pred)).sum(axis=1) return _weighted_sum(loss, sample_weight, normalize) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index e578964b600f2..c8b06b9413e5e 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -45,7 +45,6 @@ from sklearn.metrics import zero_one_loss from sklearn.metrics import brier_score_loss - from sklearn.metrics.classification import _check_targets from sklearn.exceptions import UndefinedMetricWarning @@ -1384,6 +1383,32 @@ def test_log_loss(): loss = log_loss(y_true, y_pred) assert_almost_equal(loss, 1.0383217, decimal=6) + # test labels option + + y_true = [2, 2] + y_pred = [[0.2, 0.7], [0.6, 0.5]] + y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) + error_str = ('y_true contains only one label (2). 
+                 'the true labels explicitly through the labels argument.')
+    assert_raise_message(ValueError, error_str, log_loss, y_true, y_pred)
+
+    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.2, 0.3]]
+    error_str = ('Found arrays with inconsistent numbers of '
+                 'samples: [2 3]')
+    assert_raise_message(ValueError, error_str, log_loss, y_true, y_pred)
+
+    # works when the labels argument is used
+
+    true_log_loss = -np.mean(np.log(y_score[:, 1]))
+    calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2])
+    assert_almost_equal(calculated_log_loss, true_log_loss)
+
+    # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1]
+    y_true = [1, 2, 2]
+    y_score2 = [[0.2, 0.7, 0.3], [0.6, 0.5, 0.3], [0.3, 0.9, 0.1]]
+    loss = log_loss(y_true, y_score2, labels=[1, 2, 3])
+    assert_almost_equal(loss, 1.0630345, decimal=6)
+
 
 def test_log_loss_pandas_input():
     # case when input is a pandas series and dataframe gh-5715
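
For reviewers, a minimal usage sketch of the ``labels`` argument introduced by this patch, assuming the branch (or any scikit-learn release that ships it, 0.18 or later) is installed. The inputs and expected values are taken from the new test cases above; the variable names and printed comparisons are illustrative only::

    import numpy as np
    from sklearn.metrics import log_loss

    # y_true exercises only one class, so without ``labels`` log_loss now
    # raises a ValueError asking for the labels to be passed explicitly.
    y_true = [2, 2]
    y_score = np.array([[0.1, 0.9], [0.1, 0.9]])

    # Fitting the LabelBinarizer on labels=[1, 2] maps column 1 of y_score
    # to class 2, so the loss reduces to -mean(log(y_score[:, 1])).
    loss = log_loss(y_true, y_score, labels=[1, 2])
    print(loss)                             # ~0.105361
    print(-np.mean(np.log(y_score[:, 1])))  # same value

    # labels also covers the case where y_true does not exercise every
    # column of y_pred: three probability columns, but only classes 1 and 2
    # are observed.
    y_true = [1, 2, 2]
    y_score2 = [[0.2, 0.7, 0.3], [0.6, 0.5, 0.3], [0.3, 0.9, 0.1]]
    print(log_loss(y_true, y_score2, labels=[1, 2, 3]))  # ~1.0630345

Note that the rows of ``y_score2`` do not sum to one; ``log_loss`` renormalizes each row (``y_pred /= y_pred.sum(axis=1)[:, np.newaxis]``) before taking logarithms, which is why 1.0630345 is the value asserted in the test.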