From 64e30d6ae583649698a782dca7d40f87fa93b081 Mon Sep 17 00:00:00 2001
From: Gaurav Dhingra
Date: Sun, 5 Nov 2017 21:33:29 +0530
Subject: [PATCH 01/88] multiclass jaccard similarity not equal to accuracy_score

Fixes #7332
---
 sklearn/metrics/classification.py            | 11 ++++++++++-
 sklearn/metrics/tests/test_classification.py |  4 ++++
 sklearn/svm/base.py                          |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 7d8b887c66624..187f3a138cc5e 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -441,6 +441,13 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True,
     >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]),\
             np.ones((2, 2)))
     0.75
+
+    In the multiclass case:
+
+    >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat']
+    >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird']
+    >>> jaccard_similarity_score(y_true, y_pred)
+    0.38888888888888884
     """
 
     # Compute accuracy for each possible representation
@@ -454,7 +461,9 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True,
         score = pred_and_true / pred_or_true
         score[pred_or_true == 0.0] = 1.0
     else:
-        score = y_true == y_pred
+        C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
+        den = C.sum(0) + C.sum(1) - C.diagonal()
+        score = C.diagonal()/den
 
     return _weighted_sum(score, sample_weight, normalize)
 
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index c259036807f7f..520bc99277683 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -955,6 +955,10 @@ def test_multilabel_jaccard_similarity_score():
     assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0)
     assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0)
 
+    y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat']
+    y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird']
+    assert_almost_equal(jaccard_similarity_score(y1, y2), 7. / 18)
+
 
 @ignore_warnings
 def test_precision_recall_f1_score_multilabel_1():
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index 0b1719562cd57..7ffd373182957 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -554,7 +554,7 @@ def predict(self, X):
         # estimators.
def _check_proba(self): if not self.probability: - raise AttributeError("predict_proba is not available when " + raise AttributeError("predict_proba is not available when" " probability=False") if self._impl not in ('c_svc', 'nu_svc'): raise AttributeError("predict_proba only implemented for SVC" From a495cfc06491d5f292c970d70ba7acab2dffee8c Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 8 Nov 2017 21:51:17 +0530 Subject: [PATCH 02/88] add space and fix input --- sklearn/metrics/classification.py | 3 ++- sklearn/metrics/tests/test_classification.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 187f3a138cc5e..cbb71c7abfc0f 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -463,11 +463,12 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal()/den + score = C.diagonal() / den return _weighted_sum(score, sample_weight, normalize) + def matthews_corrcoef(y_true, y_pred, sample_weight=None): """Compute the Matthews correlation coefficient (MCC) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 520bc99277683..a5ce523583b03 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -957,7 +957,7 @@ def test_multilabel_jaccard_similarity_score(): y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] - assert_almost_equal(jaccard_similarity_score(y1, y2), 7. / 18) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. 
/ 18) @ignore_warnings From fcba7f05122924a85f6f3303674fa19c4db9aef2 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 10 Nov 2017 14:17:37 +0530 Subject: [PATCH 03/88] score being a n_class size array and weight already taken care of --- sklearn/metrics/classification.py | 10 ++++++---- sklearn/metrics/tests/test_classification.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index cbb71c7abfc0f..382fa3c2c34d8 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -434,7 +434,7 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, >>> jaccard_similarity_score(y_true, y_pred) 0.5 >>> jaccard_similarity_score(y_true, y_pred, normalize=False) - 2 + 2.0 In the multilabel case with binary label indicators: @@ -460,13 +460,15 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) score = pred_and_true / pred_or_true score[pred_or_true == 0.0] = 1.0 + return _weighted_sum(score, sample_weight, normalize) else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) den = C.sum(0) + C.sum(1) - C.diagonal() score = C.diagonal() / den - - return _weighted_sum(score, sample_weight, normalize) - + if normalize: + return np.average(score) + else: + return np.sum(score) def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a5ce523583b03..6d5d287d43cc5 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -957,6 +957,8 @@ def test_multilabel_jaccard_similarity_score(): y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + normalize=False), 7. / 6) assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 18) From d49ccab3559c7d8a4258f7c2f5b3c39119706059 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 24 Nov 2017 17:26:52 +0530 Subject: [PATCH 04/88] add space to fix printing of doctest --- sklearn/metrics/classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 382fa3c2c34d8..eb8b174278697 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1021,6 +1021,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, It is possible to compute per-label precisions, recalls, F1-scores and supports instead of averaging: + >>> precision_recall_fscore_support(y_true, y_pred, average=None, ... labels=['pig', 'dog', 'cat']) ... 
# doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE From 615ac9ae335000bdb1ea8c1ae4980a91b9d891b3 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 24 Nov 2017 23:52:03 +0530 Subject: [PATCH 05/88] add support for 'average' of type 'macro', 'micro', 'weighted' --- sklearn/metrics/classification.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index eb8b174278697..22faa630d53b3 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -451,6 +451,9 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, """ # Compute accuracy for each possible representation + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options: + raise ValueError("average has to be one of " + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) check_consistent_length(y_true, y_pred, sample_weight) if y_type.startswith('multilabel'): @@ -464,11 +467,20 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal() / den - if normalize: + if average == 'macro': + den = C.sum(0) + C.sum(1) - C.diagonal() + score = C.diagonal() / den return np.average(score) - else: - return np.sum(score) + elif average == 'micro': + den = 2*np.sum(C) - np.sum(C.diagonal()) + score = np.sum(C.diagonal()) + return score / den + elif average == 'weighted': + den = C.sum(0) + C.sum(1) - C.diagonal() + score = C.diagonal()/den + if sample_weight == None: + sample_weight = C.sum(0)/C.sum() + return np.sum(sample_weight*score) def matthews_corrcoef(y_true, y_pred, sample_weight=None): From 78b2a846d9c79fbf56ebada7b3acfe07a9fde535 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 25 Nov 2017 12:01:23 +0530 Subject: [PATCH 06/88] add tests and make documentation changes --- doc/modules/model_evaluation.rst | 10 ++-- sklearn/metrics/classification.py | 52 +++++++++++--------- sklearn/metrics/tests/test_classification.py | 9 +++- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 4a19e27e9c11c..784b505dddbae 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -676,10 +676,14 @@ score is equal to the classification accuracy. >>> from sklearn.metrics import jaccard_similarity_score >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] - >>> jaccard_similarity_score(y_true, y_pred) + >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 - >>> jaccard_similarity_score(y_true, y_pred, normalize=False) - 2 + >>> jaccard_similarity_score(y_true, y_pred, average='micro') + 0.33... + >>> jaccard_similarity_score(y_true, y_pred, average='weighted') + 0.5 + >>> jaccard_similarity_score(y_true, y_pred) + array([ 1., 0., 0., 1. ]) In the multilabel case with binary label indicators: :: diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 22faa630d53b3..12e04c4dc9311 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -372,7 +372,7 @@ class labels [2]_. 
return 1 - k -def jaccard_similarity_score(y_true, y_pred, normalize=True, +def jaccard_similarity_score(y_true, y_pred, average=None, sample_weight=None): """Jaccard similarity coefficient score @@ -391,23 +391,28 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. - normalize : bool, optional (default=True) - If ``False``, return the sum of the Jaccard similarity coefficient - over the sample set. Otherwise, return the average of Jaccard - similarity coefficient. - sample_weight : array-like of shape = [n_samples], optional Sample weights. + average : string, [None (default), 'micro', 'macro', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + Returns ------- - score : float - If ``normalize == True``, return the average Jaccard similarity - coefficient, else it returns the sum of the Jaccard similarity - coefficient over the sample set. - - The best performance is 1 with ``normalize == True`` and the number - of samples with ``normalize == False``. + score: float (if average is not None) or array of float, shape =\ + [n_unique_labels] See also -------- @@ -415,26 +420,24 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, Notes ----- - In binary and multiclass classification, this function is equivalent - to the ``accuracy_score``. It differs in the multilabel classification - problem. + In differs in implementation from ``accuracy_score`` from all three + classifications i.e. binary, mutliclass and multilabel. References ---------- .. [1] `Wikipedia entry for the Jaccard index `_ - Examples -------- >>> import numpy as np >>> from sklearn.metrics import jaccard_similarity_score >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] - >>> jaccard_similarity_score(y_true, y_pred) + >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 - >>> jaccard_similarity_score(y_true, y_pred, normalize=False) - 2.0 + >>> jaccard_similarity_score(y_true, y_pred, normalize='micro') + 0.33... 
In the multilabel case with binary label indicators: @@ -450,12 +453,14 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, 0.38888888888888884 """ - # Compute accuracy for each possible representation average_options = (None, 'micro', 'macro', 'weighted') if average not in average_options: raise ValueError("average has to be one of " + str(average_options)) + + # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_targets(y_true, y_pred) check_consistent_length(y_true, y_pred, sample_weight) + if y_type.startswith('multilabel'): with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error here @@ -466,7 +471,6 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, return _weighted_sum(score, sample_weight, normalize) else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) - den = C.sum(0) + C.sum(1) - C.diagonal() if average == 'macro': den = C.sum(0) + C.sum(1) - C.diagonal() score = C.diagonal() / den @@ -481,6 +485,10 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, if sample_weight == None: sample_weight = C.sum(0)/C.sum() return np.sum(sample_weight*score) + else: + den = C.sum(0) + C.sum(1) - C.diagonal() + score = C.diagonal() / den + return score def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 6d5d287d43cc5..f6ebc5abfac76 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -958,8 +958,13 @@ def test_multilabel_jaccard_similarity_score(): y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - normalize=False), 7. / 6) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 18) + average='macro'), 7. / 18) + assert_equal(jaccard_similarity_score(y_true, y_pred, + average='micro'), 1. / 2) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='weighted'), 7. / 12) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred), + np.array([2. / 3, 0., 1. 
/ 2])) @ignore_warnings From 41f7e2ba2113ef0e59ed08a4945645a2d98b9b5f Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 29 Nov 2017 12:14:22 +0530 Subject: [PATCH 07/88] use 'average' for 'multilabel' classification --- sklearn/metrics/classification.py | 22 ++++++++++++++------ sklearn/metrics/tests/test_classification.py | 10 +++++++++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 12e04c4dc9311..819d4a8960f35 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -463,12 +463,22 @@ def jaccard_similarity_score(y_true, y_pred, average=None, if y_type.startswith('multilabel'): with np.errstate(divide='ignore', invalid='ignore'): - # oddly, we may get an "invalid" rather than a "divide" error here - pred_or_true = count_nonzero(y_true + y_pred, axis=1) - pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] = 1.0 - return _weighted_sum(score, sample_weight, normalize) + pred_or_true = count_nonzero(y_true + y_pred, axis=0) + pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=0) + if average == 'macro': + score = pred_and_true / pred_or_true + n_features = y_true.shape[1] + return np.sum(score) / n_features + elif average == 'micro': + score = np.sum(pred_and_true) / np.sum(pred_or_true) + return score + elif average == 'weighted': + score = pred_and_true / pred_or_true + score = _weighted_sum(score, sample_weight, normalize=True) + return score + else: + score = pred_and_true / pred_or_true + return score else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) if average == 'macro': diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index f6ebc5abfac76..8909ada0f54bd 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -966,6 +966,16 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred), np.array([2. / 3, 0., 1. / 2])) + # multilabel testing + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro'), 2. / 3) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='mirco'), 3. / 5) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred), + np.array([1. / 2, 1., 1. / 2])) + @ignore_warnings def test_precision_recall_f1_score_multilabel_1(): From a7d01118f38b8295bf54521ce5a5dc2b1633cb1b Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 29 Nov 2017 18:43:57 +0530 Subject: [PATCH 08/88] introduce average='binary', average='samples' --- sklearn/metrics/classification.py | 44 ++++++++++++++++++-- sklearn/metrics/tests/test_classification.py | 10 ++++- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 819d4a8960f35..4312b73ec6bdd 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -372,7 +372,7 @@ class labels [2]_. 
return 1 - k -def jaccard_similarity_score(y_true, y_pred, average=None, +def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, sample_weight=None): """Jaccard similarity coefficient score @@ -394,10 +394,14 @@ def jaccard_similarity_score(y_true, y_pred, average=None, sample_weight : array-like of shape = [n_samples], optional Sample weights. - average : string, [None (default), 'micro', 'macro', 'weighted'] + average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + 'weighted',] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. @@ -408,6 +412,9 @@ def jaccard_similarity_score(y_true, y_pred, average=None, Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification). Returns ------- @@ -420,7 +427,7 @@ def jaccard_similarity_score(y_true, y_pred, average=None, Notes ----- - In differs in implementation from ``accuracy_score`` from all three + It differs in implementation from ``accuracy_score`` from all three classifications i.e. binary, mutliclass and multilabel. References @@ -453,18 +460,47 @@ def jaccard_similarity_score(y_true, y_pred, average=None, 0.38888888888888884 """ - average_options = (None, 'micro', 'macro', 'weighted') + average_options = (None, 'binary', 'micro', 'macro', 'weighted', 'samples') if average not in average_options: raise ValueError("average has to be one of " + str(average_options)) # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_targets(y_true, y_pred) check_consistent_length(y_true, y_pred, sample_weight) + present_labels = unique_labels(y_true, y_pred) if y_type.startswith('multilabel'): + # default average in multilabel is 'samples' + if average == None: + average = 'samples' + with np.errstate(divide='ignore', invalid='ignore'): + + if average == 'samples': + pred_or_true = count_nonzero(y_true + y_pred, axis=1) + pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) + score = pred_and_true / pred_or_true + score[pred_or_true == 0.0] = 1.0 + return _weighted_sum(score, sample_weight, normalize=True) + + if average == 'binary': + if pos_label not in present_labels: + if len(present_labels) < 2: + # only -ve labels + return 0. 
+ else: + raise ValueError("pos_label=%r is not a valid label" + ": %r" % (pos_label, present_labels)) + y_true_pos = y_true[:, pos_label - 1] + y_pred_pos = y_pred[:, pos_label - 1] + pred_or_true = count_nonzero(y_true_pos + y_pred_pos) + pred_and_true = count_nonzero(y_true_pos.multiply(y_pred_pos)) + score = pred_and_true / pred_or_true + return score + pred_or_true = count_nonzero(y_true + y_pred, axis=0) pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=0) + if average == 'macro': score = pred_and_true / pred_or_true n_features = y_true.shape[1] diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 8909ada0f54bd..5dda988af792f 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -969,12 +969,18 @@ def test_multilabel_jaccard_similarity_score(): # multilabel testing y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + # average='macro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='macro'), 2. / 3) + # average='micro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='mirco'), 3. / 5) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred), - np.array([1. / 2, 1., 1. / 2])) + # average='samples' (default) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 12) + # average='binary' + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='binary', pos_label=1), 1. / 2) + # average='weighted' @ignore_warnings From 057815a168fa4afe99be3e08c5cc875533c5cf77 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 1 Dec 2017 08:21:34 +0530 Subject: [PATCH 09/88] show errors and warning before anything this deals with both multilabel and multiclass problems --- sklearn/metrics/classification.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 4312b73ec6bdd..9d9b432341fcc 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -469,6 +469,25 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, check_consistent_length(y_true, y_pred, sample_weight) present_labels = unique_labels(y_true, y_pred) + if average == 'binary': + if y_type == 'binary': + if pos_label not in present_labels: + if len(present_labels) < 2: + # only -ve labels + return 0. + else: + raise ValueError("pos_label=%r is not a valid label: " + "%r" % (pos_label, present_labels)) + labels = [pos_label] + else: + raise ValueError("Target is %s but average='binary'. Please " + "choose another average setting." % y_type) + elif pos_label not in (None, 1): + warnings.warn("Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." + % (pos_label, average), UserWarning) + if y_type.startswith('multilabel'): # default average in multilabel is 'samples' if average == None: @@ -483,14 +502,6 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, score[pred_or_true == 0.0] = 1.0 return _weighted_sum(score, sample_weight, normalize=True) - if average == 'binary': - if pos_label not in present_labels: - if len(present_labels) < 2: - # only -ve labels - return 0. 
- else: - raise ValueError("pos_label=%r is not a valid label" - ": %r" % (pos_label, present_labels)) y_true_pos = y_true[:, pos_label - 1] y_pred_pos = y_pred[:, pos_label - 1] pred_or_true = count_nonzero(y_true_pos + y_pred_pos) From f1bd76fdcb3fcd829ed01bd8157752a20d12ebda Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 1 Dec 2017 10:50:14 +0530 Subject: [PATCH 10/88] write separate functions --- sklearn/metrics/classification.py | 1 - sklearn/metrics/tests/test_classification.py | 43 ++++++++++++++------ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 9d9b432341fcc..aad42d3670977 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -473,7 +473,6 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, if y_type == 'binary': if pos_label not in present_labels: if len(present_labels) < 2: - # only -ve labels return 0. else: raise ValueError("pos_label=%r is not a valid label: " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 5dda988af792f..3fcfba73fbddb 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -939,6 +939,24 @@ def test_multilabel_hamming_loss(): assert_warns(DeprecationWarning, hamming_loss, y1, y2, classes=[0, 1]) +@ignore_warnings +def test_jaccard_similarity_score(): + y_true = np.array([0, 0]) + y_pred = np.array([0, 0]) + assert_equal(jaccard_similarity_score(y_true, y_pred, average='binary', + pos_label=-1), 0.) + + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, + average='binary', pos_label=-1) + + y_true = np.array([0, 1, 1, 0, 2]) + y_pred = np.array([1, 1, 1, 1, 0]) + assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, + average='binary') + + def test_multilabel_jaccard_similarity_score(): # Dense label indicator matrix format y1 = np.array([[0, 1, 1], [1, 0, 1]]) @@ -955,18 +973,6 @@ def test_multilabel_jaccard_similarity_score(): assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0) assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0) - y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] - y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro'), 7. / 18) - assert_equal(jaccard_similarity_score(y_true, y_pred, - average='micro'), 1. / 2) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='weighted'), 7. / 12) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred), - np.array([2. / 3, 0., 1. / 2])) - - # multilabel testing y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) # average='macro' @@ -983,6 +989,19 @@ def test_multilabel_jaccard_similarity_score(): # average='weighted' +def test_multiclass_jaccard_similarity_score(): + y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] + y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro'), 7. / 18) + assert_equal(jaccard_similarity_score(y_true, y_pred, + average='micro'), 1. / 2) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='weighted'), 7. / 12) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred), + np.array([2. / 3, 0., 1. 
/ 2])) + + @ignore_warnings def test_precision_recall_f1_score_multilabel_1(): # Test precision_recall_f1_score on a crafted multilabel example From 581d540e75c9e73a76edd5faf9c3a4c9ea02e840 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 2 Dec 2017 16:22:04 +0530 Subject: [PATCH 11/88] completely okay API and improved doctest --- sklearn/metrics/classification.py | 35 +++++++++++++++----- sklearn/metrics/tests/test_classification.py | 7 ++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index aad42d3670977..7c51c5acc6452 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -372,8 +372,8 @@ class labels [2]_. return 1 - k -def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, - sample_weight=None): +def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, + average=None, warn=True, sample_weight=None): """Jaccard similarity coefficient score The Jaccard index [1], or Jaccard similarity coefficient, defined as @@ -391,8 +391,20 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. - sample_weight : array-like of shape = [n_samples], optional - Sample weights. + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ 'weighted',] @@ -416,6 +428,12 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + warn : bool, for internal use + This determines whether warning will be raised or not. + Returns ------- score: float (if average is not None) or array of float, shape =\ @@ -444,20 +462,21 @@ def jaccard_similarity_score(y_true, y_pred, pos_label=1, average=None, >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 >>> jaccard_similarity_score(y_true, y_pred, normalize='micro') + ... # doctest: +ELLIPSIS 0.33... In the multilabel case with binary label indicators: - >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]),\ - np.ones((2, 2))) + >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]), + ... np.ones((2, 2))) 0.75 In the multiclass case: >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] - >>> jaccard_similarity_score(y_true, y_pred) - 0.38888888888888884 + >>> jaccard_similarity_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.388... 
""" average_options = (None, 'binary', 'micro', 'macro', 'weighted', 'samples') diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 3fcfba73fbddb..361484f962557 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -956,6 +956,13 @@ def test_jaccard_similarity_score(): assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, average='binary') + assert_warns_message(UserWarning, + "Note that pos_label (set to 3) is ignored when" + "average != 'binary' (got None). You may use " + "labels=[pos_label] to specify a single positive" + "class.", jaccard_similarity_score, y_true, y_pred, + pos_label=3) + def test_multilabel_jaccard_similarity_score(): # Dense label indicator matrix format From aefe9216088eff084f3d5402b23f8bb98c5bdfb8 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 2 Dec 2017 18:05:02 +0530 Subject: [PATCH 12/88] fix lgtm error and better control flow --- sklearn/metrics/classification.py | 34 +++++++++----------- sklearn/metrics/tests/test_classification.py | 6 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7c51c5acc6452..3e56533f524e2 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -506,43 +506,41 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) + if labels is None: + labels = present_labels + n_labels = None + else: + n_labels = len(labels) + labels = np.hstack([labels, np.setdiff1d(present_labels, labels, + assume_unique=True)]) + if y_type.startswith('multilabel'): # default average in multilabel is 'samples' if average == None: average = 'samples' with np.errstate(divide='ignore', invalid='ignore'): + sum_axis = 1 if average == 'samples' else 0 + + pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=sum_axis) if average == 'samples': - pred_or_true = count_nonzero(y_true + y_pred, axis=1) - pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) score = pred_and_true / pred_or_true score[pred_or_true == 0.0] = 1.0 return _weighted_sum(score, sample_weight, normalize=True) - - y_true_pos = y_true[:, pos_label - 1] - y_pred_pos = y_pred[:, pos_label - 1] - pred_or_true = count_nonzero(y_true_pos + y_pred_pos) - pred_and_true = count_nonzero(y_true_pos.multiply(y_pred_pos)) - score = pred_and_true / pred_or_true - return score - - pred_or_true = count_nonzero(y_true + y_pred, axis=0) - pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=0) - - if average == 'macro': + elif average == 'macro': score = pred_and_true / pred_or_true n_features = y_true.shape[1] return np.sum(score) / n_features elif average == 'micro': score = np.sum(pred_and_true) / np.sum(pred_or_true) return score - elif average == 'weighted': - score = pred_and_true / pred_or_true - score = _weighted_sum(score, sample_weight, normalize=True) - return score else: + # average='weighted' score = pred_and_true / pred_or_true + score = _weighted_sum(score, sample_weight, normalize=True) return score else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 361484f962557..8a6d6ef9589fa 100644 --- 
a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -987,10 +987,12 @@ def test_multilabel_jaccard_similarity_score(): average='macro'), 2. / 3) # average='micro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='mirco'), 3. / 5) + average='micro'), 3. / 5) # average='samples' (default) assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 12) - # average='binary' + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + sample_weight=np.array([0.1, 0.9])), 31. / 60) + # average='binary' (wrong example) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='binary', pos_label=1), 1. / 2) # average='weighted' From 83df9581aa9a05b6da121207603264bc1f83ebfa Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 2 Dec 2017 19:12:30 +0530 Subject: [PATCH 13/88] add normalize in API --- sklearn/metrics/classification.py | 32 +++++++++++++++----- sklearn/metrics/tests/test_classification.py | 8 ++--- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 3e56533f524e2..cc62df2ea3ba2 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,8 @@ class labels [2]_. def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average=None, warn=True, sample_weight=None): + average=None, warn=True, normalize=None, + sample_weight=None): """Jaccard similarity coefficient score The Jaccard index [1], or Jaccard similarity coefficient, defined as @@ -428,12 +429,17 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). + warn : bool, for internal use + This determines whether warning will be raised or not, + + normalize: True, False or None (default) + whether to normalize (default) the result or return an array + (specified with `normalize=False`). This is only to be specified + in case `average='samples'`. + sample_weight : array-like of shape = [n_samples], optional Sample weights. - warn : bool, for internal use - This determines whether warning will be raised or not. - Returns ------- score: float (if average is not None) or array of float, shape =\ @@ -461,7 +467,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> y_true = [0, 1, 2, 3] >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 - >>> jaccard_similarity_score(y_true, y_pred, normalize='micro') + >>> jaccard_similarity_score(y_true, y_pred, average='micro') ... # doctest: +ELLIPSIS 0.33... @@ -506,6 +512,17 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) + if average == None: + average = 'samples' + + if average == 'samples': + if normalize is None: + normalize = True + elif normalize is not None: + warnings.warn("Note that normalize (set to %r) is ignored when " + "average != 'samples' (got %r)." 
+ % (normalize, average), UserWarning) + if labels is None: labels = present_labels n_labels = None @@ -516,8 +533,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if y_type.startswith('multilabel'): # default average in multilabel is 'samples' - if average == None: - average = 'samples' with np.errstate(divide='ignore', invalid='ignore'): sum_axis = 1 if average == 'samples' else 0 @@ -529,7 +544,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if average == 'samples': score = pred_and_true / pred_or_true score[pred_or_true == 0.0] = 1.0 - return _weighted_sum(score, sample_weight, normalize=True) + return _weighted_sum(score, sample_weight, normalize=normalize) elif average == 'macro': score = pred_and_true / pred_or_true n_features = y_true.shape[1] @@ -559,6 +574,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, sample_weight = C.sum(0)/C.sum() return np.sum(sample_weight*score) else: + # average='samples' den = C.sum(0) + C.sum(1) - C.diagonal() score = C.diagonal() / den return score diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 8a6d6ef9589fa..85cea82730c55 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -957,9 +957,9 @@ def test_jaccard_similarity_score(): average='binary') assert_warns_message(UserWarning, - "Note that pos_label (set to 3) is ignored when" + "Note that pos_label (set to 3) is ignored when " "average != 'binary' (got None). You may use " - "labels=[pos_label] to specify a single positive" + "labels=[pos_label] to specify a single positive " "class.", jaccard_similarity_score, y_true, y_pred, pos_label=3) @@ -993,8 +993,8 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred, sample_weight=np.array([0.1, 0.9])), 31. / 60) # average='binary' (wrong example) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='binary', pos_label=1), 1. / 2) +# assert_almost_equal(jaccard_similarity_score(y_true, y_pred, +# average='binary', pos_label=1), 1. / 2) # average='weighted' From 041c668afb913c09b647769f14dcccec0ce2bdf8 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 12 Dec 2017 16:18:47 +0530 Subject: [PATCH 14/88] raise ValueError for not-providing 'avergae' in multiclass --- sklearn/metrics/classification.py | 42 ++++++++------------ sklearn/metrics/tests/test_classification.py | 19 +++++---- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index cc62df2ea3ba2..e8d8130dd9496 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -408,7 +408,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, scores for that label only. average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ - 'weighted',] + 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -429,11 +429,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). 
- warn : bool, for internal use + warn : bool, optional (default=True), for internal use This determines whether warning will be raised or not, normalize: True, False or None (default) - whether to normalize (default) the result or return an array + whether to normalize the result or return an array (specified with `normalize=False`). This is only to be specified in case `average='samples'`. @@ -449,11 +449,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, -------- accuracy_score, hamming_loss, zero_one_loss - Notes - ----- - It differs in implementation from ``accuracy_score`` from all three - classifications i.e. binary, mutliclass and multilabel. - References ---------- .. [1] `Wikipedia entry for the Jaccard index @@ -485,8 +480,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, 0.388... """ - average_options = (None, 'binary', 'micro', 'macro', 'weighted', 'samples') - if average not in average_options: + average_options = (None, 'micro', 'macro', 'weighted', 'samples') + if average not in average_options and average != 'binary': raise ValueError("average has to be one of " + str(average_options)) # Compute accuracy for each possible representation @@ -512,17 +507,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) - if average == None: - average = 'samples' - - if average == 'samples': - if normalize is None: - normalize = True - elif normalize is not None: - warnings.warn("Note that normalize (set to %r) is ignored when " - "average != 'samples' (got %r)." - % (normalize, average), UserWarning) - if labels is None: labels = present_labels n_labels = None @@ -532,6 +516,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): + if average is None: + average = 'samples' + if average == 'samples' and normalize is None: + normalize = True # default average in multilabel is 'samples' with np.errstate(divide='ignore', invalid='ignore'): @@ -573,11 +561,15 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if sample_weight == None: sample_weight = C.sum(0)/C.sum() return np.sum(sample_weight*score) +# else: +# # average='samples' +# den = C.sum(0) + C.sum(1) - C.diagonal() +# score = C.diagonal() / den +# return score else: - # average='samples' - den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal() / den - return score + raise ValueError("In multiclass classification average must be " + "one of ('micro', 'macro', 'weighted'), got " + "average=%s." % average) def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 85cea82730c55..ae5c2552e2384 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -956,12 +956,13 @@ def test_jaccard_similarity_score(): assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, average='binary') - assert_warns_message(UserWarning, - "Note that pos_label (set to 3) is ignored when " - "average != 'binary' (got None). You may use " - "labels=[pos_label] to specify a single positive " - "class.", jaccard_similarity_score, y_true, y_pred, - pos_label=3) + # test 'pos_label' +# assert_warns_message(UserWarning, +# "Note that pos_label (set to 3) is ignored when " +# "average != 'binary' (got None). 
You may use " +# "labels=[pos_label] to specify a single positive " +# "class.", jaccard_similarity_score, y_true, y_pred, +# pos_label=3) def test_multilabel_jaccard_similarity_score(): @@ -1007,8 +1008,10 @@ def test_multiclass_jaccard_similarity_score(): average='micro'), 1. / 2) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 7. / 12) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred), - np.array([2. / 3, 0., 1. / 2])) + msg = ("In multiclass classification average must be one of " + "('micro', 'macro', 'weighted'), got average=None.") + assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, + y_pred) @ignore_warnings From 39b92b13ea9416646a14f7ded8ad50a0ee7d5879 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 12 Dec 2017 18:25:01 +0530 Subject: [PATCH 15/88] fixed errors with multiclass for different average values --- sklearn/metrics/classification.py | 20 ++++++++++---------- sklearn/metrics/tests/test_classification.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index e8d8130dd9496..3786037229c1d 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -476,8 +476,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] - >>> jaccard_similarity_score(y_true, y_pred) # doctest: +ELLIPSIS - 0.388... + >>> jaccard_similarity_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + 0.4722... """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') @@ -556,16 +557,15 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = np.sum(C.diagonal()) return score / den elif average == 'weighted': + # computation similar to average='macro', apart from computation + # of sample_weight below den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal()/den - if sample_weight == None: - sample_weight = C.sum(0)/C.sum() + score = C.diagonal() / den + if sample_weight is None: + _, y_true = np.unique(y_true, return_inverse=True) + num = np.bincount(y_true) + sample_weight = num / np.sum(num) return np.sum(sample_weight*score) -# else: -# # average='samples' -# den = C.sum(0) + C.sum(1) - C.diagonal() -# score = C.diagonal() / den -# return score else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index ae5c2552e2384..b36c42bf523d7 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1007,7 +1007,7 @@ def test_multiclass_jaccard_similarity_score(): assert_equal(jaccard_similarity_score(y_true, y_pred, average='micro'), 1. / 2) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='weighted'), 7. / 12) + average='weighted'), 17. 
/ 36) msg = ("In multiclass classification average must be one of " "('micro', 'macro', 'weighted'), got average=None.") assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, From a0712b548285a8a6550829ebeda807c329a7b880 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 12 Dec 2017 18:48:35 +0530 Subject: [PATCH 16/88] fix tests, use assert_raise_message instead --- sklearn/metrics/tests/test_classification.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index b36c42bf523d7..86d6cd26a23b9 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -948,21 +948,22 @@ def test_jaccard_similarity_score(): y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) - assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, - average='binary', pos_label=-1) + msg1 = ("Target is multilabel-indicator but average='binary'. " + "Please choose another average setting.") + assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, + y_pred, average='binary', pos_label=-1) y_true = np.array([0, 1, 1, 0, 2]) y_pred = np.array([1, 1, 1, 1, 0]) assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, average='binary') - # test 'pos_label' -# assert_warns_message(UserWarning, -# "Note that pos_label (set to 3) is ignored when " -# "average != 'binary' (got None). You may use " -# "labels=[pos_label] to specify a single positive " -# "class.", jaccard_similarity_score, y_true, y_pred, -# pos_label=3) + assert_warns_message(UserWarning, + "Note that pos_label (set to 3) is ignored when " + "average != 'binary' (got 'micro'). 
You may use " + "labels=[pos_label] to specify a single positive " + "class.", jaccard_similarity_score, y_true, y_pred, + average='micro', pos_label=3) def test_multilabel_jaccard_similarity_score(): From 113072a2de84bdde1f401772896f6bdacb18c253 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 15 Dec 2017 15:25:18 +0530 Subject: [PATCH 17/88] add common_test for jaccard_similarity_score --- sklearn/metrics/tests/test_common.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index e68f4024b24af..08fc4429533dc 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -127,24 +127,32 @@ "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2), "weighted_precision_score": partial(precision_score, average="weighted"), "weighted_recall_score": partial(recall_score, average="weighted"), + "weighted_jaccard_similarity_score": + partial(jaccard_similarity_score, average="weighted"), "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5), "micro_f1_score": partial(f1_score, average="micro"), "micro_f2_score": partial(fbeta_score, average="micro", beta=2), "micro_precision_score": partial(precision_score, average="micro"), "micro_recall_score": partial(recall_score, average="micro"), + "micro_jaccard_similarity_score": + partial(jaccard_similarity_score, average="micro"), "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5), "macro_f1_score": partial(f1_score, average="macro"), "macro_f2_score": partial(fbeta_score, average="macro", beta=2), "macro_precision_score": partial(precision_score, average="macro"), "macro_recall_score": partial(recall_score, average="macro"), + "macro_jaccard_similarity_score": + partial(jaccard_similarity_score, average="macro"), "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5), "samples_f1_score": partial(f1_score, average="samples"), "samples_f2_score": partial(fbeta_score, average="samples", beta=2), "samples_precision_score": partial(precision_score, average="samples"), "samples_recall_score": partial(recall_score, average="samples"), + "samples_jaccard_similarity_score": + partial(jaccard_similarity_score, average="samples"), "cohen_kappa_score": cohen_kappa_score, } @@ -200,6 +208,7 @@ "samples_precision_score", "samples_recall_score", "coverage_error", + "samples_jaccard_similarity_score", "average_precision_score", "weighted_average_precision_score", @@ -222,6 +231,8 @@ "macro_roc_auc", "samples_roc_auc", + "samples_jaccard_similarity_score", + # with default average='binary', multiclass is prohibited "precision_score", "recall_score", @@ -236,7 +247,8 @@ # Metrics with an "average" argument METRICS_WITH_AVERAGING = [ - "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score" + "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + "jaccard_similarity_score" ] # Threshold-based metrics with an "average" argument @@ -272,15 +284,19 @@ "hamming_loss", "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", + "jaccard_similarity_score", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", "weighted_precision_score", "weighted_recall_score", + "weighted_jaccard_similarity_score", "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", + "micro_jaccard_similarity_score", "macro_f0.5_score", "macro_f1_score", "macro_f2_score", 
"macro_precision_score", "macro_recall_score", + "macro_jaccard_similarity_score", "cohen_kappa_score", ] @@ -316,15 +332,19 @@ "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", "weighted_precision_score", "weighted_recall_score", + "weighted_jaccard_similarity_score", "macro_f0.5_score", "macro_f1_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", + "macro_jaccard_similarity_score", "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", + "micro_jaccard_similarity_score", "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", + "samples_jaccard_similarity_score" ] # Regression metrics with "multioutput-continuous" format support @@ -341,6 +361,8 @@ "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "zero_one_loss", "unnormalized_zero_one_loss", + "micro_jaccard_similarity_score", "macro_jaccard_similarity_score", + "f1_score", "micro_f1_score", "macro_f1_score", "weighted_recall_score", # P = R = F = accuracy in multiclass case From c52d5774688c0edaa23c393385c2535ca3fb217c Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 17 Dec 2017 15:00:19 +0530 Subject: [PATCH 18/88] use `average='none-samples'` instead of 'normalize=False' --- sklearn/metrics/classification.py | 8 +++++++- sklearn/metrics/tests/test_classification.py | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 3786037229c1d..c16b233c6f6bd 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -428,6 +428,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). + ``'none-samples'``: warn : bool, optional (default=True), for internal use This determines whether warning will be raised or not, @@ -437,6 +438,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, (specified with `normalize=False`). This is only to be specified in case `average='samples'`. + .. versionchanged: 0.20 + 'normalize' is deprecated and will be removed in 0.22, instead use + `average='none-samples'` + sample_weight : array-like of shape = [n_samples], optional Sample weights. @@ -481,7 +486,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, 0.4722... """ - average_options = (None, 'micro', 'macro', 'weighted', 'samples') + average_options = (None, 'micro', 'macro', 'weighted', 'samples', + 'none-samples') if average not in average_options and average != 'binary': raise ValueError("average has to be one of " + str(average_options)) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 86d6cd26a23b9..a95382a9b1d96 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -955,8 +955,10 @@ def test_jaccard_similarity_score(): y_true = np.array([0, 1, 1, 0, 2]) y_pred = np.array([1, 1, 1, 1, 0]) - assert_raises(ValueError, jaccard_similarity_score, y_true, y_pred, - average='binary') + msg2 = ("Target is multiclass but average='binary'. 
Please choose " + "another average setting.") + assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, + y_pred, average='binary') assert_warns_message(UserWarning, "Note that pos_label (set to 3) is ignored when " From 2e2d762f6809c131e0c5fe528d6e30856796b981 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 17 Dec 2017 18:45:43 +0530 Subject: [PATCH 19/88] average='micro' in multiclass case is equivalent to accuracy_score --- sklearn/metrics/classification.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index c16b233c6f6bd..5d4a70e237418 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -559,9 +559,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = C.diagonal() / den return np.average(score) elif average == 'micro': - den = 2*np.sum(C) - np.sum(C.diagonal()) - score = np.sum(C.diagonal()) - return score / den + # micro-average on all labels is not useful in the + # multiclass case. It is identical to accuracy. + score = y_true == y_pred + return _weighted_sum(score, sample_weight, normalize) elif average == 'weighted': # computation similar to average='macro', apart from computation # of sample_weight below From 5504a00c1ac7f087a2fa653464bc49c95eab74d7 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 17 Dec 2017 19:33:18 +0530 Subject: [PATCH 20/88] fixes to multilabel case --- sklearn/metrics/classification.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 5d4a70e237418..5264d5c679c8e 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,7 @@ class labels [2]_. def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average=None, warn=True, normalize=None, + average=None, warn=True, normalize=True, sample_weight=None): """Jaccard similarity coefficient score @@ -433,10 +433,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, warn : bool, optional (default=True), for internal use This determines whether warning will be raised or not, - normalize: True, False or None (default) - whether to normalize the result or return an array - (specified with `normalize=False`). This is only to be specified - in case `average='samples'`. + normalize: bool, optional (defaul=True) + If ``False``, return the sum of the Jaccard similarity coefficient + over the sample set. Otherwise, return the average of Jaccard + similarity coefficient. This is only to be specified in case + `average='samples'`. .. versionchanged: 0.20 'normalize' is deprecated and will be removed in 0.22, instead use @@ -463,13 +464,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, -------- >>> import numpy as np >>> from sklearn.metrics import jaccard_similarity_score - >>> y_pred = [0, 2, 1, 3] - >>> y_true = [0, 1, 2, 3] - >>> jaccard_similarity_score(y_true, y_pred, average='macro') - 0.5 - >>> jaccard_similarity_score(y_true, y_pred, average='micro') - ... # doctest: +ELLIPSIS - 0.33... 
In the multilabel case with binary label indicators: @@ -479,6 +473,13 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, In the multiclass case: + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> jaccard_similarity_score(y_true, y_pred, average='macro') + 0.5 + >>> jaccard_similarity_score(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS + >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] >>> jaccard_similarity_score(y_true, y_pred, average='weighted') From b30ba53c69cc410975ed9bac2182a91aeb903ca8 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 17 Dec 2017 21:06:11 +0530 Subject: [PATCH 21/88] add error message for `average='samples'` for non-multilable case --- sklearn/metrics/classification.py | 4 ++++ sklearn/metrics/tests/test_classification.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 5264d5c679c8e..dd1368cf6d9f8 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -553,6 +553,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = pred_and_true / pred_or_true score = _weighted_sum(score, sample_weight, normalize=True) return score + elif average == 'samples': + raise ValueError("Sample-based jaccard similarity score is " + "not meaningful outside multilabel " + "classification. See the accuracy_score instead.") else: C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) if average == 'macro': diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a95382a9b1d96..c4e1556f36625 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -959,6 +959,10 @@ def test_jaccard_similarity_score(): "another average setting.") assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, y_pred, average='binary') + msg3 = ("Sample-based jaccard similarity score is not meaningful outside " + "multilabel classification. See the accuracy_score instead.") + assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, + y_pred, average='samples') assert_warns_message(UserWarning, "Note that pos_label (set to 3) is ignored when " From 8d0ca206a3664978b0ba5b71d0f9e15aed9b7c97 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 20 Dec 2017 23:08:50 +0530 Subject: [PATCH 22/88] add none-samples in common test --- sklearn/metrics/classification.py | 2 +- sklearn/metrics/tests/test_common.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index dd1368cf6d9f8..73c6203e9d947 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,7 @@ class labels [2]_. 
def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average=None, warn=True, normalize=True, + average=None, warn=True, normalize=None, sample_weight=None): """Jaccard similarity coefficient score diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 08fc4429533dc..83cffc2e4fca0 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -155,6 +155,9 @@ partial(jaccard_similarity_score, average="samples"), "cohen_kappa_score": cohen_kappa_score, + + "none-samples_jaccard_similarity_score": + partial(jaccard_similarity_score, average='none-samples') } THRESHOLDED_METRICS = { @@ -209,6 +212,7 @@ "samples_recall_score", "coverage_error", "samples_jaccard_similarity_score", + "none-samples_jaccard_similarity_score", "average_precision_score", "weighted_average_precision_score", @@ -232,6 +236,7 @@ "samples_roc_auc", "samples_jaccard_similarity_score", + "none-samples_jaccard_similarity_score", # with default average='binary', multiclass is prohibited "precision_score", @@ -305,6 +310,7 @@ METRICS_WITH_NORMALIZE_OPTION = [ "accuracy_score", "jaccard_similarity_score", + "samples_jaccard_similarity_score", "zero_one_loss", ] @@ -344,7 +350,8 @@ "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", - "samples_jaccard_similarity_score" + "samples_jaccard_similarity_score", + "none-samples_jaccard_similarity_score" ] # Regression metrics with "multioutput-continuous" format support From ce89b5f23a91d02e352b5bbb39bbf6db1bd06d6d Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 28 Dec 2017 18:01:45 +0530 Subject: [PATCH 23/88] add support for `labels` in multilabel classification --- sklearn/metrics/classification.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 73c6203e9d947..62696561ac366 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -530,9 +530,24 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, normalize = True # default average in multilabel is 'samples' + if not np.all(labels == present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError('All labels must be in [0, n, labels). ' + 'Got %d > %d' % + (np.max(labels), np.max(present_labels))) + if np.min(labels) < 0: + raise ValueError('All labels must be in [0, n, labels). 
' + 'Got %d < 0' % np.min(labels)) + + # wait for response on 'prf-bug' PR, since I'm less than 90% sure + if n_labels is not None: + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + with np.errstate(divide='ignore', invalid='ignore'): sum_axis = 1 if average == 'samples' else 0 + pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis) pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=sum_axis) From 192bb2dee47536341a0e56aec5d3b2eec90001b3 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 30 Dec 2017 13:44:36 +0530 Subject: [PATCH 24/88] fix multilablel classification labels, sample_weight seems to be working fine, though haven't fully testing them again, will do in next commit --- sklearn/metrics/classification.py | 71 +++++++++++++++----- sklearn/metrics/tests/test_classification.py | 22 +++++- 2 files changed, 74 insertions(+), 19 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 62696561ac366..8739f5a02b9e4 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -524,11 +524,21 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): - if average is None: - average = 'samples' - if average == 'samples' and normalize is None: - normalize = True - # default average in multilabel is 'samples' + if average in (None, 'samples'): + if normalize is None: + # default is average='samples' + average = 'samples' + else: + if normalize: + average = 'samples' + else: + average = 'none-samples' + warn_message = ("'normalize' was removed in version 0.20 and " + "will be removed in 0.22, instead use " + "`average='%s'`." % normalize) + warnings.warn(warn_message, DeprecationWarning) + # else: + # otherwise what should we do? raise warning or ValueError? if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): @@ -545,28 +555,53 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, y_pred = y_pred[:, labels[:n_labels]] with np.errstate(divide='ignore', invalid='ignore'): - sum_axis = 1 if average == 'samples' else 0 - - - pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=sum_axis) if average == 'samples': + pred_or_true = count_nonzero(y_true + y_pred, axis=1) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=1) + score = pred_and_true / pred_or_true + score[pred_or_true == 0.0] == 1.0 + return _weighted_sum(score, sample_weight, normalize=True) + elif average == 'none-samples': + pred_or_true = count_nonzero(y_true + y_pred, axis=1) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=1) score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] = 1.0 - return _weighted_sum(score, sample_weight, normalize=normalize) + score[pred_or_true == 0.0] == 1.0 + return _weighted_sum(score, sample_weight, normalize=False) + elif average == 'micro': + pred_or_true = count_nonzero(y_true + y_pred, axis=1, + sample_weight=sample_weight) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=1, + sample_weight=sample_weight) + if np.sum(pred_or_true): + score = np.sum(pred_and_true) / np.sum(pred_or_true) + else: + score = 1. 
+ return score elif average == 'macro': + pred_or_true = count_nonzero(y_true + y_pred, axis=0, + sample_weight=sample_weight) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=0, + sample_weight=sample_weight) score = pred_and_true / pred_or_true + score[pred_or_true == 0.0] == 1.0 n_features = y_true.shape[1] return np.sum(score) / n_features - elif average == 'micro': - score = np.sum(pred_and_true) / np.sum(pred_or_true) - return score else: - # average='weighted' + pred_or_true = count_nonzero(y_true + y_pred, axis=0, + sample_weight=sample_weight) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=0, + sample_weight=sample_weight) score = pred_and_true / pred_or_true - score = _weighted_sum(score, sample_weight, normalize=True) + score[pred_or_true == 0.0] == 1.0 + weights = y_true.toarray().sum(axis=0) + score = _weighted_sum(score, sample_weight=weights, + normalize=True) return score elif average == 'samples': raise ValueError("Sample-based jaccard similarity score is " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index c4e1556f36625..df78094fc1148 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1000,10 +1000,30 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 12) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, sample_weight=np.array([0.1, 0.9])), 31. / 60) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='samples', + labels=[0, 2]), 1. / 2) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='samples', + labels=[1, 2]), 1. / 2) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='samples', + sample_weight=[1, 2]), 5. / 9) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + sample_weight=[1, 2]), 4. / 7) + y_true = np.array([[0, 1, 1], [1, 0, 1]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro'), 5. / 6) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro', + sample_weight=[1, 2]), 8. / 9) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='weighted'), 7. / 8) # average='binary' (wrong example) # assert_almost_equal(jaccard_similarity_score(y_true, y_pred, # average='binary', pos_label=1), 1. 
/ 2) - # average='weighted' def test_multiclass_jaccard_similarity_score(): From 149af2aeed74b91520dfd7954fb05719df806a22 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 31 Dec 2017 15:47:11 +0530 Subject: [PATCH 25/88] fix for multiclass --- sklearn/metrics/classification.py | 58 ++++++++++++++------ sklearn/metrics/tests/test_classification.py | 4 +- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 8739f5a02b9e4..9035f2f79c37f 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -592,6 +592,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, n_features = y_true.shape[1] return np.sum(score) / n_features else: + # average = 'weighted' pred_or_true = count_nonzero(y_true + y_pred, axis=0, sample_weight=sample_weight) pred_and_true = count_nonzero(y_true.multiply(y_pred), @@ -608,26 +609,47 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "not meaningful outside multilabel " "classification. See the accuracy_score instead.") else: - C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + tp = y_true == y_pred + tp_bins = y_true[tp] + if sample_weight is not None: + tp_bins_weights = np.asarray(sample_weight)[tp] + else: + tp_bins_weights = None + + if len(tp_bins): + tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, + minlength=len(labels)) + else: + true_sum = pred_sum = tp_sum = np.zeros(len(labels)) + if len(y_pred): + pred_sum = np.bincount(y_pred, weights=sample_weight, + minlength=len(labels)) + if len(y_true): + true_sum = np.bincount(y_true, weights=sample_weight, + minlength=len(labels)) + + indices = np.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + true_sum = true_sum[indices] + pred_sum = pred_sum[indices] + den = true_sum + pred_sum - tp_sum + if average == 'macro': - den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal() / den + score = tp_sum / den return np.average(score) - elif average == 'micro': - # micro-average on all labels is not useful in the - # multiclass case. It is identical to accuracy. 
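# ---------------------------------------------------------------------------
# Editorial sketch (not a line of this patch series; numpy only): the bincount
# bookkeeping introduced in this hunk, written out standalone. It reproduces
# the ant/bird/cat example used elsewhere in the series, with labels encoded
# as ant=0, bird=1, cat=2: per-class Jaccard is tp / (true + pred - tp), the
# macro mean is 7/18 and the support-weighted mean is 0.4722....
import numpy as np

y_true = np.array([2, 0, 2, 2, 0, 1])   # ['cat', 'ant', 'cat', 'cat', 'ant', 'bird']
y_pred = np.array([0, 0, 2, 2, 0, 2])   # ['ant', 'ant', 'cat', 'cat', 'ant', 'cat']
n_labels = 3

tp_sum = np.bincount(y_true[y_true == y_pred], minlength=n_labels)  # per-class true positives
true_sum = np.bincount(y_true, minlength=n_labels)                  # per-class support in y_true
pred_sum = np.bincount(y_pred, minlength=n_labels)                  # per-class support in y_pred

per_class = tp_sum / (true_sum + pred_sum - tp_sum)
print(per_class)                                   # approximately [0.666..., 0.0, 0.5]
print(per_class.mean())                            # macro average: 7/18 = 0.388...
print(np.average(per_class, weights=true_sum))     # weighted average: 0.4722...
# ---------------------------------------------------------------------------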
- score = y_true == y_pred - return _weighted_sum(score, sample_weight, normalize) - elif average == 'weighted': - # computation similar to average='macro', apart from computation - # of sample_weight below - den = C.sum(0) + C.sum(1) - C.diagonal() - score = C.diagonal() / den - if sample_weight is None: - _, y_true = np.unique(y_true, return_inverse=True) - num = np.bincount(y_true) - sample_weight = num / np.sum(num) - return np.sum(sample_weight*score) + if average == 'weighted': + pass + + if average == 'micro': + tp_sum = tp_sum.sum() + score = tp_sum / den + return score else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index df78094fc1148..4cf9abbf66ed4 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1027,8 +1027,8 @@ def test_multilabel_jaccard_similarity_score(): def test_multiclass_jaccard_similarity_score(): - y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] - y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] + y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] + y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='macro'), 7. / 18) assert_equal(jaccard_similarity_score(y_true, y_pred, From 40fca72ceaad020e92e5cd9cda3a695e8fd7ce30 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Mon, 1 Jan 2018 19:00:49 +0530 Subject: [PATCH 26/88] corrected 'macro', 'weighted' for multiclass only 'micro' remains --- sklearn/metrics/classification.py | 14 ++++++++----- sklearn/metrics/tests/test_classification.py | 22 ++++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 9035f2f79c37f..42131331a7000 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -433,7 +433,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, warn : bool, optional (default=True), for internal use This determines whether warning will be raised or not, - normalize: bool, optional (defaul=True) + normalize : bool, optional (defaul=True) If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard similarity coefficient. 
This is only to be specified in case @@ -615,6 +615,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, y_pred = le.transform(y_pred) sorted_labels = le.classes_ + # labels are now from 0 to len(labels) - 1 tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: @@ -635,18 +636,21 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, minlength=len(labels)) indices = np.searchsorted(sorted_labels, labels[:n_labels]) - tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] - den = true_sum + pred_sum - tp_sum + tp_sum = tp_sum[indices] if average == 'macro': + den = true_sum + pred_sum - tp_sum score = tp_sum / den return np.average(score) if average == 'weighted': - pass - + den = true_sum + pred_sum - tp_sum + score = tp_sum / den + return _weighted_sum(score, sample_weight=true_sum, normalize=True) + # wrong logic for 'micro' if average == 'micro': + den = (true_sum + pred_sum - tp_sum).sum() tp_sum = tp_sum.sum() score = tp_sum / den return score diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 4cf9abbf66ed4..7f8874317f083 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1027,14 +1027,24 @@ def test_multilabel_jaccard_similarity_score(): def test_multiclass_jaccard_similarity_score(): - y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] - y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] + y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat', 'bird', 'bird'] + y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird', 'bird', 'cat'] assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro'), 7. / 18) - assert_equal(jaccard_similarity_score(y_true, y_pred, - average='micro'), 1. / 2) + average='macro'), 7. / 15) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='weighted'), 17. / 36) + average='macro', + labels=['ant', 'bird']), + 1. / 2) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro', + labels=['ant', 'cat']), + 8. / 15) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='macro', + labels=['cat', 'bird']), + 11. / 30) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='weighted'), 29. 
/ 60) msg = ("In multiclass classification average must be one of " "('micro', 'macro', 'weighted'), got average=None.") assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, From 4b504472d3fb49849766f1b4c333d250c2c3dcf1 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 2 Jan 2018 12:47:09 +0530 Subject: [PATCH 27/88] fix completely logic of average='micro', now only 'binary' remains --- sklearn/metrics/classification.py | 23 +++++++++++------ sklearn/metrics/tests/test_classification.py | 27 +++++++++++++++++--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 42131331a7000..0e13536867fe3 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -636,24 +636,31 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, minlength=len(labels)) indices = np.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + + if average == 'micro': + tp_sum = tp_sum.sum() + labels = le.transform(labels[:n_labels]) + union_indices = np.where(np.isin(y_true, labels) + + np.isin(y_pred, labels) == True)[0] + if sample_weight is not None: + den = sample_weight[union_indices].sum() + else: + den = len(union_indices) + score = tp_sum / den + return score + true_sum = true_sum[indices] pred_sum = pred_sum[indices] - tp_sum = tp_sum[indices] if average == 'macro': den = true_sum + pred_sum - tp_sum score = tp_sum / den return np.average(score) - if average == 'weighted': + elif average == 'weighted': den = true_sum + pred_sum - tp_sum score = tp_sum / den return _weighted_sum(score, sample_weight=true_sum, normalize=True) - # wrong logic for 'micro' - if average == 'micro': - den = (true_sum + pred_sum - tp_sum).sum() - tp_sum = tp_sum.sum() - score = tp_sum / den - return score else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 7f8874317f083..172c8b4954a41 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1021,9 +1021,6 @@ def test_multilabel_jaccard_similarity_score(): sample_weight=[1, 2]), 8. / 9) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 7. / 8) - # average='binary' (wrong example) -# assert_almost_equal(jaccard_similarity_score(y_true, y_pred, -# average='binary', pos_label=1), 1. / 2) def test_multiclass_jaccard_similarity_score(): @@ -1045,6 +1042,30 @@ def test_multiclass_jaccard_similarity_score(): 11. / 30) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 29. / 60) + + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['ant', 'cat']), + 4. / 7) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['cat']), 2. / 5) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['ant']), 2. / 3) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['bird']), 1. / 3) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['ant', 'bird']), + 1. / 2) + weight = np.array([1, 2, 1, 1, 2, 1, 2, 3]) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='micro', + labels=['ant', 'bird'], + sample_weight=weight), + 6. 
/ 11) msg = ("In multiclass classification average must be one of " "('micro', 'macro', 'weighted'), got average=None.") assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, From fd099e5f6f7594761eadcce70d3defdad35a3378 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 2 Jan 2018 12:48:23 +0530 Subject: [PATCH 28/88] remove 'warn' from API, after discussion on PR with jnothman --- sklearn/metrics/classification.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 0e13536867fe3..4e09d2d76e2cb 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,8 +373,7 @@ class labels [2]_. def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average=None, warn=True, normalize=None, - sample_weight=None): + average=None, normalize=None, sample_weight=None): """Jaccard similarity coefficient score The Jaccard index [1], or Jaccard similarity coefficient, defined as @@ -430,9 +429,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, meaningful for multilabel classification). ``'none-samples'``: - warn : bool, optional (default=True), for internal use - This determines whether warning will be raised or not, - normalize : bool, optional (defaul=True) If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard From 8c9c6145e15f82787ff706f7256e70580ecd47a8 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 2 Jan 2018 15:35:01 +0530 Subject: [PATCH 29/88] fix average='binary' --- sklearn/metrics/classification.py | 2 +- sklearn/metrics/tests/test_classification.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 4e09d2d76e2cb..84f95673d71d7 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -634,7 +634,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] - if average == 'micro': + if average == 'micro' or average == 'binary': tp_sum = tp_sum.sum() labels = le.transform(labels[:n_labels]) union_indices = np.where(np.isin(y_true, labels) + diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 172c8b4954a41..843164079c054 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1070,6 +1070,13 @@ def test_multiclass_jaccard_similarity_score(): "('micro', 'macro', 'weighted'), got average=None.") assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, y_pred) + y_true = np.array([1, 0, 1, 1, 0]) + y_pred = np.array([1, 0, 1, 1, 1]) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='binary'), 3. / 4) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='binary', + pos_label=0), 1. 
/ 2) @ignore_warnings From a7d3b4057ffc46491c128c1155e0fec7a68ee713 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 2 Jan 2018 15:45:38 +0530 Subject: [PATCH 30/88] fix doctest, now test_common and lgtm remain to be fixed --- sklearn/metrics/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 84f95673d71d7..bdac1b23d98a8 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -474,7 +474,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 >>> jaccard_similarity_score(y_true, y_pred, average='micro') - ... # doctest: +ELLIPSIS + 0.5 >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] From 8a7e67389d48fdf657c7e66fddcb0f4fa96da970 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 3 Jan 2018 00:41:09 +0530 Subject: [PATCH 31/88] this fixes lgtm? --- sklearn/metrics/classification.py | 47 ++++++++++++++----------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index bdac1b23d98a8..bcc67d8c2a397 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -609,12 +609,17 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) - sorted_labels = le.classes_ - # labels are now from 0 to len(labels) - 1 + labels = le.transform(labels)[:n_labels] + indices = np.where(np.isin(y_true, labels) + + np.isin(y_pred, labels) == True)[0] + + y_true = y_true[indices] + y_pred = y_pred[indices] tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: + sample_weight = sample_weight[indices] tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None @@ -623,7 +628,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: + # pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) + if len(y_pred): pred_sum = np.bincount(y_pred, weights=sample_weight, minlength=len(labels)) @@ -631,36 +638,24 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels)) - indices = np.searchsorted(sorted_labels, labels[:n_labels]) - tp_sum = tp_sum[indices] - if average == 'micro' or average == 'binary': - tp_sum = tp_sum.sum() - labels = le.transform(labels[:n_labels]) - union_indices = np.where(np.isin(y_true, labels) + - np.isin(y_pred, labels) == True)[0] - if sample_weight is not None: - den = sample_weight[union_indices].sum() - else: - den = len(union_indices) - score = tp_sum / den - return score - - true_sum = true_sum[indices] - pred_sum = pred_sum[indices] - - if average == 'macro': - den = true_sum + pred_sum - tp_sum - score = tp_sum / den - return np.average(score) + num = np.array([tp_sum.sum()]) + den = np.array([true_sum.sum()]) + weights = None + elif average == 'macro': + num = tp_sum[labels] + den = true_sum[labels] + pred_sum[labels] - tp_sum[labels] + weights = None elif average == 'weighted': - den = true_sum + pred_sum - tp_sum - score = tp_sum / den - return _weighted_sum(score, sample_weight=true_sum, normalize=True) + num = tp_sum[labels] + den = 
true_sum[labels] + pred_sum[labels] - tp_sum[labels] + weights = true_sum[labels] else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " "average=%s." % average) + score = num / den + return np.average(score, weights=weights) def matthews_corrcoef(y_true, y_pred, sample_weight=None): From 6e75c5a277bb68cbdb45ee289176b8325f592c6f Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 7 Jan 2018 20:40:57 +0530 Subject: [PATCH 32/88] fix average='micro' for multiclass jaccard --- sklearn/metrics/classification.py | 21 ++++++++++---------- sklearn/metrics/tests/test_classification.py | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index bcc67d8c2a397..48217e10ce766 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -474,7 +474,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 >>> jaccard_similarity_score(y_true, y_pred, average='micro') - 0.5 + ... # doctest: +ELLIPSIS + 0.333... >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] @@ -626,30 +627,30 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if len(tp_bins): tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels)) + minlength=len(labels))[labels] else: # pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(labels)) + minlength=len(labels))[labels] if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(labels)) + minlength=len(labels))[labels] if average == 'micro' or average == 'binary': num = np.array([tp_sum.sum()]) - den = np.array([true_sum.sum()]) + den = np.array([true_sum.sum() + pred_sum.sum() - tp_sum.sum()]) weights = None elif average == 'macro': - num = tp_sum[labels] - den = true_sum[labels] + pred_sum[labels] - tp_sum[labels] + num = tp_sum + den = true_sum + pred_sum - tp_sum weights = None elif average == 'weighted': - num = tp_sum[labels] - den = true_sum[labels] + pred_sum[labels] - tp_sum[labels] - weights = true_sum[labels] + num = tp_sum + den = true_sum + pred_sum - tp_sum + weights = true_sum else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 843164079c054..90a216c2df9d2 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1046,7 +1046,7 @@ def test_multiclass_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='micro', labels=['ant', 'cat']), - 4. / 7) + 1. / 2) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='micro', labels=['cat']), 2. 
/ 5) From c80059839fe2ee83898640f5ea974dd70d1d3728 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sun, 7 Jan 2018 21:36:12 +0530 Subject: [PATCH 33/88] add smart tests --- sklearn/metrics/tests/test_classification.py | 54 +++++++------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 90a216c2df9d2..9c5e719e7d3e5 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -10,7 +10,7 @@ from sklearn import svm from sklearn.datasets import make_multilabel_classification -from sklearn.preprocessing import label_binarize +from sklearn.preprocessing import label_binarize, LabelBinarizer from sklearn.utils.validation import check_random_state from sklearn.utils.testing import assert_raises, clean_warning_registry @@ -1026,40 +1026,24 @@ def test_multilabel_jaccard_similarity_score(): def test_multiclass_jaccard_similarity_score(): y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat', 'bird', 'bird'] y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird', 'bird', 'cat'] - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro'), 7. / 15) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro', - labels=['ant', 'bird']), - 1. / 2) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro', - labels=['ant', 'cat']), - 8. / 15) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro', - labels=['cat', 'bird']), - 11. / 30) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='weighted'), 29. / 60) - - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['ant', 'cat']), - 1. / 2) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['cat']), 2. / 5) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['ant']), 2. / 3) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['bird']), 1. / 3) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['ant', 'bird']), - 1. 
/ 2) + labels = ['ant', 'bird', 'cat'] + lb = LabelBinarizer() + lb.fit(labels) + y_true_bin = lb.transform(y_true) + y_pred_bin = lb.transform(y_pred) + multi_jaccard_similarity_score = partial(jaccard_similarity_score, y_true, + y_pred) + bin_jaccard_similarity_score = partial(jaccard_similarity_score, y_true_bin + , y_pred_bin) + multi_labels_list = [['ant', 'bird'], ['ant', 'cat'], ['cat', 'bird'], + ['ant'], ['bird'], ['cat'], None] + bin_labels_list = [[0, 1], [0, 2], [2, 1], [0], [1], [2], None] + for average in ('macro', 'weighted', 'micro'): + for m_label, b_label in zip(multi_labels_list, bin_labels_list): + assert_almost_equal(multi_jaccard_similarity_score(average=average, + labels=m_label), + bin_jaccard_similarity_score(average=average, + labels=b_label)) weight = np.array([1, 2, 1, 1, 2, 1, 2, 3]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='micro', From c3fa41d195d96a80d9bfc196409250a2c214e21e Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Mon, 8 Jan 2018 16:16:12 +0530 Subject: [PATCH 34/88] first fix for test_common --- sklearn/metrics/tests/test_common.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 83cffc2e4fca0..ff03ac4ce0cbd 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -224,6 +224,11 @@ "label_ranking_average_precision_score", ] +# Those metrics don't support average=None for multiclass input +NONE_AVERAGE_UNDEFINED_MULTICLASS = [ + "jaccard_similarity_score" +] + # Those metrics don't support multiclass inputs METRIC_UNDEFINED_MULTICLASS = [ "brier_score_loss", @@ -1142,9 +1147,14 @@ def test_no_averaging_labels(): [y_true_multilabel, y_pred_multilabel]]: if name not in MULTILABELS_METRICS and y_pred.ndim > 1: continue + if name in NONE_AVERAGE_UNDEFINED_MULTICLASS and y_pred.ndim < 2: + continue metric = ALL_METRICS[name] score_labels = metric(y_true, y_pred, labels=labels, average=None) score = metric(y_true, y_pred, average=None) - assert_array_equal(score_labels, score[inverse_labels]) + if isinstance(score, np.ndarray): + assert_array_equal(score_labels, score[inverse_labels]) + else: + assert_almost_equal(score_labels, score) From 27ffebf1e3f8113efc9a3bad9a4005e0c2839afd Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Mon, 8 Jan 2018 17:31:58 +0530 Subject: [PATCH 35/88] fixes LGTM errors? --- sklearn/metrics/classification.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 48217e10ce766..4663e9e705b92 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -640,22 +640,19 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, minlength=len(labels))[labels] if average == 'micro' or average == 'binary': - num = np.array([tp_sum.sum()]) - den = np.array([true_sum.sum() + pred_sum.sum() - tp_sum.sum()]) + tp_sum = np.array([tp_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) weights = None elif average == 'macro': - num = tp_sum - den = true_sum + pred_sum - tp_sum weights = None elif average == 'weighted': - num = tp_sum - den = true_sum + pred_sum - tp_sum weights = true_sum else: raise ValueError("In multiclass classification average must be " "one of ('micro', 'macro', 'weighted'), got " "average=%s." 
% average) - score = num / den + score = tp_sum / (true_sum + pred_sum - tp_sum) return np.average(score, weights=weights) From e017ccfe2e104f27e310ac3446f381130014b213 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 9 Jan 2018 11:44:53 +0530 Subject: [PATCH 36/88] remove warning from tests/test_classification.py --- sklearn/metrics/tests/test_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 9c5e719e7d3e5..845818b4e52a7 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -939,7 +939,6 @@ def test_multilabel_hamming_loss(): assert_warns(DeprecationWarning, hamming_loss, y1, y2, classes=[0, 1]) -@ignore_warnings def test_jaccard_similarity_score(): y_true = np.array([0, 0]) y_pred = np.array([0, 0]) From 9ee4c11831334d705868a97e5932a8840ae12727 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 10 Jan 2018 18:51:10 +0530 Subject: [PATCH 37/88] simplify code for multilabel jaccard --- sklearn/metrics/classification.py | 94 +++++++++----------- sklearn/metrics/tests/test_classification.py | 6 +- sklearn/metrics/tests/test_common.py | 12 +-- 3 files changed, 45 insertions(+), 67 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 4663e9e705b92..2e9f5391a4569 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -482,6 +482,13 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS 0.4722... + >>> jaccard_similarity_score(y_true, y_pred) + ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + array([ 0.66..., 0. , 0.5 ]) + >>> jaccard_similarity_score(y_true, y_pred, + ... labels=['ant', 'cat', 'bird']) + ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE + array([ 0.66..., 0.5 , 0. ]) """ average_options = (None, 'micro', 'macro', 'weighted', 'samples', @@ -546,61 +553,44 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, raise ValueError('All labels must be in [0, n, labels). ' 'Got %d < 0' % np.min(labels)) - # wait for response on 'prf-bug' PR, since I'm less than 90% sure if n_labels is not None: y_true = y_true[:, labels[:n_labels]] y_pred = y_pred[:, labels[:n_labels]] with np.errstate(divide='ignore', invalid='ignore'): - if average == 'samples': - pred_or_true = count_nonzero(y_true + y_pred, axis=1) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=1) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] == 1.0 - return _weighted_sum(score, sample_weight, normalize=True) - elif average == 'none-samples': - pred_or_true = count_nonzero(y_true + y_pred, axis=1) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=1) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] == 1.0 - return _weighted_sum(score, sample_weight, normalize=False) + if average == 'samples' or average == 'none-samples': + sum_axis = 1 + class_weight = sample_weight + weights = None elif average == 'micro': - pred_or_true = count_nonzero(y_true + y_pred, axis=1, - sample_weight=sample_weight) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=1, - sample_weight=sample_weight) - if np.sum(pred_or_true): - score = np.sum(pred_and_true) / np.sum(pred_or_true) - else: - score = 1. 
- return score + sum_axis = 1 + class_weight = None + weights = sample_weight elif average == 'macro': - pred_or_true = count_nonzero(y_true + y_pred, axis=0, - sample_weight=sample_weight) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=0, - sample_weight=sample_weight) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] == 1.0 - n_features = y_true.shape[1] - return np.sum(score) / n_features - else: - # average = 'weighted' - pred_or_true = count_nonzero(y_true + y_pred, axis=0, - sample_weight=sample_weight) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=0, - sample_weight=sample_weight) - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] == 1.0 - weights = y_true.toarray().sum(axis=0) - score = _weighted_sum(score, sample_weight=weights, - normalize=True) - return score + sum_axis = 0 + class_weight = None + weights = sample_weight + elif average == 'weighted': + sum_axis = 0 + class_weight = y_true.toarray().sum(axis=0) + weights = sample_weight + + pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis, + sample_weight=weights) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=sum_axis, + sample_weight=weights) + if average == 'micro': + pred_or_true = np.array([pred_or_true.sum()]) + pred_and_true = np.array([pred_and_true.sum()]) + + score = pred_and_true / pred_or_true + score[pred_or_true == 0.0] == 1.0 + + if average != 'none-samples': + score = np.average(score, weights=class_weight) + return score elif average == 'samples': raise ValueError("Sample-based jaccard similarity score is " "not meaningful outside multilabel " @@ -648,12 +638,12 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, weights = None elif average == 'weighted': weights = true_sum - else: - raise ValueError("In multiclass classification average must be " - "one of ('micro', 'macro', 'weighted'), got " - "average=%s." % average) + score = tp_sum / (true_sum + pred_sum - tp_sum) - return np.average(score, weights=weights) + + if average is not None: + score = np.average(score, weights=weights) + return score def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 845818b4e52a7..3ecfaa3941533 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1049,10 +1049,8 @@ def test_multiclass_jaccard_similarity_score(): labels=['ant', 'bird'], sample_weight=weight), 6. / 11) - msg = ("In multiclass classification average must be one of " - "('micro', 'macro', 'weighted'), got average=None.") - assert_raise_message(ValueError, msg, jaccard_similarity_score, y_true, - y_pred) + assert_array_equal(jaccard_similarity_score(y_true, y_pred), + np.array([2. / 3, 1. / 3, 2. 
/ 5])) y_true = np.array([1, 0, 1, 1, 0]) y_pred = np.array([1, 0, 1, 1, 1]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index ff03ac4ce0cbd..83cffc2e4fca0 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -224,11 +224,6 @@ "label_ranking_average_precision_score", ] -# Those metrics don't support average=None for multiclass input -NONE_AVERAGE_UNDEFINED_MULTICLASS = [ - "jaccard_similarity_score" -] - # Those metrics don't support multiclass inputs METRIC_UNDEFINED_MULTICLASS = [ "brier_score_loss", @@ -1147,14 +1142,9 @@ def test_no_averaging_labels(): [y_true_multilabel, y_pred_multilabel]]: if name not in MULTILABELS_METRICS and y_pred.ndim > 1: continue - if name in NONE_AVERAGE_UNDEFINED_MULTICLASS and y_pred.ndim < 2: - continue metric = ALL_METRICS[name] score_labels = metric(y_true, y_pred, labels=labels, average=None) score = metric(y_true, y_pred, average=None) - if isinstance(score, np.ndarray): - assert_array_equal(score_labels, score[inverse_labels]) - else: - assert_almost_equal(score_labels, score) + assert_array_equal(score_labels, score[inverse_labels]) From d1311c7ad6e9d8143705234afc6e9f258f7b46f9 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 11 Jan 2018 15:35:14 +0530 Subject: [PATCH 38/88] address Joel's comments --- sklearn/metrics/classification.py | 52 +++++++++++--------- sklearn/metrics/tests/test_classification.py | 17 ++++++- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 2e9f5391a4569..d74d6ac9dc5fc 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,8 @@ class labels [2]_. def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average=None, normalize=None, sample_weight=None): + average='samples', normalize=None, + sample_weight=None): """Jaccard similarity coefficient score The Jaccard index [1], or Jaccard similarity coefficient, defined as @@ -406,7 +407,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + average : string, ['samples' (default), 'binary', 'micro', 'macro', None, \ 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -428,16 +429,20 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). ``'none-samples'``: + Calculate metrics for each instance, (only meaningful for + multilabel classification). This differs from 'samples' to return + array of class-wise jaccard index, instead of normalzing the array. - normalize : bool, optional (defaul=True) + normalize : None, bool, optional (defaul=True) If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard similarity coefficient. This is only to be specified in case `average='samples'`. .. 
versionchanged: 0.20 - 'normalize' is deprecated and will be removed in 0.22, instead use - `average='none-samples'` + 'normalize' is deprecated and will be removed in 0.22, instead of + `normalize=True` use instead just `average='samples'` and for + `normalize=False` use instead `average='none-samples'`. sample_weight : array-like of shape = [n_samples], optional Sample weights. @@ -528,21 +533,18 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): - if average in (None, 'samples'): - if normalize is None: - # default is average='samples' - average = 'samples' - else: - if normalize: - average = 'samples' - else: + if normalize is not None: + if average == 'samples': + if not normalize: average = 'none-samples' - warn_message = ("'normalize' was removed in version 0.20 and " - "will be removed in 0.22, instead use " - "`average='%s'`." % normalize) - warnings.warn(warn_message, DeprecationWarning) - # else: - # otherwise what should we do? raise warning or ValueError? + else: + raise ValueError("normalize != None' is only meaningful with " + "`average='samples'`, got `average='%s'`." + % average) + warn_message = ("'normalize' was deprecated in version 0.20 and " + "will be removed in 0.22, instead use " + "`average='%s'`." % average) + warnings.warn(warn_message, DeprecationWarning) if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): @@ -575,6 +577,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, sum_axis = 0 class_weight = y_true.toarray().sum(axis=0) weights = sample_weight + else: + sum_axis = 0 + weights = sample_weight pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis, sample_weight=weights) @@ -588,10 +593,13 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = pred_and_true / pred_or_true score[pred_or_true == 0.0] == 1.0 - if average != 'none-samples': - score = np.average(score, weights=class_weight) + if average is not None: + if average == 'none-samples': + score = np.sum(score) + else: + score = np.average(score, weights=class_weight) return score - elif average == 'samples': + elif average == 'samples' or average == 'none-samples': raise ValueError("Sample-based jaccard similarity score is " "not meaningful outside multilabel " "classification. See the accuracy_score instead.") diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 3ecfaa3941533..fe389f3da81e6 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -951,6 +951,11 @@ def test_jaccard_similarity_score(): "Please choose another average setting.") assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, y_pred, average='binary', pos_label=-1) + assert_warns_message(DeprecationWarning, + "'normalize' was deprecated in version 0.20 and will " + "be removed in 0.22, instead use `average='samples'`." + , jaccard_similarity_score, y_true, y_pred, + average='samples', normalize=True) y_true = np.array([0, 1, 1, 0, 2]) y_pred = np.array([1, 1, 1, 1, 0]) @@ -962,6 +967,8 @@ def test_jaccard_similarity_score(): "multilabel classification. 
See the accuracy_score instead.") assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, average='samples') + assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, + y_pred, average='none-samples') assert_warns_message(UserWarning, "Note that pos_label (set to 3) is ignored when " @@ -1008,6 +1015,11 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='samples', sample_weight=[1, 2]), 5. / 9) + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='none-samples'), + 35. / 30) + assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), + np.array([1. / 2, 1., 1. / 2])) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='micro', sample_weight=[1, 2]), 4. / 7) @@ -1049,8 +1061,11 @@ def test_multiclass_jaccard_similarity_score(): labels=['ant', 'bird'], sample_weight=weight), 6. / 11) - assert_array_equal(jaccard_similarity_score(y_true, y_pred), + assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), np.array([2. / 3, 1. / 3, 2. / 5])) + + +def test_average_binary_jaccard_similarity_score(): y_true = np.array([1, 0, 1, 1, 0]) y_pred = np.array([1, 0, 1, 1, 1]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, From d3f76d5136bc53fa2278efd111cdaeabb2fab02d Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 11 Jan 2018 16:15:08 +0530 Subject: [PATCH 39/88] fix doc and add average=None test for multiclass now, need to fix common test --- doc/modules/model_evaluation.rst | 4 ++-- sklearn/metrics/tests/test_classification.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 784b505dddbae..c076c0081c6e6 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -682,8 +682,8 @@ score is equal to the classification accuracy. 0.33... >>> jaccard_similarity_score(y_true, y_pred, average='weighted') 0.5 - >>> jaccard_similarity_score(y_true, y_pred) - array([ 1., 0., 0., 1. 
]) + >>> jaccard_similarity_score(y_true, y_pred, average=None) + array([ 1., 0., 0., 1.]) In the multilabel case with binary label indicators: :: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index fe389f3da81e6..f4a3e2a1037d1 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1049,7 +1049,9 @@ def test_multiclass_jaccard_similarity_score(): multi_labels_list = [['ant', 'bird'], ['ant', 'cat'], ['cat', 'bird'], ['ant'], ['bird'], ['cat'], None] bin_labels_list = [[0, 1], [0, 2], [2, 1], [0], [1], [2], None] - for average in ('macro', 'weighted', 'micro'): + + # other than average='samples'/'none-samples', test everything else here + for average in ('macro', 'weighted', 'micro', None): for m_label, b_label in zip(multi_labels_list, bin_labels_list): assert_almost_equal(multi_jaccard_similarity_score(average=average, labels=m_label), From 319b5d3bcda45c96e33e5b72354fe045a6757c3b Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Fri, 12 Jan 2018 11:56:57 +0530 Subject: [PATCH 40/88] fix none-samples jaccard_similarity score to return array of scores --- sklearn/metrics/classification.py | 18 ++++++++++-------- sklearn/metrics/tests/test_classification.py | 4 ++-- sklearn/metrics/tests/test_common.py | 10 ++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index d74d6ac9dc5fc..543ec8561f6c1 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -431,13 +431,14 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, ``'none-samples'``: Calculate metrics for each instance, (only meaningful for multilabel classification). This differs from 'samples' to return - array of class-wise jaccard index, instead of normalzing the array. + an array of sample-wise jaccard index, instead of normalizing the + array. - normalize : None, bool, optional (defaul=True) - If ``False``, return the sum of the Jaccard similarity coefficient - over the sample set. Otherwise, return the average of Jaccard - similarity coefficient. This is only to be specified in case - `average='samples'`. + normalize : bool, optional (default=True) + If ``False``, return an array of Jaccard similarity coefficient for + each samples over the sample set. Otherwise, return the average of + Jaccard similarity coefficient. 'normalize' is only to be specified in + case `average='samples'`. .. versionchanged: 0.20 'normalize' is deprecated and will be removed in 0.22, instead of @@ -591,11 +592,12 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, pred_and_true = np.array([pred_and_true.sum()]) score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] == 1.0 + score[pred_or_true == 0.0] = 1.0 if average is not None: if average == 'none-samples': - score = np.sum(score) + if class_weight is not None: + score = score * class_weight else: score = np.average(score, weights=class_weight) return score diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index f4a3e2a1037d1..a28bb9cb7be04 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1015,9 +1015,9 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='samples', sample_weight=[1, 2]), 5. 
/ 9) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + assert_array_equal(jaccard_similarity_score(y_true, y_pred, average='none-samples'), - 35. / 30) + np.array([2. / 3, 1. / 2])) assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), np.array([1. / 2, 1., 1. / 2])) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 83cffc2e4fca0..6c829dbdaa678 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -475,10 +475,12 @@ def test_sample_order_invariance_multilabel_and_multioutput(): for name in MULTILABELS_METRICS: metric = ALL_METRICS[name] - assert_almost_equal(metric(y_true, y_pred), - metric(y_true_shuffle, y_pred_shuffle), - err_msg="%s is not sample order invariant" - % name) + if name != 'unnormalized_jaccard_similarity_score' and \ + name != 'none-samples_jaccard_similarity_score': + assert_almost_equal(metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" + % name) for name in THRESHOLDED_MULTILABEL_METRICS: metric = ALL_METRICS[name] From 07a05e6e575edd9fe20cd3bc3bb61c35ce8cad61 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 13 Jan 2018 13:34:39 +0530 Subject: [PATCH 41/88] take of care of zero weights for average='weighted' --- sklearn/metrics/classification.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 543ec8561f6c1..91fb857f13598 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -578,6 +578,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, sum_axis = 0 class_weight = y_true.toarray().sum(axis=0) weights = sample_weight + if class_weight.sum() == 0: + return 0 else: sum_axis = 0 weights = sample_weight @@ -620,7 +622,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: - sample_weight = sample_weight[indices] + sample_weight = np.array(sample_weight)[indices] tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None @@ -648,6 +650,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, weights = None elif average == 'weighted': weights = true_sum + if weights.sum() == 0: + return 0 score = tp_sum / (true_sum + pred_sum - tp_sum) From 3a312a3f8705ab84880b6414040547cdb91d6b89 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 13 Jan 2018 23:47:15 +0530 Subject: [PATCH 42/88] fix test_common --- sklearn/metrics/classification.py | 2 +- sklearn/metrics/tests/test_common.py | 83 +++++++++++++++++++--------- 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 91fb857f13598..33c373389bee7 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -511,7 +511,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if y_type == 'binary': if pos_label not in present_labels: if len(present_labels) < 2: - return 0. + return 1. 
else: raise ValueError("pos_label=%r is not a valid label: " "%r" % (pos_label, present_labels)) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6c829dbdaa678..a157e7c198265 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -151,13 +151,14 @@ "samples_f2_score": partial(fbeta_score, average="samples", beta=2), "samples_precision_score": partial(precision_score, average="samples"), "samples_recall_score": partial(recall_score, average="samples"), - "samples_jaccard_similarity_score": - partial(jaccard_similarity_score, average="samples"), "cohen_kappa_score": cohen_kappa_score, "none-samples_jaccard_similarity_score": - partial(jaccard_similarity_score, average='none-samples') + partial(jaccard_similarity_score, average='none-samples'), + + "binary_jaccard_similarity_score": + partial(jaccard_similarity_score, average="binary") } THRESHOLDED_METRICS = { @@ -211,8 +212,9 @@ "samples_precision_score", "samples_recall_score", "coverage_error", - "samples_jaccard_similarity_score", + "jaccard_similarity_score", "none-samples_jaccard_similarity_score", + "unnormalized_jaccard_similarity_score", "average_precision_score", "weighted_average_precision_score", @@ -235,8 +237,10 @@ "macro_roc_auc", "samples_roc_auc", - "samples_jaccard_similarity_score", + "jaccard_similarity_score", "none-samples_jaccard_similarity_score", + "unnormalized_jaccard_similarity_score", + "binary_jaccard_similarity_score", # with default average='binary', multiclass is prohibited "precision_score", @@ -253,7 +257,7 @@ # Metrics with an "average" argument METRICS_WITH_AVERAGING = [ "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", - "jaccard_similarity_score" + "binary_jaccard_similarity_score" ] # Threshold-based metrics with an "average" argument @@ -303,6 +307,11 @@ "macro_precision_score", "macro_recall_score", "macro_jaccard_similarity_score", + "none-samples_jaccard_similarity_score", + "unnormalized_jaccard_similarity_score", + + "binary_jaccard_similarity_score", + "cohen_kappa_score", ] @@ -310,7 +319,6 @@ METRICS_WITH_NORMALIZE_OPTION = [ "accuracy_score", "jaccard_similarity_score", - "samples_jaccard_similarity_score", "zero_one_loss", ] @@ -333,7 +341,8 @@ MULTILABELS_METRICS = [ "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", - "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", + "jaccard_similarity_score", "none-samples_jaccard_similarity_score", + "unnormalized_jaccard_similarity_score", "zero_one_loss", "unnormalized_zero_one_loss", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", @@ -350,8 +359,6 @@ "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", - "samples_jaccard_similarity_score", - "none-samples_jaccard_similarity_score" ] # Regression metrics with "multioutput-continuous" format support @@ -365,7 +372,8 @@ SYMMETRIC_METRICS = [ "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", - "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", + "jaccard_similarity_score", "none-samples_jaccard_similarity_score", + "unnormalized_jaccard_similarity_score", "zero_one_loss", "unnormalized_zero_one_loss", "micro_jaccard_similarity_score", "macro_jaccard_similarity_score", @@ -376,6 +384,8 @@ "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", + "binary_jaccard_similarity_score", + "matthews_corrcoef_score", 
"mean_absolute_error", "mean_squared_error", "median_absolute_error", @@ -393,7 +403,7 @@ "precision_score", "recall_score", "f2_score", "f0.5_score", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", - "weighted_precision_score", + "weighted_precision_score", "weighted_jaccard_similarity_score", "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "log_loss", "hinge_loss" @@ -418,6 +428,9 @@ def test_symmetry(): y_true = random_state.randint(0, 2, size=(20, )) y_pred = random_state.randint(0, 2, size=(20, )) + y_true_bin = random_state.randint(0, 2, size=(20, 25)) + y_pred_bin = random_state.randint(0, 2, size=(20, 25)) + # We shouldn't forget any metrics assert_equal(set(SYMMETRIC_METRICS).union( NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS, @@ -431,9 +444,15 @@ def test_symmetry(): # Symmetric metric for name in SYMMETRIC_METRICS: metric = ALL_METRICS[name] - assert_almost_equal(metric(y_true, y_pred), - metric(y_pred, y_true), - err_msg="%s is not symmetric" % name) + if (name in METRIC_UNDEFINED_BINARY and + name in METRIC_UNDEFINED_BINARY): + assert_almost_equal(metric(y_true_bin, y_pred_bin), + metric(y_pred_bin, y_true_bin), + err_msg="%s is not symmetric" % name) + else: + assert_almost_equal(metric(y_true, y_pred), + metric(y_pred, y_true), + err_msg="%s is not symmetric" % name) # Not symmetric metrics for name in NOT_SYMMETRIC_METRICS: @@ -799,6 +818,8 @@ def test_normalize_option_binary_classification(n_samples=20): y_pred = random_state.randint(0, 2, size=(n_samples, )) for name in METRICS_WITH_NORMALIZE_OPTION: + if name in METRIC_UNDEFINED_BINARY: + continue metrics = ALL_METRICS[name] measure = metrics(y_true, y_pred, normalize=True) assert_greater(measure, 0, @@ -815,6 +836,8 @@ def test_normalize_option_multiclass_classification(): n_samples = y_true.shape[0] for name in METRICS_WITH_NORMALIZE_OPTION: + if name in METRIC_UNDEFINED_MULTICLASS: + continue metrics = ALL_METRICS[name] measure = metrics(y_true, y_pred, normalize=True) assert_greater(measure, 0, @@ -850,8 +873,10 @@ def test_normalize_option_multilabel_classification(): measure = metrics(y_true, y_pred, normalize=True) assert_greater(measure, 0, msg="We failed to test correctly the normalize option") - assert_almost_equal(metrics(y_true, y_pred, normalize=False) - / n_samples, measure, + unnormalize_measure = metrics(y_true, y_pred, normalize=False) + if isinstance(unnormalize_measure, np.ndarray): + unnormalize_measure = np.sum(unnormalize_measure) + assert_almost_equal(unnormalize_measure / n_samples, measure, err_msg="Failed with %s" % name) @@ -991,19 +1016,25 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the weighted and unweighted scores are unequal weighted_score = metric(y1, y2, sample_weight=sample_weight) - assert_not_equal( - unweighted_score, weighted_score, - msg="Unweighted and weighted scores are unexpectedly " - "equal (%f) for %s" % (weighted_score, name)) + if isinstance(weighted_score, np.ndarray): + assert(not np.allclose(weighted_score, unweighted_score)) + else: + assert_not_equal( + unweighted_score, weighted_score, + msg="Unweighted and weighted scores are unexpectedly " + "equal (%r) for %s" % (weighted_score, name)) # check that sample_weight can be a list weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist()) - assert_almost_equal( - weighted_score, weighted_score_list, - err_msg=("Weighted scores for array and list " - "sample_weight input are not equal (%f != %f) for %s") % ( - 
weighted_score, weighted_score_list, name)) + if isinstance(weighted_score, np.ndarray): + assert(np.allclose(weighted_score, weighted_score_list)) + else: + assert_almost_equal( + weighted_score, weighted_score_list, + err_msg=("Weighted scores for array and list " + "sample_weight input are not equal (%f != %f) for %s") % ( + weighted_score, weighted_score_list, name)) # check that integer weights is the same as repeated samples repeat_weighted_score = metric( From 9251d29710a4df71a98f351d06f72f37b76fe303 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Sat, 13 Jan 2018 23:56:48 +0530 Subject: [PATCH 43/88] don't bother testing 'sample_weight' and fix test --- sklearn/metrics/tests/test_classification.py | 25 ++++---------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a28bb9cb7be04..0ac1ba3cb6558 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -943,7 +943,7 @@ def test_jaccard_similarity_score(): y_true = np.array([0, 0]) y_pred = np.array([0, 0]) assert_equal(jaccard_similarity_score(y_true, y_pred, average='binary', - pos_label=-1), 0.) + pos_label=-1), 1.) y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) @@ -1004,32 +1004,25 @@ def test_multilabel_jaccard_similarity_score(): average='micro'), 3. / 5) # average='samples' (default) assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 12) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - sample_weight=np.array([0.1, 0.9])), 31. / 60) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='samples', labels=[0, 2]), 1. / 2) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='samples', labels=[1, 2]), 1. / 2) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='samples', - sample_weight=[1, 2]), 5. / 9) + # average='none-samples' assert_array_equal(jaccard_similarity_score(y_true, y_pred, average='none-samples'), np.array([2. / 3, 1. / 2])) + # average=None assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), np.array([1. / 2, 1., 1. / 2])) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - sample_weight=[1, 2]), 4. / 7) + y_true = np.array([[0, 1, 1], [1, 0, 1]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='macro'), 5. / 6) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro', - sample_weight=[1, 2]), 8. / 9) + # average='weighted' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 7. / 8) @@ -1057,14 +1050,6 @@ def test_multiclass_jaccard_similarity_score(): labels=m_label), bin_jaccard_similarity_score(average=average, labels=b_label)) - weight = np.array([1, 2, 1, 1, 2, 1, 2, 3]) - assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro', - labels=['ant', 'bird'], - sample_weight=weight), - 6. / 11) - assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), - np.array([2. / 3, 1. / 3, 2. 
/ 5])) def test_average_binary_jaccard_similarity_score(): From 225c0f28c2978e911b37e6bdf1326097e4fed4b9 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 16 Jan 2018 14:12:20 +0530 Subject: [PATCH 44/88] remove average='none-samples' as a possibility --- sklearn/metrics/classification.py | 45 +++++++------------- sklearn/metrics/tests/test_classification.py | 18 +++----- sklearn/metrics/tests/test_common.py | 26 +++-------- 3 files changed, 29 insertions(+), 60 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 33c373389bee7..4ea184980676a 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -428,11 +428,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification). - ``'none-samples'``: - Calculate metrics for each instance, (only meaningful for - multilabel classification). This differs from 'samples' to return - an array of sample-wise jaccard index, instead of normalizing the - array. normalize : bool, optional (default=True) If ``False``, return an array of Jaccard similarity coefficient for @@ -440,11 +435,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Jaccard similarity coefficient. 'normalize' is only to be specified in case `average='samples'`. - .. versionchanged: 0.20 - 'normalize' is deprecated and will be removed in 0.22, instead of - `normalize=True` use instead just `average='samples'` and for - `normalize=False` use instead `average='none-samples'`. - sample_weight : array-like of shape = [n_samples], optional Sample weights. @@ -497,8 +487,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, array([ 0.66..., 0.5 , 0. ]) """ - average_options = (None, 'micro', 'macro', 'weighted', 'samples', - 'none-samples') + average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError("average has to be one of " + str(average_options)) @@ -534,18 +523,13 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): - if normalize is not None: - if average == 'samples': - if not normalize: - average = 'none-samples' - else: - raise ValueError("normalize != None' is only meaningful with " - "`average='samples'`, got `average='%s'`." - % average) - warn_message = ("'normalize' was deprecated in version 0.20 and " - "will be removed in 0.22, instead use " - "`average='%s'`." % average) - warnings.warn(warn_message, DeprecationWarning) + if average == 'samples': + if normalize is None: + normalize = True + elif normalize is not None: + raise ValueError("'normalize' is only meaningful with " + "`average='samples'`, got `average='%s'`." 
+ % average) if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): @@ -562,7 +546,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, with np.errstate(divide='ignore', invalid='ignore'): - if average == 'samples' or average == 'none-samples': + if average == 'samples': sum_axis = 1 class_weight = sample_weight weights = None @@ -597,13 +581,14 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score[pred_or_true == 0.0] = 1.0 if average is not None: - if average == 'none-samples': + if normalize == False: if class_weight is not None: - score = score * class_weight - else: - score = np.average(score, weights=class_weight) + score = np.dot(score, class_weight) + else: + score = score.sum() + score = np.average(score, weights=class_weight) return score - elif average == 'samples' or average == 'none-samples': + elif average == 'samples': raise ValueError("Sample-based jaccard similarity score is " "not meaningful outside multilabel " "classification. See the accuracy_score instead.") diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 0ac1ba3cb6558..003379e1df9a7 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -951,11 +951,6 @@ def test_jaccard_similarity_score(): "Please choose another average setting.") assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, y_pred, average='binary', pos_label=-1) - assert_warns_message(DeprecationWarning, - "'normalize' was deprecated in version 0.20 and will " - "be removed in 0.22, instead use `average='samples'`." - , jaccard_similarity_score, y_true, y_pred, - average='samples', normalize=True) y_true = np.array([0, 1, 1, 0, 2]) y_pred = np.array([1, 1, 1, 1, 0]) @@ -968,7 +963,7 @@ def test_jaccard_similarity_score(): assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, average='samples') assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, - y_pred, average='none-samples') + y_pred, average='samples', normalize=False) assert_warns_message(UserWarning, "Note that pos_label (set to 3) is ignored when " @@ -998,7 +993,7 @@ def test_multilabel_jaccard_similarity_score(): y_pred = np.array([[1, 1, 1], [1, 0, 1]]) # average='macro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='macro'), 2. / 3) + average='macro'), 2. / 3) # average='micro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='micro'), 3. / 5) @@ -1010,10 +1005,11 @@ def test_multilabel_jaccard_similarity_score(): assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='samples', labels=[1, 2]), 1. / 2) - # average='none-samples' - assert_array_equal(jaccard_similarity_score(y_true, y_pred, - average='none-samples'), - np.array([2. / 3, 1. / 2])) + # average='samples', normalize=False + assert_almost_equal(jaccard_similarity_score(y_true, y_pred, + average='samples', + normalize=False), + 7. / 6) # average=None assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), np.array([1. / 2, 1., 1. 
/ 2])) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index a157e7c198265..19113172b7559 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -154,9 +154,6 @@ "cohen_kappa_score": cohen_kappa_score, - "none-samples_jaccard_similarity_score": - partial(jaccard_similarity_score, average='none-samples'), - "binary_jaccard_similarity_score": partial(jaccard_similarity_score, average="binary") } @@ -213,7 +210,6 @@ "samples_recall_score", "coverage_error", "jaccard_similarity_score", - "none-samples_jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "average_precision_score", @@ -238,7 +234,6 @@ "samples_roc_auc", "jaccard_similarity_score", - "none-samples_jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "binary_jaccard_similarity_score", @@ -307,7 +302,6 @@ "macro_precision_score", "macro_recall_score", "macro_jaccard_similarity_score", - "none-samples_jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "binary_jaccard_similarity_score", @@ -341,8 +335,7 @@ MULTILABELS_METRICS = [ "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", - "jaccard_similarity_score", "none-samples_jaccard_similarity_score", - "unnormalized_jaccard_similarity_score", + "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "zero_one_loss", "unnormalized_zero_one_loss", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", @@ -372,8 +365,7 @@ SYMMETRIC_METRICS = [ "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", - "jaccard_similarity_score", "none-samples_jaccard_similarity_score", - "unnormalized_jaccard_similarity_score", + "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", "zero_one_loss", "unnormalized_zero_one_loss", "micro_jaccard_similarity_score", "macro_jaccard_similarity_score", @@ -494,11 +486,9 @@ def test_sample_order_invariance_multilabel_and_multioutput(): for name in MULTILABELS_METRICS: metric = ALL_METRICS[name] - if name != 'unnormalized_jaccard_similarity_score' and \ - name != 'none-samples_jaccard_similarity_score': - assert_almost_equal(metric(y_true, y_pred), - metric(y_true_shuffle, y_pred_shuffle), - err_msg="%s is not sample order invariant" + assert_almost_equal(metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name) for name in THRESHOLDED_MULTILABEL_METRICS: @@ -873,10 +863,8 @@ def test_normalize_option_multilabel_classification(): measure = metrics(y_true, y_pred, normalize=True) assert_greater(measure, 0, msg="We failed to test correctly the normalize option") - unnormalize_measure = metrics(y_true, y_pred, normalize=False) - if isinstance(unnormalize_measure, np.ndarray): - unnormalize_measure = np.sum(unnormalize_measure) - assert_almost_equal(unnormalize_measure / n_samples, measure, + assert_almost_equal(metrics(y_true, y_pred, normalize=False) + / n_samples, measure, err_msg="Failed with %s" % name) From d7fe5caf0f9d8c82e6c9252ba3b707bba53e3f7c Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 16 Jan 2018 18:53:27 +0530 Subject: [PATCH 45/88] fix average='weighted' --- sklearn/metrics/classification.py | 12 ++++++++---- sklearn/metrics/tests/test_common.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 4ea184980676a..7455d0e2eab41 100644 --- a/sklearn/metrics/classification.py +++ 
b/sklearn/metrics/classification.py @@ -478,10 +478,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS 0.4722... - >>> jaccard_similarity_score(y_true, y_pred) + >>> jaccard_similarity_score(y_true, y_pred, average=None) ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE array([ 0.66..., 0. , 0.5 ]) - >>> jaccard_similarity_score(y_true, y_pred, + >>> jaccard_similarity_score(y_true, y_pred, average=None, ... labels=['ant', 'cat', 'bird']) ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE array([ 0.66..., 0.5 , 0. ]) @@ -560,7 +560,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, weights = sample_weight elif average == 'weighted': sum_axis = 0 - class_weight = y_true.toarray().sum(axis=0) + if sample_weight is None: + class_weight = y_true.toarray().sum(axis=0) + else: + class_weight = (y_true.toarray().T).dot(sample_weight) weights = sample_weight if class_weight.sum() == 0: return 0 @@ -586,7 +589,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = np.dot(score, class_weight) else: score = score.sum() - score = np.average(score, weights=class_weight) + else: + score = np.average(score, weights=class_weight) return score elif average == 'samples': raise ValueError("Sample-based jaccard similarity score is " diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 19113172b7559..148b7bab77a59 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -489,7 +489,7 @@ def test_sample_order_invariance_multilabel_and_multioutput(): assert_almost_equal(metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" - % name) + % name) for name in THRESHOLDED_MULTILABEL_METRICS: metric = ALL_METRICS[name] From 414ae8badb14323721fed224bbc90222aab1e241 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 17 Jan 2018 04:00:41 +0530 Subject: [PATCH 46/88] use np.in1d instead of np.isin (unavailable in version < 1.13.0) --- sklearn/metrics/classification.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7455d0e2eab41..1c56930ab1976 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -603,8 +603,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, y_pred = le.transform(y_pred) labels = le.transform(labels)[:n_labels] - indices = np.where(np.isin(y_true, labels) + - np.isin(y_pred, labels) == True)[0] + # use 'np.in1d' instead of 'np.isin' (unavailable in version < 1.13.0) + indices = np.where(np.in1d(y_true, labels, assume_unique=False, + invert=False) + + np.in1d(y_pred, labels, assume_unique=False, + invert=False) == True)[0] y_true = y_true[indices] y_pred = y_pred[indices] From 3ac79bdcdbd216f29cd1b9abb62064282edd96e2 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 17 Jan 2018 15:36:38 +0530 Subject: [PATCH 47/88] address Joel's comments --- sklearn/metrics/classification.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 1c56930ab1976..b65bc0ad7e58a 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -545,6 +545,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, 
pos_label=1, y_pred = y_pred[:, labels[:n_labels]] with np.errstate(divide='ignore', invalid='ignore'): + class_weight = None if average == 'samples': sum_axis = 1 @@ -560,11 +561,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, weights = sample_weight elif average == 'weighted': sum_axis = 0 + weights = sample_weight if sample_weight is None: class_weight = y_true.toarray().sum(axis=0) else: class_weight = (y_true.toarray().T).dot(sample_weight) - weights = sample_weight if class_weight.sum() == 0: return 0 else: @@ -622,33 +623,39 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if len(tp_bins): tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels))[labels] - else: - # pathological case - true_sum = pred_sum = tp_sum = np.zeros(len(labels)) - - if len(y_pred): - pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(labels))[labels] - if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels))[labels] + pred_sum = np.bincount(y_pred, weights=sample_weight, + minlength=len(labels))[labels] + else: + tp_sum = np.zeros(len(labels)) + if len(y_true): + true_sum = np.bincount(y_true, weights=sample_weight, + minlength=len(labels))[labels] + else: + true_sum = np.zeros(len(labels)) + if len(y_pred): + pred_sum = np.bincount(y_pred, weights=sample_weight, + minlength=len(labels))[labels] + else: + pred_sum = np.zeros(len(labels)) if average == 'micro' or average == 'binary': tp_sum = np.array([tp_sum.sum()]) true_sum = np.array([true_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) - weights = None + class_weight = None elif average == 'macro': - weights = None + class_weight = None elif average == 'weighted': - weights = true_sum - if weights.sum() == 0: + class_weight = true_sum + if class_weight.sum() == 0: return 0 score = tp_sum / (true_sum + pred_sum - tp_sum) if average is not None: - score = np.average(score, weights=weights) + score = np.average(score, weights=class_weight) return score From 3673407a464f7c23fe59955e7ffb3e6ef2ef3afb Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 17 Jan 2018 17:59:56 +0530 Subject: [PATCH 48/88] fix lgtm error --- sklearn/metrics/classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index b65bc0ad7e58a..7d34c24711759 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -647,7 +647,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, class_weight = None elif average == 'macro': class_weight = None - elif average == 'weighted': + else: + # average='weighted' class_weight = true_sum if class_weight.sum() == 0: return 0 From d3d7ca931bd4514648b1dd8dceda970e783fb822 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 17 Jan 2018 18:02:43 +0530 Subject: [PATCH 49/88] fix lgtm --- sklearn/metrics/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7d34c24711759..74d342208bee0 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -640,6 +640,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, else: pred_sum = np.zeros(len(labels)) + class_weight = None if average == 'micro' or average == 'binary': tp_sum = np.array([tp_sum.sum()]) true_sum = np.array([true_sum.sum()]) @@ -647,8 +648,7 @@ def 
jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, class_weight = None elif average == 'macro': class_weight = None - else: - # average='weighted' + elif average == 'weighted': class_weight = true_sum if class_weight.sum() == 0: return 0 From 04768c50e13b2d54d3c1b502fd4bf988450a46fe Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 18 Jan 2018 10:31:22 +0530 Subject: [PATCH 50/88] Fix flake8 errors --- sklearn/metrics/classification.py | 14 ++++++------- sklearn/metrics/tests/test_classification.py | 22 ++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 74d342208bee0..587da76a4529b 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -503,11 +503,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, return 1. else: raise ValueError("pos_label=%r is not a valid label: " - "%r" % (pos_label, present_labels)) + "%r" % (pos_label, present_labels)) labels = [pos_label] else: raise ValueError("Target is %s but average='binary'. Please " - "choose another average setting." % y_type) + "choose another average setting." % y_type) elif pos_label not in (None, 1): warnings.warn("Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " @@ -585,7 +585,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score[pred_or_true == 0.0] = 1.0 if average is not None: - if normalize == False: + if normalize is False: if class_weight is not None: score = np.dot(score, class_weight) else: @@ -608,7 +608,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, indices = np.where(np.in1d(y_true, labels, assume_unique=False, invert=False) + np.in1d(y_pred, labels, assume_unique=False, - invert=False) == True)[0] + invert=False))[0] y_true = y_true[indices] y_pred = y_pred[indices] @@ -1307,16 +1307,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if len(tp_bins): tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels)) + minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(labels)) + minlength=len(labels)) if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(labels)) + minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 003379e1df9a7..e677d28def36c 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -966,11 +966,11 @@ def test_jaccard_similarity_score(): y_pred, average='samples', normalize=False) assert_warns_message(UserWarning, - "Note that pos_label (set to 3) is ignored when " - "average != 'binary' (got 'micro'). You may use " - "labels=[pos_label] to specify a single positive " - "class.", jaccard_similarity_score, y_true, y_pred, - average='micro', pos_label=3) + "Note that pos_label (set to 3) is ignored when " + "average != 'binary' (got 'micro'). 
You may use " + "labels=[pos_label] to specify a single positive " + "class.", jaccard_similarity_score, y_true, y_pred, + average='micro', pos_label=3) def test_multilabel_jaccard_similarity_score(): @@ -996,7 +996,7 @@ def test_multilabel_jaccard_similarity_score(): average='macro'), 2. / 3) # average='micro' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='micro'), 3. / 5) + average='micro'), 3. / 5) # average='samples' (default) assert_almost_equal(jaccard_similarity_score(y_true, y_pred), 7. / 12) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, @@ -1007,9 +1007,9 @@ def test_multilabel_jaccard_similarity_score(): labels=[1, 2]), 1. / 2) # average='samples', normalize=False assert_almost_equal(jaccard_similarity_score(y_true, y_pred, - average='samples', - normalize=False), - 7. / 6) + average='samples', + normalize=False), + 7. / 6) # average=None assert_array_equal(jaccard_similarity_score(y_true, y_pred, average=None), np.array([1. / 2, 1., 1. / 2])) @@ -1033,8 +1033,8 @@ def test_multiclass_jaccard_similarity_score(): y_pred_bin = lb.transform(y_pred) multi_jaccard_similarity_score = partial(jaccard_similarity_score, y_true, y_pred) - bin_jaccard_similarity_score = partial(jaccard_similarity_score, y_true_bin - , y_pred_bin) + bin_jaccard_similarity_score = partial(jaccard_similarity_score, + y_true_bin, y_pred_bin) multi_labels_list = [['ant', 'bird'], ['ant', 'cat'], ['cat', 'bird'], ['ant'], ['bird'], ['cat'], None] bin_labels_list = [[0, 1], [0, 2], [2, 1], [0], [1], [2], None] From 54fe344e3ed8f2ff1ab54db057ce7b07608884d3 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 18 Jan 2018 12:19:15 +0530 Subject: [PATCH 51/88] code coverage --- sklearn/metrics/classification.py | 41 ++++++++++---------- sklearn/metrics/tests/test_classification.py | 25 ++++++++++-- sklearn/metrics/tests/test_common.py | 24 +++++------- 3 files changed, 52 insertions(+), 38 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 587da76a4529b..28cdb8ad58843 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -430,10 +430,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, meaningful for multilabel classification). normalize : bool, optional (default=True) - If ``False``, return an array of Jaccard similarity coefficient for - each samples over the sample set. Otherwise, return the average of - Jaccard similarity coefficient. 'normalize' is only to be specified in - case `average='samples'`. + If ``False``, return the sum of the Jaccard similarity coefficient + over the sample set. Otherwise, return the average of Jaccard + similarity coefficient. ``normalize`` is only meaningful when + ``average='samples'``. sample_weight : array-like of shape = [n_samples], optional Sample weights. 
@@ -640,24 +640,25 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, else: pred_sum = np.zeros(len(labels)) - class_weight = None - if average == 'micro' or average == 'binary': - tp_sum = np.array([tp_sum.sum()]) - true_sum = np.array([true_sum.sum()]) - pred_sum = np.array([pred_sum.sum()]) - class_weight = None - elif average == 'macro': + with np.errstate(divide='ignore', invalid='ignore'): class_weight = None - elif average == 'weighted': - class_weight = true_sum - if class_weight.sum() == 0: - return 0 + if average == 'micro' or average == 'binary': + tp_sum = np.array([tp_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) + class_weight = None + elif average == 'macro': + class_weight = None + elif average == 'weighted': + class_weight = true_sum + if class_weight.sum() == 0: + return 0 - score = tp_sum / (true_sum + pred_sum - tp_sum) + score = tp_sum / (true_sum + pred_sum - tp_sum) - if average is not None: - score = np.average(score, weights=class_weight) - return score + if average is not None: + score = np.average(score, weights=class_weight) + return score def matthews_corrcoef(y_true, y_pred, sample_weight=None): @@ -1267,7 +1268,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, # Select labels: if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n labels). ' + raise ValueError('All labels must be in [0, n, labels). ' 'Got %d > %d' % (np.max(labels), np.max(present_labels))) if np.min(labels) < 0: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index e677d28def36c..0ae50e99c1410 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -940,10 +940,13 @@ def test_multilabel_hamming_loss(): def test_jaccard_similarity_score(): - y_true = np.array([0, 0]) - y_pred = np.array([0, 0]) + y_true = np.array([0, 1, 0, 1, 1]) + y_pred = np.array([0, 1, 0, 1, 1]) assert_equal(jaccard_similarity_score(y_true, y_pred, average='binary', - pos_label=-1), 1.) + pos_label=0), 1.) + assert_raise_message(ValueError, "pos_label=2 is not a valid label: " + "array([0, 1])", jaccard_similarity_score, y_true, + y_pred, average='binary', pos_label=2) y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) @@ -1021,6 +1024,17 @@ def test_multilabel_jaccard_similarity_score(): # average='weighted' assert_almost_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 7. / 8) + # normalize error + msg1 = ("'normalize' is only meaningful with `average='samples'`, got " + "`average='macro'`.") + assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, + y_pred, average='macro', normalize=False) + msg2 = 'All labels must be in [0, n, labels). Got 4 > 2' + assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, + y_pred, labels=[4]) + msg3 = 'All labels must be in [0, n, labels). Got -1 < 0' + assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, + y_pred, labels=[-1]) def test_multiclass_jaccard_similarity_score(): @@ -1047,6 +1061,11 @@ def test_multiclass_jaccard_similarity_score(): bin_jaccard_similarity_score(average=average, labels=b_label)) + y_true = np.array([]) + y_pred = np.array([]) + assert_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), + 0.) 
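# [Illustrative aside, not part of the patch series.] The loop above that zips
# multi_labels_list with bin_labels_list checks that scoring multiclass labels
# directly agrees with binarizing them and scoring each class column as a
# multilabel problem. A NumPy sketch of that per-class computation, on
# illustrative string labels (not necessarily the test fixture):
import numpy as np
from sklearn.preprocessing import LabelBinarizer

y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat', 'bird', 'bird']
y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird', 'bird', 'cat']

lb = LabelBinarizer().fit(y_true)
T, P = lb.transform(y_true), lb.transform(y_pred)

# Per-class Jaccard on the binarized columns: |T & P| / |T | P| for each label.
inter = np.logical_and(T, P).sum(axis=0)
union = np.logical_or(T, P).sum(axis=0)
print(dict(zip(lb.classes_, inter / union)))   # ant: 2/3, bird: 1/3, cat: 2/5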
+ def test_average_binary_jaccard_similarity_score(): y_true = np.array([1, 0, 1, 1, 0]) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 148b7bab77a59..018a7ea8d9959 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1004,25 +1004,19 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the weighted and unweighted scores are unequal weighted_score = metric(y1, y2, sample_weight=sample_weight) - if isinstance(weighted_score, np.ndarray): - assert(not np.allclose(weighted_score, unweighted_score)) - else: - assert_not_equal( - unweighted_score, weighted_score, - msg="Unweighted and weighted scores are unexpectedly " - "equal (%r) for %s" % (weighted_score, name)) + assert_not_equal( + unweighted_score, weighted_score, + msg="Unweighted and weighted scores are unexpectedly " + "equal (%f) for %s" % (weighted_score, name)) # check that sample_weight can be a list weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist()) - if isinstance(weighted_score, np.ndarray): - assert(np.allclose(weighted_score, weighted_score_list)) - else: - assert_almost_equal( - weighted_score, weighted_score_list, - err_msg=("Weighted scores for array and list " - "sample_weight input are not equal (%f != %f) for %s") % ( - weighted_score, weighted_score_list, name)) + assert_almost_equal( + weighted_score, weighted_score_list, + err_msg=("Weighted scores for array and list " + "sample_weight input are not equal (%f != %f) for %s") % ( + weighted_score, weighted_score_list, name)) # check that integer weights is the same as repeated samples repeat_weighted_score = metric( From ee548532f4bd1c6b57ee7d078353ae8dcd27703d Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 18 Jan 2018 13:04:38 +0530 Subject: [PATCH 52/88] fix flake8 --- sklearn/metrics/tests/test_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 0ae50e99c1410..846781fb4b618 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1026,7 +1026,7 @@ def test_multilabel_jaccard_similarity_score(): average='weighted'), 7. / 8) # normalize error msg1 = ("'normalize' is only meaningful with `average='samples'`, got " - "`average='macro'`.") + "`average='macro'`.") assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, y_pred, average='macro', normalize=False) msg2 = 'All labels must be in [0, n, labels). Got 4 > 2' From 551804d9c2a251b54dc7888ab5d068e2533a24ce Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 18 Jan 2018 16:54:48 +0530 Subject: [PATCH 53/88] improve doc --- doc/modules/model_evaluation.rst | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c076c0081c6e6..829b674861d77 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -70,6 +70,7 @@ Scoring Function 'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support 'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' 'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' +'jaccard' etc. 
:func:`metric.jaccard_similarity_score` suffixes apply as with 'f1' 'roc_auc' :func:`metrics.roc_auc_score` **Clustering** @@ -667,29 +668,33 @@ with a ground truth label set :math:`y_i` and predicted label set J(y_i, \hat{y}_i) = \frac{|y_i \cap \hat{y}_i|}{|y_i \cup \hat{y}_i|}. -In binary and multiclass classification, the Jaccard similarity coefficient -score is equal to the classification accuracy. +:func:`jaccard_similarity_score` works like :func:`precision_recall_fscore_support` +as a naively set-wise measure applying only to binary and multilabel targets. -:: +In the multilabel case with binary label indicators: :: >>> import numpy as np >>> from sklearn.metrics import jaccard_similarity_score + >>> y_true = np.array([[0, 1], [1, 1]]) + >>> y_pred = np.ones((2, 2)) + >>> jaccard_similarity_score(y_true, y_pred) + 0.75 + >>> jaccard_similarity_score(y_true, y_pred, normalize=False) + 1.5 + +For which multiclass problems are binarized. + +:: + >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] >>> jaccard_similarity_score(y_true, y_pred, average='macro') 0.5 >>> jaccard_similarity_score(y_true, y_pred, average='micro') 0.33... - >>> jaccard_similarity_score(y_true, y_pred, average='weighted') - 0.5 >>> jaccard_similarity_score(y_true, y_pred, average=None) array([ 1., 0., 0., 1.]) -In the multilabel case with binary label indicators: :: - - >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) - 0.75 - .. _precision_recall_f_measure_metrics: Precision, recall and F-measures From 0d45a444e039a3f0e00d60a68e4f8e672ed1316c Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Mon, 22 Jan 2018 21:27:35 +0530 Subject: [PATCH 54/88] add what's new entry and address Joel's comments --- doc/modules/model_evaluation.rst | 4 +-- doc/whats_new/v0.20.rst | 11 ++++++ sklearn/metrics/classification.py | 51 +++++++++++----------------- sklearn/metrics/tests/test_common.py | 2 +- sklearn/svm/base.py | 2 +- 5 files changed, 34 insertions(+), 36 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index b9e77651568cf..cebd1b45dac31 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -683,9 +683,7 @@ In the multilabel case with binary label indicators: :: >>> jaccard_similarity_score(y_true, y_pred, normalize=False) 1.5 -For which multiclass problems are binarized. - -:: +Multiclass problems are binarized: :: >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 7e7d39dbf1759..65ceaa24c1614 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -149,6 +149,12 @@ Metrics - :func:`metrics.roc_auc_score` now supports binary ``y_true`` other than ``{0, 1}`` or ``{-1, 1}``. :issue:`9828` by :user:`Hanmin Qin `. +- :func:`metrics.jaccard_similarity_score` now accepts ``average`` argument + like :func:`metrics.precision_recall_fscore_support` as a naively set-wise + measure applying only to binary, multilabel targets and binarizing + multiclass input. + :issue:`10083` by :user:`Gaurav Dhingra `. + Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the @@ -266,6 +272,11 @@ Decomposition, manifold learning and clustering Metrics +- Fixed a bug in :func:`metrics.jaccard_similarity_score`, to disallow + sample-wise averaging for 1d input, since it is redundantly equal to + :func:`metrics.accuracy_score`. + :issue:`10083` by :user:`Gaurav Dhingra `. 
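(Illustrative aside, not part of the patch series.) The "binarizing multiclass
input" behaviour this entry describes can be reproduced with plain NumPy; the
sketch below matches the ``average='macro'``, ``'micro'`` and ``None`` values
shown in the documentation example above::

    import numpy as np
    from sklearn.preprocessing import label_binarize

    classes = [0, 1, 2, 3]
    T = label_binarize([0, 1, 2, 3], classes=classes)   # y_true, binarized
    P = label_binarize([0, 2, 1, 3], classes=classes)   # y_pred, binarized

    per_class = (np.logical_and(T, P).sum(axis=0)
                 / np.logical_or(T, P).sum(axis=0))     # average=None -> [1, 0, 0, 1]
    macro = per_class.mean()                            # average='macro' -> 0.5
    micro = (np.logical_and(T, P).sum()
             / np.logical_or(T, P).sum())               # average='micro' -> 1/3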
+ - Fixed a bug in :func:`metrics.precision_precision_recall_fscore_support` when truncated `range(n_labels)` is passed as value for `labels`. :issue:`10377` by :user:`Gaurav Dhingra `. diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 290c5be87720e..34696fbca4c60 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,7 @@ class labels [2]_. def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average='samples', normalize=None, + average='samples', normalize=True, sample_weight=None): """Jaccard similarity coefficient score @@ -440,7 +440,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Returns ------- - score: float (if average is not None) or array of float, shape =\ + score: float (if average is not None) or array of floats, shape =\ [n_unique_labels] See also @@ -459,32 +459,24 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, In the multilabel case with binary label indicators: - >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]), - ... np.ones((2, 2))) - 0.75 - - In the multiclass case: - - >>> y_pred = [0, 2, 1, 3] - >>> y_true = [0, 1, 2, 3] - >>> jaccard_similarity_score(y_true, y_pred, average='macro') - 0.5 + >>> y_true = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]]) + >>> y_pred = np.array([[0, 1, 1], [1, 1, 1], [0, 0, 1]]) + >>> jaccard_similarity_score(y_true, y_pred) + ... # doctest: +ELLIPSIS + 0.33... >>> jaccard_similarity_score(y_true, y_pred, average='micro') ... # doctest: +ELLIPSIS - 0.333... - - >>> y_pred = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'] - >>> y_true = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'] + 0.33... >>> jaccard_similarity_score(y_true, y_pred, average='weighted') - ... # doctest: +ELLIPSIS - 0.4722... + 0.5 >>> jaccard_similarity_score(y_true, y_pred, average=None) - ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE - array([ 0.66..., 0. , 0.5 ]) - >>> jaccard_similarity_score(y_true, y_pred, average=None, - ... labels=['ant', 'cat', 'bird']) - ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE - array([ 0.66..., 0.5 , 0. ]) + array([ 0., 0., 1.]) + + In the multiclass case: + + >>> jaccard_similarity_score(np.array([0, 2, 1, 3]), + ... np.array([0, 1, 2, 3]), average='macro') + 0.5 """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') @@ -523,10 +515,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): - if average == 'samples': - if normalize is None: - normalize = True - elif normalize is not None: + if average != 'samples' and not normalize: raise ValueError("'normalize' is only meaningful with " "`average='samples'`, got `average='%s'`." 
% average) @@ -585,7 +574,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score[pred_or_true == 0.0] = 1.0 if average is not None: - if normalize is False: + if not normalize: if class_weight is not None: score = np.dot(score, class_weight) else: @@ -640,7 +629,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, else: pred_sum = np.zeros(len(labels)) - with np.errstate(divide='ignore', invalid='ignore'): + with np.errstate(divide='ignore'): class_weight = None if average == 'micro' or average == 'binary': tp_sum = np.array([tp_sum.sum()]) @@ -1268,7 +1257,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, # Select labels: if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n, labels). ' + raise ValueError('All labels must be in [0, n labels). ' 'Got %d > %d' % (np.max(labels), np.max(present_labels))) if np.min(labels) < 0: diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 1570479150d97..11951076d92db 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1015,7 +1015,7 @@ def check_sample_weight_invariance(name, metric, y1, y2): weighted_score, weighted_score_list, err_msg=("Weighted scores for array and list " "sample_weight input are not equal (%f != %f) for %s") % ( - weighted_score, weighted_score_list, name)) + weighted_score, weighted_score_list, name)) # check that integer weights is the same as repeated samples repeat_weighted_score = metric( diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 60442d6bbeee1..eb5bb01508953 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -543,7 +543,7 @@ def predict(self, X): # estimators. def _check_proba(self): if not self.probability: - raise AttributeError("predict_proba is not available when" + raise AttributeError("predict_proba is not available when " " probability=False") if self._impl not in ('c_svc', 'nu_svc'): raise AttributeError("predict_proba only implemented for SVC" From 785bb36c06c26a9446f9b2c10b547ad445ab05bf Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Mon, 22 Jan 2018 21:48:36 +0530 Subject: [PATCH 55/88] improve doc's entry --- doc/whats_new/v0.20.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 65ceaa24c1614..b487b4473d4be 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -273,7 +273,7 @@ Decomposition, manifold learning and clustering Metrics - Fixed a bug in :func:`metrics.jaccard_similarity_score`, to disallow - sample-wise averaging for 1d input, since it is redundantly equal to + sample-wise averaging of multiclass input, since it is redundantly equal to :func:`metrics.accuracy_score`. :issue:`10083` by :user:`Gaurav Dhingra `. From 7c1314a90f285e2773c952282096fafc7ab23a8a Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 23 Jan 2018 11:31:26 +0530 Subject: [PATCH 56/88] use normalize='true-if-samples' for internal use --- sklearn/metrics/classification.py | 4 ++-- sklearn/metrics/tests/test_classification.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 34696fbca4c60..a9bc1a2ccd63f 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -373,7 +373,7 @@ class labels [2]_. 
def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, - average='samples', normalize=True, + average='samples', normalize='true-if-samples', sample_weight=None): """Jaccard similarity coefficient score @@ -515,7 +515,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, assume_unique=True)]) if y_type.startswith('multilabel'): - if average != 'samples' and not normalize: + if average != 'samples' and normalize != 'true-if-samples': raise ValueError("'normalize' is only meaningful with " "`average='samples'`, got `average='%s'`." % average) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index ce28406d214a1..d12f17edbe7b5 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1051,6 +1051,8 @@ def test_multilabel_jaccard_similarity_score(): "`average='macro'`.") assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, y_pred, average='macro', normalize=False) + assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, + y_pred, average='macro', normalize=True) msg2 = 'All labels must be in [0, n, labels). Got 4 > 2' assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, y_pred, labels=[4]) From 90e0c5c5e5709f3249db01c367ad7e3030adfbd7 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Thu, 25 Jan 2018 05:13:16 +0530 Subject: [PATCH 57/88] address Joel's comments all, but one --- doc/modules/model_evaluation.rst | 5 +- doc/whats_new/v0.20.rst | 4 +- sklearn/metrics/classification.py | 97 ++++++++++++++-------------- sklearn/metrics/tests/test_common.py | 3 +- 4 files changed, 55 insertions(+), 54 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index cebd1b45dac31..4bd65bd87176e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -70,7 +70,7 @@ Scoring Function 'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support 'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' 'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' -'jaccard' etc. :func:`metric.jaccard_similarity_score` suffixes apply as with 'f1' +'jaccard' etc. :func:`metrics.jaccard_similarity_score` suffixes apply as with 'f1' 'roc_auc' :func:`metrics.roc_auc_score` **Clustering** @@ -683,7 +683,8 @@ In the multilabel case with binary label indicators: :: >>> jaccard_similarity_score(y_true, y_pred, normalize=False) 1.5 -Multiclass problems are binarized: :: +Multiclass problems are binarized and treated like the corresponding +multilabel problem: :: >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index b487b4473d4be..2eab043ecf3c5 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -151,8 +151,8 @@ Metrics - :func:`metrics.jaccard_similarity_score` now accepts ``average`` argument like :func:`metrics.precision_recall_fscore_support` as a naively set-wise - measure applying only to binary, multilabel targets and binarizing - multiclass input. + measure applying only to binary, multilabel targets and it binarizes + multiclass input and treats them like the corresponding multilabel problem. :issue:`10083` by :user:`Gaurav Dhingra `. 
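(Illustrative aside, not part of the patch series.) The ``normalize='true-if-samples'``
default introduced in the previous commit is a sentinel value: it lets the function
distinguish "``normalize`` was never passed" from an explicit ``normalize=True`` or
``False``, so that any explicit value can be rejected when ``average != 'samples'``
(which is what the ``normalize=True`` test added in that commit asserts). A generic
sketch of that pattern, using hypothetical names::

    _NOT_GIVEN = 'true-if-samples'   # sentinel default meaning "not passed"

    def score(average='samples', normalize=_NOT_GIVEN):
        # Reject an explicit True *or* False outside average='samples'.
        if average != 'samples' and normalize != _NOT_GIVEN:
            raise ValueError("'normalize' is only meaningful with "
                             "average='samples', got average=%r" % average)
        if normalize == _NOT_GIVEN:
            normalize = True         # fall back to the old default behaviour
        return average, normalize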
Linear, kernelized and related models diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index a9bc1a2ccd63f..26a247475f9be 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -533,43 +533,44 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, y_true = y_true[:, labels[:n_labels]] y_pred = y_pred[:, labels[:n_labels]] - with np.errstate(divide='ignore', invalid='ignore'): + class_weight = None + + if average == 'samples': + sum_axis = 1 + class_weight = sample_weight + weights = None + elif average == 'micro': + sum_axis = 1 class_weight = None - - if average == 'samples': - sum_axis = 1 - class_weight = sample_weight - weights = None - elif average == 'micro': - sum_axis = 1 - class_weight = None - weights = sample_weight - elif average == 'macro': - sum_axis = 0 - class_weight = None - weights = sample_weight - elif average == 'weighted': - sum_axis = 0 - weights = sample_weight - if sample_weight is None: - class_weight = y_true.toarray().sum(axis=0) - else: - class_weight = (y_true.toarray().T).dot(sample_weight) - if class_weight.sum() == 0: - return 0 + weights = sample_weight + elif average == 'macro': + sum_axis = 0 + class_weight = None + weights = sample_weight + elif average == 'weighted': + sum_axis = 0 + weights = sample_weight + if sample_weight is None: + class_weight = y_true.toarray().sum(axis=0) else: - sum_axis = 0 - weights = sample_weight - - pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis, - sample_weight=weights) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=sum_axis, - sample_weight=weights) - if average == 'micro': - pred_or_true = np.array([pred_or_true.sum()]) - pred_and_true = np.array([pred_and_true.sum()]) + class_weight = (y_true.toarray().T).dot(sample_weight) + if class_weight.sum() == 0: + return 0 + else: + # average=None + sum_axis = 0 + weights = sample_weight + + pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis, + sample_weight=weights) + pred_and_true = count_nonzero(y_true.multiply(y_pred), + axis=sum_axis, + sample_weight=weights) + if average == 'micro': + pred_or_true = np.array([pred_or_true.sum()]) + pred_and_true = np.array([pred_and_true.sum()]) + with np.errstate(divide='ignore', invalid='ignore'): score = pred_and_true / pred_or_true score[pred_or_true == 0.0] = 1.0 @@ -581,7 +582,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, score = score.sum() else: score = np.average(score, weights=class_weight) - return score + return score elif average == 'samples': raise ValueError("Sample-based jaccard similarity score is " "not meaningful outside multilabel " @@ -629,25 +630,25 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, else: pred_sum = np.zeros(len(labels)) - with np.errstate(divide='ignore'): + class_weight = None + if average == 'micro' or average == 'binary': + tp_sum = np.array([tp_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) class_weight = None - if average == 'micro' or average == 'binary': - tp_sum = np.array([tp_sum.sum()]) - true_sum = np.array([true_sum.sum()]) - pred_sum = np.array([pred_sum.sum()]) - class_weight = None - elif average == 'macro': - class_weight = None - elif average == 'weighted': - class_weight = true_sum - if class_weight.sum() == 0: - return 0 + elif average == 'macro': + class_weight = None + elif average == 'weighted': + class_weight = true_sum + if class_weight.sum() == 0: + 
return 0 + with np.errstate(divide='ignore', invalid='ignore'): score = tp_sum / (true_sum + pred_sum - tp_sum) if average is not None: score = np.average(score, weights=class_weight) - return score + return score def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 11951076d92db..b18cc76c7078d 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -435,8 +435,7 @@ def test_symmetry(): # Symmetric metric for name in SYMMETRIC_METRICS: metric = ALL_METRICS[name] - if (name in METRIC_UNDEFINED_BINARY and - name in METRIC_UNDEFINED_BINARY): + if name in METRIC_UNDEFINED_BINARY: assert_almost_equal(metric(y_true_bin, y_pred_bin), metric(y_pred_bin, y_true_bin), err_msg="%s is not symmetric" % name) From 8ff62bc76d5f80b1ab35c710d2e0389193efa123 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 04:54:40 +0530 Subject: [PATCH 58/88] add jaccard similarity score to scorers --- sklearn/metrics/scorer.py | 8 ++++++-- sklearn/metrics/tests/test_score_objects.py | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 05231826a8998..67df2b5482808 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -27,7 +27,8 @@ mean_squared_error, mean_squared_log_error, accuracy_score, f1_score, roc_auc_score, average_precision_score, precision_score, recall_score, log_loss, balanced_accuracy_score, - explained_variance_score, brier_score_loss) + explained_variance_score, brier_score_loss, + jaccard_similarity_score) from .cluster import adjusted_rand_score from .cluster import homogeneity_score @@ -41,6 +42,7 @@ from ..utils.multiclass import type_of_target from ..externals import six from ..base import is_regressor +from functools import partial class _BaseScorer(six.with_metaclass(ABCMeta, object)): @@ -501,6 +503,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, accuracy_scorer = make_scorer(accuracy_score) f1_scorer = make_scorer(f1_score) balanced_accuracy_scorer = make_scorer(balanced_accuracy_score) +jaccard_similarity_scorer = make_scorer(jaccard_similarity_score) # Score functions that need decision values roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, @@ -561,7 +564,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, for name, metric in [('precision', precision_score), - ('recall', recall_score), ('f1', f1_score)]: + ('recall', recall_score), ('f1', f1_score), + ('jaccard_similarity', partial(jaccard_similarity_score, average='binary'))]: SCORERS[name] = make_scorer(metric) for average in ['macro', 'micro', 'samples', 'weighted']: qualified_name = '{0}_{1}'.format(name, average) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6af6418635d59..6e2de82a8b503 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -19,7 +19,8 @@ from sklearn.base import BaseEstimator from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, - log_loss, precision_score, recall_score) + log_loss, precision_score, recall_score, + jaccard_similarity_score) from sklearn.metrics import cluster as cluster_module from sklearn.metrics.scorer import (check_scoring, _PredictScorer, _passthrough_scorer) @@ -39,6 +40,10 @@ from sklearn.model_selection import GridSearchCV 
from sklearn.multiclass import OneVsRestClassifier from sklearn.externals import joblib +from functools import partial + +jaccard_similarity_score = partial(jaccard_similarity_score, average='binary') + REGRESSION_SCORERS = ['explained_variance', 'r2', @@ -52,7 +57,9 @@ 'roc_auc', 'average_precision', 'precision', 'precision_weighted', 'precision_macro', 'precision_micro', 'recall', 'recall_weighted', 'recall_macro', 'recall_micro', - 'neg_log_loss', 'log_loss', 'brier_score_loss'] + 'neg_log_loss', 'log_loss', 'brier_score_loss', 'jaccard_similarity', + 'jaccard_similarity_weighted', 'jaccard_similarity_macro', + 'jaccard_similarity_micro'] # All supervised cluster scorers (They behave like classification metric) CLUSTER_SCORERS = ["adjusted_rand_score", @@ -64,7 +71,8 @@ "normalized_mutual_info_score", "fowlkes_mallows_score"] -MULTILABEL_ONLY_SCORERS = ['precision_samples', 'recall_samples', 'f1_samples'] +MULTILABEL_ONLY_SCORERS = ['precision_samples', 'recall_samples', 'f1_samples', + 'jaccard_similarity_samples'] def _make_estimators(X_train, y_train, y_ml_train): @@ -283,7 +291,8 @@ def test_classification_scores(): clf.fit(X_train, y_train) for prefix, metric in [('f1', f1_score), ('precision', precision_score), - ('recall', recall_score)]: + ('recall', recall_score), + ('jaccard_similarity', jaccard_similarity_score)]: score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, From f5d03d0696cb749095373b816fd5f11774daba6f Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 11:11:36 +0530 Subject: [PATCH 59/88] use make_scorer with average='binary' --- sklearn/metrics/scorer.py | 4 ++-- sklearn/metrics/tests/test_score_objects.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 67df2b5482808..2796302f911fd 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -565,8 +565,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, for name, metric in [('precision', precision_score), ('recall', recall_score), ('f1', f1_score), - ('jaccard_similarity', partial(jaccard_similarity_score, average='binary'))]: - SCORERS[name] = make_scorer(metric) + ('jaccard_similarity', jaccard_similarity_score)]: + SCORERS[name] = make_scorer(metric, average='binary') for average in ['macro', 'micro', 'samples', 'weighted']: qualified_name = '{0}_{1}'.format(name, average) SCORERS[qualified_name] = make_scorer(metric, pos_label=None, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6e2de82a8b503..f7725213d4549 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -42,8 +42,6 @@ from sklearn.externals import joblib from functools import partial -jaccard_similarity_score = partial(jaccard_similarity_score, average='binary') - REGRESSION_SCORERS = ['explained_variance', 'r2', @@ -292,7 +290,9 @@ def test_classification_scores(): for prefix, metric in [('f1', f1_score), ('precision', precision_score), ('recall', recall_score), - ('jaccard_similarity', jaccard_similarity_score)]: + ('jaccard_similarity', + partial(jaccard_similarity_score, + average='binary'))]: score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, From c3279ff63afe2919b1956b02d60a46e3cd03d64d Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 
2018 11:13:09 +0530 Subject: [PATCH 60/88] fix import and pep8 --- sklearn/metrics/scorer.py | 1 - sklearn/metrics/tests/test_score_objects.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 2796302f911fd..32ad695027030 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -42,7 +42,6 @@ from ..utils.multiclass import type_of_target from ..externals import six from ..base import is_regressor -from functools import partial class _BaseScorer(six.with_metaclass(ABCMeta, object)): diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index f7725213d4549..03f7233aa09af 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -55,9 +55,9 @@ 'roc_auc', 'average_precision', 'precision', 'precision_weighted', 'precision_macro', 'precision_micro', 'recall', 'recall_weighted', 'recall_macro', 'recall_micro', - 'neg_log_loss', 'log_loss', 'brier_score_loss', 'jaccard_similarity', - 'jaccard_similarity_weighted', 'jaccard_similarity_macro', - 'jaccard_similarity_micro'] + 'neg_log_loss', 'log_loss', 'brier_score_loss', + 'jaccard_similarity', 'jaccard_similarity_weighted', + 'jaccard_similarity_macro', 'jaccard_similarity_micro'] # All supervised cluster scorers (They behave like classification metric) CLUSTER_SCORERS = ["adjusted_rand_score", From a6736831b08791db2e9270c184872a1eab0505ba Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 11:36:00 +0530 Subject: [PATCH 61/88] fix doc --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 4bd65bd87176e..21aa6c2a99da1 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -105,7 +105,7 @@ Usage examples: >>> model = svm.SVC() >>> cross_val_score(model, X, y, scoring='wrong_choice') Traceback (most recent call last): - ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score'] + ValueError: 'wrong_choice' is not a valid scoring value. 
Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard_similarity', 'jaccard_similarity_macro', 'jaccard_similarity_micro', 'jaccard_similarity_samples', 'jaccard_similarity_weighted', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score'] .. note:: From 0b507aa2756132c63670acf6539f5dd7370c6f94 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 13:41:31 +0530 Subject: [PATCH 62/88] use 'jaccard' instead of 'jaccard_similarity' --- doc/modules/model_evaluation.rst | 2 +- sklearn/metrics/scorer.py | 2 +- sklearn/metrics/tests/test_score_objects.py | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 21aa6c2a99da1..55d442c601baa 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -105,7 +105,7 @@ Usage examples: >>> model = svm.SVC() >>> cross_val_score(model, X, y, scoring='wrong_choice') Traceback (most recent call last): - ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard_similarity', 'jaccard_similarity_macro', 'jaccard_similarity_micro', 'jaccard_similarity_samples', 'jaccard_similarity_weighted', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score'] + ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score'] .. 
note:: diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 32ad695027030..c29ecb58f0edd 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -564,7 +564,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, for name, metric in [('precision', precision_score), ('recall', recall_score), ('f1', f1_score), - ('jaccard_similarity', jaccard_similarity_score)]: + ('jaccard', jaccard_similarity_score)]: SCORERS[name] = make_scorer(metric, average='binary') for average in ['macro', 'micro', 'samples', 'weighted']: qualified_name = '{0}_{1}'.format(name, average) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 03f7233aa09af..5b31d3e60edce 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -56,8 +56,8 @@ 'precision_weighted', 'precision_macro', 'precision_micro', 'recall', 'recall_weighted', 'recall_macro', 'recall_micro', 'neg_log_loss', 'log_loss', 'brier_score_loss', - 'jaccard_similarity', 'jaccard_similarity_weighted', - 'jaccard_similarity_macro', 'jaccard_similarity_micro'] + 'jaccard', 'jaccard_weighted', 'jaccard_macro', + 'jaccard_micro'] # All supervised cluster scorers (They behave like classification metric) CLUSTER_SCORERS = ["adjusted_rand_score", @@ -70,7 +70,7 @@ "fowlkes_mallows_score"] MULTILABEL_ONLY_SCORERS = ['precision_samples', 'recall_samples', 'f1_samples', - 'jaccard_similarity_samples'] + 'jaccard_samples'] def _make_estimators(X_train, y_train, y_ml_train): @@ -290,9 +290,8 @@ def test_classification_scores(): for prefix, metric in [('f1', f1_score), ('precision', precision_score), ('recall', recall_score), - ('jaccard_similarity', - partial(jaccard_similarity_score, - average='binary'))]: + ('jaccard', partial(jaccard_similarity_score, + average='binary'))]: score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, From 5bb690d0e2ef7ac5cd80339fbf183b46d2ca3a08 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 14:47:48 +0530 Subject: [PATCH 63/88] collect common validation code between prfs and jaccard --- sklearn/metrics/classification.py | 37 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 26a247475f9be..7ad81a1222f53 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -478,15 +478,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, ... 
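# Usage sketch for the shortened scorer names introduced here ('jaccard',
# 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted');
# it assumes a build where these strings are registered, e.g. this branch.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(random_state=0)
clf = LogisticRegression(solver='liblinear')
print(cross_val_score(clf, X, y, scoring='jaccard_weighted', cv=3))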
np.array([0, 1, 2, 3]), average='macro') 0.5 """ - - average_options = (None, 'micro', 'macro', 'weighted', 'samples') - if average not in average_options and average != 'binary': - raise ValueError("average has to be one of " + str(average_options)) - - # Compute accuracy for each possible representation - y_type, y_true, y_pred = _check_targets(y_true, y_pred) - check_consistent_length(y_true, y_pred, sample_weight) - present_labels = unique_labels(y_true, y_pred) + validate_average(average) + y_type, y_true, y_pred, present_labels = validate_input(y_true, y_pred, + sample_weight) if average == 'binary': if y_type == 'binary': @@ -1068,6 +1062,20 @@ def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): return result +def validate_average(average): + average_options = (None, 'micro', 'macro', 'weighted', 'samples') + if average not in average_options and average != 'binary': + raise ValueError('average has to be one of ' + + str(average_options)) + + +def validate_input(y_true, y_pred, sample_weight): + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + present_labels = unique_labels(y_true, y_pred) + return y_type, y_true, y_pred, present_labels + + def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', @@ -1211,16 +1219,13 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, array([2, 2, 2])) """ - average_options = (None, 'micro', 'macro', 'weighted', 'samples') - if average not in average_options and average != 'binary': - raise ValueError('average has to be one of ' + - str(average_options)) + validate_average(average) + if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") - y_type, y_true, y_pred = _check_targets(y_true, y_pred) - check_consistent_length(y_true, y_pred, sample_weight) - present_labels = unique_labels(y_true, y_pred) + y_type, y_true, y_pred, present_labels = validate_input(y_true, y_pred, + sample_weight) if average == 'binary': if y_type == 'binary': From f1e1b698a38b83aa7aa6d7c652f97a1a70594b07 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 17:00:59 +0530 Subject: [PATCH 64/88] update docstring and name --- sklearn/metrics/classification.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7ad81a1222f53..f4b1c315381c4 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -478,9 +478,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, ... np.array([0, 1, 2, 3]), average='macro') 0.5 """ - validate_average(average) - y_type, y_true, y_pred, present_labels = validate_input(y_true, y_pred, - sample_weight) + _validate_prfsj_average(average) + y_type, y_true, y_pred, present_labels = _validate_prfsj_input(y_true, + y_pred, sample_weight) if average == 'binary': if y_type == 'binary': @@ -1062,14 +1062,22 @@ def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): return result -def validate_average(average): +def _validate_prfsj_average(average): + """Validate ``average`` as a valid average option for + functions :func:`metrics.precision_recall_fscore_support` and + :func:`metrics.jaccard_similarity_score`. 
+ """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) -def validate_input(y_true, y_pred, sample_weight): +def _validate_prfsj_input(y_true, y_pred, sample_weight): + """Validate input for consistent length and type for functions + :func:`metrics.precision_recall_fscore_support` and + :func:`metrics.jaccard_similarity_score`. + """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) check_consistent_length(y_true, y_pred, sample_weight) present_labels = unique_labels(y_true, y_pred) @@ -1219,13 +1227,13 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, array([2, 2, 2])) """ - validate_average(average) + _validate_prfsj_average(average) if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") - y_type, y_true, y_pred, present_labels = validate_input(y_true, y_pred, - sample_weight) + y_type, y_true, y_pred, present_labels = _validate_prfsj_input(y_true, + y_pred, sample_weight) if average == 'binary': if y_type == 'binary': From 9606d52443595f1a518e31f745b262b7994a5646 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 30 Jan 2018 17:31:04 +0530 Subject: [PATCH 65/88] fix pep8 --- sklearn/metrics/classification.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index f4b1c315381c4..27bb2a56541ef 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -479,9 +479,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, 0.5 """ _validate_prfsj_average(average) - y_type, y_true, y_pred, present_labels = _validate_prfsj_input(y_true, - y_pred, sample_weight) - + y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, + sample_weight) if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: @@ -1073,7 +1072,7 @@ def _validate_prfsj_average(average): str(average_options)) -def _validate_prfsj_input(y_true, y_pred, sample_weight): +def _validate_input(y_true, y_pred, sample_weight): """Validate input for consistent length and type for functions :func:`metrics.precision_recall_fscore_support` and :func:`metrics.jaccard_similarity_score`. @@ -1228,13 +1227,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, """ _validate_prfsj_average(average) - if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") - - y_type, y_true, y_pred, present_labels = _validate_prfsj_input(y_true, - y_pred, sample_weight) - + y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, + sample_weight) if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: From e1d7e288e9b1b94007d822557abeee2393c6893f Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 31 Jan 2018 10:36:24 +0530 Subject: [PATCH 66/88] a little more refactoring --- sklearn/metrics/classification.py | 32 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 27bb2a56541ef..117fb410ff26b 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -513,14 +513,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "`average='samples'`, got `average='%s'`." 
% average) - if not np.all(labels == present_labels): - if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n, labels). ' - 'Got %d > %d' % - (np.max(labels), np.max(present_labels))) - if np.min(labels) < 0: - raise ValueError('All labels must be in [0, n, labels). ' - 'Got %d < 0' % np.min(labels)) + _validate_multilabels(labels, present_labels) if n_labels is not None: y_true = y_true[:, labels[:n_labels]] @@ -1083,6 +1076,18 @@ def _validate_input(y_true, y_pred, sample_weight): return y_type, y_true, y_pred, present_labels +def _validate_multilabels(labels, present_labels): + """All labels are index integers for multilabel.""" + if not np.all(labels == present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError('All labels must be in [0, n, labels). ' + 'Got %d > %d' % + (np.max(labels), np.max(present_labels))) + if np.min(labels) < 0: + raise ValueError('All labels must be in [0, n, labels). ' + 'Got %d < 0' % np.min(labels)) + + def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', @@ -1263,16 +1268,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, if y_type.startswith('multilabel'): sum_axis = 1 if average == 'samples' else 0 - # All labels are index integers for multilabel. - # Select labels: - if not np.all(labels == present_labels): - if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n labels). ' - 'Got %d > %d' % - (np.max(labels), np.max(present_labels))) - if np.min(labels) < 0: - raise ValueError('All labels must be in [0, n labels). ' - 'Got %d < 0' % np.min(labels)) + _validate_multilabels(labels, present_labels) if n_labels is not None: y_true = y_true[:, labels[:n_labels]] From a2a09da81b435ce02d6917633f9ca067d2268e31 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 31 Jan 2018 18:17:01 +0530 Subject: [PATCH 67/88] change answer for zeroed multiclass and binary averaging --- sklearn/metrics/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 117fb410ff26b..ad949ab42143d 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -485,7 +485,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if y_type == 'binary': if pos_label not in present_labels: if len(present_labels) < 2: - return 1. + return 0. 
else: raise ValueError("pos_label=%r is not a valid label: " "%r" % (pos_label, present_labels)) @@ -558,7 +558,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, with np.errstate(divide='ignore', invalid='ignore'): score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] = 1.0 + score[pred_or_true == 0.0] = 0.0 if average is not None: if not normalize: From c873dcecb49b49aff233a58b84b800fe7f7dcfde Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Tue, 6 Feb 2018 16:15:23 +0530 Subject: [PATCH 68/88] fix for edge cases --- sklearn/metrics/classification.py | 5 +++-- sklearn/metrics/tests/test_classification.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index ad949ab42143d..56e45291c061c 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -579,6 +579,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, y_true = le.transform(y_true) y_pred = le.transform(y_pred) + Labels = labels labels = le.transform(labels)[:n_labels] # use 'np.in1d' instead of 'np.isin' (unavailable in version < 1.13.0) indices = np.where(np.in1d(y_true, labels, assume_unique=False, @@ -607,12 +608,12 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(labels))[labels] + minlength=len(Labels))[labels] else: true_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(labels))[labels] + minlength=len(Labels))[labels] else: pred_sum = np.zeros(len(labels)) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index d12f17edbe7b5..0bd93ece7fa43 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1092,6 +1092,19 @@ def test_multiclass_jaccard_similarity_score(): def test_average_binary_jaccard_similarity_score(): + # tp=0, fp=0, fn=1, tn=0 + y_true = np.array([1]) + y_pred = np.array([0]) + assert_equal(jaccard_similarity_score(y_true, y_pred, + average='binary'), 0.) + # tp=0, fp=0, fn=0, tn=1 + y_true = np.array([0]) + y_pred = np.array([0]) + assert_equal(jaccard_similarity_score(y_true, y_pred, + average='binary'), 0.) + # tp=1, fp=0, fn=0, tn=0 (pos_label=0) + assert_equal(jaccard_similarity_score(y_true, y_pred, pos_label=0, + average='binary'), 1.) y_true = np.array([1, 0, 1, 1, 0]) y_pred = np.array([1, 0, 1, 1, 1]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, From 4978dfd19163330e0338eb2f13b6c1e8554a6478 Mon Sep 17 00:00:00 2001 From: Gaurav Dhingra Date: Wed, 7 Feb 2018 09:57:07 +0530 Subject: [PATCH 69/88] update (refactoring) function name and add doc example --- sklearn/metrics/classification.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 56e45291c061c..c19e723221b7b 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -447,6 +447,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, -------- accuracy_score, hamming_loss, zero_one_loss + Notes + ----- + :func:`jaccard_similarity_score` may be a poor metric if there are no + positives for some samples or classes. + References ---------- .. 
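# Standalone hand check (not library code) of the edge cases the new tests
# cover: with average='binary' the score is tp / (tp + fp + fn) for the
# chosen pos_label, and an empty union scores 0 after PATCH 67.
def binary_jaccard(y_true, y_pred, pos_label=1):
    pairs = list(zip(y_true, y_pred))
    tp = sum(t == pos_label and p == pos_label for t, p in pairs)
    fp = sum(t != pos_label and p == pos_label for t, p in pairs)
    fn = sum(t == pos_label and p != pos_label for t, p in pairs)
    union = tp + fp + fn
    return tp / union if union else 0.0

assert binary_jaccard([1], [0]) == 0.0               # tp=0, fn=1
assert binary_jaccard([0], [0]) == 0.0               # no positives at all
assert binary_jaccard([0], [0], pos_label=0) == 1.0  # tp=1
assert binary_jaccard([1, 0, 1, 1, 0], [1, 0, 1, 1, 1]) == 3 / 4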
[1] `Wikipedia entry for the Jaccard index @@ -472,13 +477,21 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average=None) array([ 0., 0., 1.]) + It may be a poor indicator if there are no positives for some samples + or classes: + + >>> jaccard_similarity_score(np.array([0]), np.array([0]), + ... average='binary') + 0.0 + In the multiclass case: >>> jaccard_similarity_score(np.array([0, 2, 1, 3]), ... np.array([0, 1, 2, 3]), average='macro') 0.5 + """ - _validate_prfsj_average(average) + _validate_set_wise_average(average) y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, sample_weight) if average == 'binary': @@ -1055,7 +1068,7 @@ def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): return result -def _validate_prfsj_average(average): +def _validate_set_wise_average(average): """Validate ``average`` as a valid average option for functions :func:`metrics.precision_recall_fscore_support` and :func:`metrics.jaccard_similarity_score`. @@ -1232,7 +1245,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, array([2, 2, 2])) """ - _validate_prfsj_average(average) + _validate_set_wise_average(average) if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, From 4fe8a1f94e1edaca0886cf2f1e34b04435f9e9a7 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 7 Nov 2018 00:22:36 +1100 Subject: [PATCH 70/88] Fix merge error --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index fa9441c6e2bc0..11946573654ce 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -905,7 +905,7 @@ def test_normalize_option_multiclass_classification(name): metrics = ALL_METRICS[name] measure = metrics(y_true, y_pred, normalize=True) assert_array_less(-1.0 * measure, 0, - msg="We failed to test correctly the normalize option") + err_msg="We failed to test correctly the normalize option") assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples, measure) From 2c9b3562bdb165552694857883290fd787779600 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 13 Nov 2018 12:59:35 +1100 Subject: [PATCH 71/88] WIP --- sklearn/metrics/classification.py | 165 +++++---------------------- sklearn/metrics/tests/test_common.py | 3 + 2 files changed, 33 insertions(+), 135 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 2a5dc763ca696..197db3af81db6 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -717,144 +717,39 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) + if average != 'samples' and normalize != 'true-if-samples': + raise ValueError("'normalize' is only meaningful with " + "`average='samples'`, got `average='%s'`." 
+ % average) - if labels is None: - labels = present_labels - n_labels = None - else: - n_labels = len(labels) - labels = np.hstack([labels, np.setdiff1d(present_labels, labels, - assume_unique=True)]) - - if y_type.startswith('multilabel'): - if average != 'samples' and normalize != 'true-if-samples': - raise ValueError("'normalize' is only meaningful with " - "`average='samples'`, got `average='%s'`." - % average) - - _validate_multilabels(labels, present_labels) - - if n_labels is not None: - y_true = y_true[:, labels[:n_labels]] - y_pred = y_pred[:, labels[:n_labels]] + samplewise = average == 'samples' + MCM = multilabel_confusion_matrix(y_true, y_pred, + sample_weight=sample_weight, + labels=labels, samplewise=samplewise) + numerator = MCM[:, 1, 1] + denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0] - class_weight = None - - if average == 'samples': - sum_axis = 1 - class_weight = sample_weight - weights = None - elif average == 'micro': - sum_axis = 1 - class_weight = None - weights = sample_weight - elif average == 'macro': - sum_axis = 0 - class_weight = None - weights = sample_weight - elif average == 'weighted': - sum_axis = 0 - weights = sample_weight - if sample_weight is None: - class_weight = y_true.toarray().sum(axis=0) - else: - class_weight = (y_true.toarray().T).dot(sample_weight) - if class_weight.sum() == 0: - return 0 - else: - # average=None - sum_axis = 0 - weights = sample_weight - - pred_or_true = count_nonzero(y_true + y_pred, axis=sum_axis, - sample_weight=weights) - pred_and_true = count_nonzero(y_true.multiply(y_pred), - axis=sum_axis, - sample_weight=weights) - if average == 'micro': - pred_or_true = np.array([pred_or_true.sum()]) - pred_and_true = np.array([pred_and_true.sum()]) - - with np.errstate(divide='ignore', invalid='ignore'): - score = pred_and_true / pred_or_true - score[pred_or_true == 0.0] = 0.0 - - if average is not None: - if not normalize: - if class_weight is not None: - score = np.dot(score, class_weight) - else: - score = score.sum() - else: - score = np.average(score, weights=class_weight) - return score - elif average == 'samples': - raise ValueError("Sample-based jaccard similarity score is " - "not meaningful outside multilabel " - "classification. 
See the accuracy_score instead.") + if average == 'micro': + numerator = np.array([numerator.sum()]) + denominator = np.array([denominator.sum()]) + + if not np.all(denominator): + # TODO: warn that 0 will be returned + denominator[denominator == 0] == 1 + + jaccard = numerator / denominator + if average is None: + return jaccard + if not normalize: + return (jaccard * (1 if sample_weight is None + else sample_weight)).sum() + if average == 'weighted': + weights = MCM[:, 1, 0] + MCM[:, 1, 1] + elif average == 'samples' and sample_weight is not None: + weights = sample_weight else: - le = LabelEncoder() - le.fit(labels) - y_true = le.transform(y_true) - y_pred = le.transform(y_pred) - - Labels = labels - labels = le.transform(labels)[:n_labels] - # use 'np.in1d' instead of 'np.isin' (unavailable in version < 1.13.0) - indices = np.where(np.in1d(y_true, labels, assume_unique=False, - invert=False) - + np.in1d(y_pred, labels, assume_unique=False, - invert=False))[0] - - y_true = y_true[indices] - y_pred = y_pred[indices] - tp = y_true == y_pred - tp_bins = y_true[tp] - if sample_weight is not None: - sample_weight = np.array(sample_weight)[indices] - tp_bins_weights = np.asarray(sample_weight)[tp] - else: - tp_bins_weights = None - - if len(tp_bins): - tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels))[labels] - true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(labels))[labels] - pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(labels))[labels] - else: - tp_sum = np.zeros(len(labels)) - if len(y_true): - true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(Labels))[labels] - else: - true_sum = np.zeros(len(labels)) - if len(y_pred): - pred_sum = np.bincount(y_pred, weights=sample_weight, - minlength=len(Labels))[labels] - else: - pred_sum = np.zeros(len(labels)) - - class_weight = None - if average == 'micro' or average == 'binary': - tp_sum = np.array([tp_sum.sum()]) - true_sum = np.array([true_sum.sum()]) - pred_sum = np.array([pred_sum.sum()]) - class_weight = None - elif average == 'macro': - class_weight = None - elif average == 'weighted': - class_weight = true_sum - if class_weight.sum() == 0: - return 0 - - with np.errstate(divide='ignore', invalid='ignore'): - score = tp_sum / (true_sum + pred_sum - tp_sum) - - if average is not None: - score = np.average(score, weights=class_weight) - return score + weights = None + return np.average(jaccard, weights=weights) def matthews_corrcoef(y_true, y_pred, sample_weight=None): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 11946573654ce..8895d05a679f5 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -165,6 +165,8 @@ "samples_f2_score": partial(fbeta_score, average="samples", beta=2), "samples_precision_score": partial(precision_score, average="samples"), "samples_recall_score": partial(recall_score, average="samples"), + "samples_jaccard_similarity_score": + partial(jaccard_similarity_score, average="macro"), "cohen_kappa_score": cohen_kappa_score, @@ -413,6 +415,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", + "samples_jaccard_similarity_score", } # Regression metrics with "multioutput-continuous" format support From 0e9e12d5d84665eb78ecd2bbe862210299579917 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 18:00:18 
+1100 Subject: [PATCH 72/88] Make tests pass --- sklearn/metrics/classification.py | 8 ++++++-- sklearn/metrics/tests/test_classification.py | 12 ++++++------ sklearn/metrics/tests/test_common.py | 7 +++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 197db3af81db6..36d10182ba9a4 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -638,7 +638,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, normalize : bool, optional (default=True) If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard - similarity coefficient. ``normalize`` is only meaningful when + similarity coefficient. ``normalize`` is only applicable when ``average='samples'``. sample_weight : array-like of shape = [n_samples], optional @@ -735,9 +735,10 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if not np.all(denominator): # TODO: warn that 0 will be returned - denominator[denominator == 0] == 1 + denominator[denominator == 0] = 1 jaccard = numerator / denominator + print(numerator, denominator) if average is None: return jaccard if not normalize: @@ -745,6 +746,9 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, else sample_weight)).sum() if average == 'weighted': weights = MCM[:, 1, 0] + MCM[:, 1, 1] + if not np.any(weights): + # numerator is 0, and warning should have already been issued + weights = None elif average == 'samples' and sample_weight is not None: weights = sample_weight else: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 2a7051c21398c..bbeed9f08e877 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1151,8 +1151,8 @@ def test_jaccard_similarity_score(): "another average setting.") assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, y_pred, average='binary') - msg3 = ("Sample-based jaccard similarity score is not meaningful outside " - "multilabel classification. See the accuracy_score instead.") + msg3 = ("Samplewise metrics are not available outside of multilabel " + "classification.") assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, average='samples') assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, @@ -1221,10 +1221,10 @@ def test_multilabel_jaccard_similarity_score(): y_pred, average='macro', normalize=False) assert_raise_message(ValueError, msg1, jaccard_similarity_score, y_true, y_pred, average='macro', normalize=True) - msg2 = 'All labels must be in [0, n, labels). Got 4 > 2' + msg2 = 'Got 4 > 2' assert_raise_message(ValueError, msg2, jaccard_similarity_score, y_true, y_pred, labels=[4]) - msg3 = 'All labels must be in [0, n, labels). Got -1 < 0' + msg3 = 'Got -1 < 0' assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, labels=[-1]) @@ -1253,8 +1253,8 @@ def test_multiclass_jaccard_similarity_score(): bin_jaccard_similarity_score(average=average, labels=b_label)) - y_true = np.array([]) - y_pred = np.array([]) + y_true = np.array([[0, 0], [0, 0], [0, 0]]) + y_pred = np.array([[0, 0], [0, 0], [0, 0]]) assert_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), 0.) 
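# Hand check of the multilabel_confusion_matrix formulation adopted in
# PATCH 71/72, using the multilabel example from the docstring; the results
# agree with the documented outputs (per-class [0, 0, 1], weighted 0.5,
# micro and samples 1/3). Assumes multilabel_confusion_matrix is available,
# as it is on this branch.
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

y_true = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
y_pred = np.array([[0, 1, 1], [1, 1, 1], [0, 0, 1]])

MCM = multilabel_confusion_matrix(y_true, y_pred)
tp, fp, fn = MCM[:, 1, 1], MCM[:, 0, 1], MCM[:, 1, 0]
per_class = tp / (tp + fp + fn)                # array([0., 0., 1.])
print(per_class.mean())                        # macro average: 0.333...
print(np.average(per_class, weights=tp + fn))  # weighted by support: 0.5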
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 8895d05a679f5..44249cb7c0581 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -433,6 +433,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "zero_one_loss", "unnormalized_zero_one_loss", "micro_jaccard_similarity_score", "macro_jaccard_similarity_score", + "binary_jaccard_similarity_score", + "samples_jaccard_similarity_score", "f1_score", "micro_f1_score", "macro_f1_score", "weighted_recall_score", @@ -440,8 +442,6 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", - "binary_jaccard_similarity_score", - "matthews_corrcoef_score", "mean_absolute_error", "mean_squared_error", "median_absolute_error", "max_error", @@ -888,8 +888,7 @@ def test_normalize_option_binary_classification(name): continue metrics = ALL_METRICS[name] measure = metrics(y_true, y_pred, normalize=True) - assert_greater(measure, 0, - msg="We failed to test correctly the normalize option") + assert measure > 0, "We failed to test correctly the normalize option" assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples, measure) From 95dfadab6550bc379dc721cdfe0f544233e03d1b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 20:25:17 +1100 Subject: [PATCH 73/88] Credit in what's new --- doc/whats_new/v0.21.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 08d2c904e7a5a..2162ebe030850 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -79,9 +79,10 @@ Support for Python 3.4 and below has been officially dropped. - |Feature| |Fix| :func:`metrics.jaccard_similarity_score` now accepts ``average`` argument like :func:`metrics.precision_recall_fscore_support` as - a naively set-wise measure applying only to binary, multilabel targets and it - binarizes multiclass input and treats them like the corresponding multilabel - problem. :issue:`10083` by :user:`Gaurav Dhingra `. + a naively set-wise measure applying only to binary, multilabel targets. It + now binarizes multiclass input and treats them like the corresponding + multilabel problem. + :issue:`10083` by :user:`Gaurav Dhingra ` and `Joel Nothman`_. :mod:`sklearn.neighbors` ........................ From 99fdd5cdfa9b46d6bb9148d1a3146c9a248f9f5b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 20:29:22 +1100 Subject: [PATCH 74/88] Clean merge error in what's new --- doc/whats_new/v0.20.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 348f1a1b7f137..934b3d63f45e0 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -1060,20 +1060,8 @@ Linear, kernelized and related models - |API| :class:`pipeline.FeatureUnion` now supports ``'drop'`` as a transformer to drop features. :issue:`11144` by :user:`Thomas Fan `. - -<<<<<<< HEAD -- Fixed a bug in :func:`metrics.jaccard_similarity_score`, to disallow - sample-wise averaging of multiclass input, since it is redundantly equal to - :func:`metrics.accuracy_score`. - :issue:`10083` by :user:`Gaurav Dhingra `. - -- Fixed a bug in :func:`metrics.precision_precision_recall_fscore_support` - when truncated `range(n_labels)` is passed as value for `labels`. - :issue:`10377` by :user:`Gaurav Dhingra `. 
-======= :mod:`sklearn.preprocessing` ............................ ->>>>>>> master - |MajorFeature| Expanded :class:`preprocessing.OneHotEncoder` to allow to encode categorical string features as a numeric array using a one-hot (or From 80520e9beb1181ee0a5e51181b0bf6ba973fde37 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 20:30:12 +1100 Subject: [PATCH 75/88] Remove debug print --- sklearn/metrics/classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 36d10182ba9a4..3f4e465a24399 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -738,7 +738,6 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, denominator[denominator == 0] = 1 jaccard = numerator / denominator - print(numerator, denominator) if average is None: return jaccard if not normalize: From 4ba98bcf71deff3617775c51739bbfa7b845c627 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 20:30:53 +1100 Subject: [PATCH 76/88] PEP8 --- sklearn/metrics/tests/test_common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 44249cb7c0581..cdb2d2caa8e0d 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -907,7 +907,8 @@ def test_normalize_option_multiclass_classification(name): metrics = ALL_METRICS[name] measure = metrics(y_true, y_pred, normalize=True) assert_array_less(-1.0 * measure, 0, - err_msg="We failed to test correctly the normalize option") + err_msg="We failed to test correctly the normalize " + "option") assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples, measure) From 5b5f04c49c834e30eca37ebb8e5c00e084db6fac Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 18 Nov 2018 22:14:38 +1100 Subject: [PATCH 77/88] new array printing format --- sklearn/metrics/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 3f4e465a24399..527a973103843 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -681,7 +681,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average='weighted') 0.5 >>> jaccard_similarity_score(y_true, y_pred, average=None) - array([ 0., 0., 1.]) + array([0., 0., 1.]) It may be a poor indicator if there are no positives for some samples or classes: From dfe58f4330314ccbe7d80289324bc224f31616dc Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 19 Nov 2018 08:49:55 +1100 Subject: [PATCH 78/88] new array printing format #2 --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 158ee695fdb0f..efbd11e026558 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -704,7 +704,7 @@ multilabel problem: :: >>> jaccard_similarity_score(y_true, y_pred, average='micro') 0.33... >>> jaccard_similarity_score(y_true, y_pred, average=None) - array([ 1., 0., 0., 1.]) + array([1., 0., 0., 1.]) .. 
_precision_recall_f_measure_metrics: From 7422982f9b18f3a10389702552d541b5e6cf1c63 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 19 Nov 2018 10:39:20 +1100 Subject: [PATCH 79/88] Revert changes to v0.20.rst --- doc/whats_new/v0.20.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 934b3d63f45e0..165f200d0c848 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -65,6 +65,9 @@ Changelog location in :func:`datasets.fetch_olivetti_faces`. :issue:`12441` by :user:`Jérémie du Boisberranger ` +- |Fix| :func:`datasets.fetch_openml` to retry downloading when reading + from local cache fails. :issue:`12517` by :user:`Thomas Fan `. + :mod:`sklearn.decomposition` ............................ @@ -948,8 +951,6 @@ Support for Python 3.3 has been officially dropped. of parameters in the parameter grid. ``n_iter`` now acts as an upper bound on iterations. :issue:`10982` by :user:`Juliet Lawton ` -Linear, kernelized and related models -======= - |API| Invalid input for :class:`model_selection.ParameterGrid` now raises TypeError. :issue:`10928` by :user:`Solutus Immensus ` @@ -1060,6 +1061,7 @@ Linear, kernelized and related models - |API| :class:`pipeline.FeatureUnion` now supports ``'drop'`` as a transformer to drop features. :issue:`11144` by :user:`Thomas Fan `. + :mod:`sklearn.preprocessing` ............................ From afa7759832b326c646523ad13767bd35ab2e4616 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sat, 24 Nov 2018 23:48:37 +1100 Subject: [PATCH 80/88] Remove changes due to bad merge --- sklearn/metrics/tests/test_common.py | 45 +++++++++++++++------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index cdb2d2caa8e0d..2f7fa9d1f47e2 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -502,9 +502,12 @@ def test_symmetry(): for name in SYMMETRIC_METRICS: metric = ALL_METRICS[name] if name in METRIC_UNDEFINED_BINARY: - assert_allclose(metric(y_true_bin, y_pred_bin), - metric(y_pred_bin, y_true_bin), - err_msg="%s is not symmetric" % name) + if name in MULTILABELS_METRICS: + assert_allclose(metric(y_true_bin, y_pred_bin), + metric(y_pred_bin, y_true_bin), + err_msg="%s is not symmetric" % name) + else: + assert False, "This case is currently unhandled" else: assert_allclose(metric(y_true, y_pred), metric(y_pred, y_true), @@ -877,40 +880,40 @@ def test_raise_value_error_multilabel_sequences(name): @pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) def test_normalize_option_binary_classification(name): + if name in METRIC_UNDEFINED_BINARY: + return # Test in the binary case n_samples = 20 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) - for name in METRICS_WITH_NORMALIZE_OPTION: - if name in METRIC_UNDEFINED_BINARY: - continue - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert measure > 0, "We failed to test correctly the normalize option" - assert_allclose(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_array_less(-1.0 * measure, 0, + err_msg="We failed to test correctly the normalize " + "option") + assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples, + measure) 
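# The invariant these normalize-option tests assert, shown standalone with
# accuracy_score (any metric exposing ``normalize`` behaves the same way):
# the unnormalized score is the normalized score times n_samples.
from sklearn.metrics import accuracy_score

y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
assert accuracy_score(y_true, y_pred) == 0.75
assert accuracy_score(y_true, y_pred, normalize=False) == 0.75 * len(y_true)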
@pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) def test_normalize_option_multiclass_classification(name): + if name in METRIC_UNDEFINED_MULTICLASS: + return # Test in the multiclass case random_state = check_random_state(0) y_true = random_state.randint(0, 4, size=(20, )) y_pred = random_state.randint(0, 4, size=(20, )) n_samples = y_true.shape[0] - for name in METRICS_WITH_NORMALIZE_OPTION: - if name in METRIC_UNDEFINED_MULTICLASS: - continue - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert_array_less(-1.0 * measure, 0, - err_msg="We failed to test correctly the normalize " - "option") - assert_allclose(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_array_less(-1.0 * measure, 0, + err_msg="We failed to test correctly the normalize " + "option") + assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples, + measure) def test_normalize_option_multilabel_classification(): From 55b1e8333d7a3ff370734937f704386db4d4dfe6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sat, 24 Nov 2018 23:57:24 +1100 Subject: [PATCH 81/88] Avoid assert_equal --- sklearn/metrics/tests/test_classification.py | 31 ++++++++------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index bbeed9f08e877..f36b6abbc7dde 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1129,11 +1129,9 @@ def test_multilabel_hamming_loss(): assert_equal(hamming_loss(y1[0], y2[0]), sp_hamming(y1[0], y2[0])) -def test_jaccard_similarity_score(): +def test_jaccard_similarity_score_validation(): y_true = np.array([0, 1, 0, 1, 1]) y_pred = np.array([0, 1, 0, 1, 1]) - assert_equal(jaccard_similarity_score(y_true, y_pred, average='binary', - pos_label=0), 1.) assert_raise_message(ValueError, "pos_label=2 is not a valid label: " "array([0, 1])", jaccard_similarity_score, y_true, y_pred, average='binary', pos_label=2) @@ -1174,13 +1172,13 @@ def test_multilabel_jaccard_similarity_score(): # size(y1 \inter y2) = [1, 2] # size(y1 \union y2) = [2, 2] - assert_equal(jaccard_similarity_score(y1, y2), 0.75) - assert_equal(jaccard_similarity_score(y1, y1), 1) - assert_equal(jaccard_similarity_score(y2, y2), 1) - assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0) - assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0) - assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0) - assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0) + assert jaccard_similarity_score(y1, y2) == 0.75 + assert jaccard_similarity_score(y1, y1) == 1 + assert jaccard_similarity_score(y2, y2) == 1 + assert jaccard_similarity_score(y2, np.logical_not(y2)) == 0 + assert jaccard_similarity_score(y1, np.logical_not(y1)) == 0 + assert jaccard_similarity_score(y1, np.zeros(y1.shape)) == 0 + assert jaccard_similarity_score(y2, np.zeros(y1.shape)) == 0 y_true = np.array([[0, 1, 1], [1, 0, 0]]) y_pred = np.array([[1, 1, 1], [1, 0, 1]]) @@ -1255,24 +1253,21 @@ def test_multiclass_jaccard_similarity_score(): y_true = np.array([[0, 0], [0, 0], [0, 0]]) y_pred = np.array([[0, 0], [0, 0], [0, 0]]) - assert_equal(jaccard_similarity_score(y_true, y_pred, average='weighted'), - 0.) 
+ assert jaccard_similarity_score(y_true, y_pred, average='weighted') == 0 def test_average_binary_jaccard_similarity_score(): # tp=0, fp=0, fn=1, tn=0 y_true = np.array([1]) y_pred = np.array([0]) - assert_equal(jaccard_similarity_score(y_true, y_pred, - average='binary'), 0.) + assert jaccard_similarity_score(y_true, y_pred, average='binary') == 0. # tp=0, fp=0, fn=0, tn=1 y_true = np.array([0]) y_pred = np.array([0]) - assert_equal(jaccard_similarity_score(y_true, y_pred, - average='binary'), 0.) + assert jaccard_similarity_score(y_true, y_pred, average='binary') == 0. # tp=1, fp=0, fn=0, tn=0 (pos_label=0) - assert_equal(jaccard_similarity_score(y_true, y_pred, pos_label=0, - average='binary'), 1.) + assert jaccard_similarity_score(y_true, y_pred, pos_label=0, + average='binary') == 1. y_true = np.array([1, 0, 1, 1, 0]) y_pred = np.array([1, 0, 1, 1, 1]) assert_almost_equal(jaccard_similarity_score(y_true, y_pred, From 6b71c18aeebe513eced3467229ff758042947397 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 25 Nov 2018 00:08:06 +1100 Subject: [PATCH 82/88] cosmit --- sklearn/metrics/classification.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index cc8a9326af6b9..945a657d7cac1 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -646,7 +646,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Returns ------- - score: float (if average is not None) or array of floats, shape =\ + score : float (if average is not None) or array of floats, shape =\ [n_unique_labels] See also @@ -665,14 +665,13 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, Examples -------- - >>> import numpy as np >>> from sklearn.metrics import jaccard_similarity_score - In the multilabel case with binary label indicators: + In the multilabel case: >>> y_true = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]]) >>> y_pred = np.array([[0, 1, 1], [1, 1, 1], [0, 0, 1]]) - >>> jaccard_similarity_score(y_true, y_pred) + >>> jaccard_similarity_score(y_true, y_pred, average='samples') ... # doctest: +ELLIPSIS 0.33... >>> jaccard_similarity_score(y_true, y_pred, average='micro') @@ -683,18 +682,11 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, >>> jaccard_similarity_score(y_true, y_pred, average=None) array([0., 0., 1.]) - It may be a poor indicator if there are no positives for some samples - or classes: - - >>> jaccard_similarity_score(np.array([0]), np.array([0]), - ... average='binary') - 0.0 - In the multiclass case: - >>> jaccard_similarity_score(np.array([0, 2, 1, 3]), - ... np.array([0, 1, 2, 3]), average='macro') - 0.5 + >>> jaccard_similarity_score(np.array([0, 1, 2, 3]), + ... 
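# Bare-assert equivalents of the removed helpers; pytest.approx is shown
# here as one possible stand-in for assert_almost_equal, not as what the
# patch itself uses.
from pytest import approx

assert 3 / 4 == 0.75                 # replaces assert_equal(..., 0.75)
assert 2 / 3 == approx(0.666666667)  # replaces assert_almost_equal(..., 2. / 3)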
np.array([0, 2, 2, 3]), average='macro') + 0.625 """ _validate_set_wise_average(average) @@ -741,8 +733,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, if average is None: return jaccard if not normalize: - return (jaccard * (1 if sample_weight is None - else sample_weight)).sum() + return np.sum(jaccard * (1 if sample_weight is None + else sample_weight)) if average == 'weighted': weights = MCM[:, 1, 0] + MCM[:, 1, 1] if not np.any(weights): From 03d89de2375fde343c31076eaf5c4a360e9942c0 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 25 Nov 2018 00:22:05 +1100 Subject: [PATCH 83/88] Clean up validation --- sklearn/metrics/classification.py | 99 +++++++++++-------------------- 1 file changed, 33 insertions(+), 66 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 945a657d7cac1..4ba62a7cb06cc 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -689,31 +689,14 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, 0.625 """ - _validate_set_wise_average(average) - y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, - sample_weight) - if average == 'binary': - if y_type == 'binary': - if pos_label not in present_labels: - if len(present_labels) < 2: - return 0. - else: - raise ValueError("pos_label=%r is not a valid label: " - "%r" % (pos_label, present_labels)) - labels = [pos_label] - else: - raise ValueError("Target is %s but average='binary'. Please " - "choose another average setting." % y_type) - elif pos_label not in (None, 1): - warnings.warn("Note that pos_label (set to %r) is ignored when " - "average != 'binary' (got %r). You may use " - "labels=[pos_label] to specify a single positive class." - % (pos_label, average), UserWarning) if average != 'samples' and normalize != 'true-if-samples': raise ValueError("'normalize' is only meaningful with " "`average='samples'`, got `average='%s'`." % average) - + labels = _check_set_wise_labels(y_true, y_pred, average, labels, + pos_label) + if labels is _ALL_ZERO: + return 0. samplewise = average == 'samples' MCM = multilabel_confusion_matrix(y_true, y_pred, sample_weight=sample_weight, @@ -1173,38 +1156,39 @@ def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): return result -def _validate_set_wise_average(average): - """Validate ``average`` as a valid average option for - functions :func:`metrics.precision_recall_fscore_support` and - :func:`metrics.jaccard_similarity_score`. +_ALL_ZERO = object() # sentinel for special, degenerate case + + +def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): + """Validation associated with set-wise metrics + + Returns identified labels or _ALL_ZERO sentinel """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) - -def _validate_input(y_true, y_pred, sample_weight): - """Validate input for consistent length and type for functions - :func:`metrics.precision_recall_fscore_support` and - :func:`metrics.jaccard_similarity_score`. 
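# Hand check of the multiclass docstring example just above: each class is
# treated as its own binary problem and the per-class Jaccard scores are
# macro-averaged (plain numpy, independent of the branch).
import numpy as np

y_true, y_pred = np.array([0, 1, 2, 3]), np.array([0, 2, 2, 3])
scores = []
for c in np.unique(np.concatenate([y_true, y_pred])):
    t, p = y_true == c, y_pred == c
    scores.append((t & p).sum() / (t | p).sum())
print(scores)           # [1.0, 0.0, 0.5, 1.0]
print(np.mean(scores))  # 0.625, matching the docstring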
- """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) - check_consistent_length(y_true, y_pred, sample_weight) present_labels = unique_labels(y_true, y_pred) - return y_type, y_true, y_pred, present_labels - - -def _validate_multilabels(labels, present_labels): - """All labels are index integers for multilabel.""" - if not np.all(labels == present_labels): - if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n, labels). ' - 'Got %d > %d' % - (np.max(labels), np.max(present_labels))) - if np.min(labels) < 0: - raise ValueError('All labels must be in [0, n, labels). ' - 'Got %d < 0' % np.min(labels)) + if average == 'binary': + if y_type == 'binary': + if pos_label not in present_labels: + if len(present_labels) < 2: + return _ALL_ZERO + else: + raise ValueError("pos_label=%r is not a valid label: " + "%r" % (pos_label, present_labels)) + labels = [pos_label] + else: + raise ValueError("Target is %s but average='binary'. Please " + "choose another average setting." % y_type) + elif pos_label not in (None, 1): + warnings.warn("Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." + % (pos_label, average), UserWarning) + return labels def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, @@ -1349,29 +1333,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, array([2, 2, 2])) """ - _validate_set_wise_average(average) if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") - y_type, y_true, y_pred, present_labels = _validate_input(y_true, y_pred, - sample_weight) - if average == 'binary': - if y_type == 'binary': - if pos_label not in present_labels: - if len(present_labels) < 2: - # Only negative labels - return (0., 0., 0., 0) - else: - raise ValueError("pos_label=%r is not a valid label: %r" % - (pos_label, present_labels)) - labels = [pos_label] - else: - raise ValueError("Target is %s but average='binary'. Please " - "choose another average setting." % y_type) - elif pos_label not in (None, 1): - warnings.warn("Note that pos_label (set to %r) is ignored when " - "average != 'binary' (got %r). You may use " - "labels=[pos_label] to specify a single positive class." 
- % (pos_label, average), UserWarning) + labels = _check_set_wise_labels(y_true, y_pred, average, labels, + pos_label) + if labels is _ALL_ZERO: + return (0., 0., 0., 0) # Calculate tp_sum, pred_sum, true_sum ### samplewise = average == 'samples' From 46c127489f8e3218a88f08e526d3f9c281c32d2a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 25 Nov 2018 00:26:29 +1100 Subject: [PATCH 84/88] reuse warning code --- sklearn/metrics/classification.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 4ba62a7cb06cc..0dadbc013ae7a 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -708,11 +708,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, numerator = np.array([numerator.sum()]) denominator = np.array([denominator.sum()]) - if not np.all(denominator): - # TODO: warn that 0 will be returned - denominator[denominator == 0] = 1 - - jaccard = numerator / denominator + jaccard = _prf_divide(numerator, denominator, 'jaccard', + 'true or predicted', average, ('jaccard',)) if average is None: return jaccard if not normalize: From e082e62d18a1eb26cf34881ac3da4615727b799f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 2 Jan 2019 23:12:08 +1100 Subject: [PATCH 85/88] WIP --- sklearn/metrics/classification.py | 2 + sklearn/metrics/tests/test_classification.py | 44 +++++++++++++++----- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 0dadbc013ae7a..10fbac76907b4 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -689,6 +689,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, 0.625 """ + print('!', y_true) if average != 'samples' and normalize != 'true-if-samples': raise ValueError("'normalize' is only meaningful with " "`average='samples'`, got `average='%s'`." 
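For the Jaccard score, the numerator handed to `_prf_divide` above is per-class tp and the denominator is tp + fp + fn (the size of the union). A NumPy-only sketch, ours rather than library code, reproduces the 0.625 macro figure from the multiclass docstring example:

import numpy as np

y_true = np.array([0, 1, 2, 3])
y_pred = np.array([0, 2, 2, 3])
labels = np.unique(np.concatenate([y_true, y_pred]))

tp = np.array([np.sum((y_true == c) & (y_pred == c)) for c in labels])
fp = np.array([np.sum((y_true != c) & (y_pred == c)) for c in labels])
fn = np.array([np.sum((y_true == c) & (y_pred != c)) for c in labels])

numerator = tp
denominator = tp + fp + fn              # no zeros here; the real code guards them
per_class = numerator / denominator
print(per_class)                        # [1.  0.  0.5 1. ]
print(per_class.mean())                 # 0.625 -> average='macro'
print(tp.sum() / denominator.sum())     # 0.6   -> average='micro'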
@@ -708,6 +709,7 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, numerator = np.array([numerator.sum()]) denominator = np.array([denominator.sum()]) + print(locals()) jaccard = _prf_divide(numerator, denominator, 'jaccard', 'true or predicted', average, ('jaccard',)) if average is None: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index f36b6abbc7dde..ee139f8f34261 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1164,7 +1164,7 @@ def test_jaccard_similarity_score_validation(): average='micro', pos_label=3) -def test_multilabel_jaccard_similarity_score(): +def test_multilabel_jaccard_similarity_score(recwarn): # Dense label indicator matrix format y1 = np.array([[0, 1, 1], [1, 0, 1]]) y2 = np.array([[0, 0, 1], [1, 0, 1]]) @@ -1226,8 +1226,26 @@ def test_multilabel_jaccard_similarity_score(): assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, labels=[-1]) + msg = ('Jaccard is ill-defined and being set to 0.0 in labels ' + 'with no true or predicted samples.') + assert assert_warns_message(UndefinedMetricWarning, msg, + jaccard_similarity_score, + np.array([[0, 1]]), + np.array([[0, 1]]), + average='macro') == 0.5 -def test_multiclass_jaccard_similarity_score(): + msg = ('Jaccard is ill-defined and being set to 0.0 in samples ' + 'with no true or predicted labels.') + assert assert_warns_message(UndefinedMetricWarning, msg, + jaccard_similarity_score, + np.transpose([[0, 1]]), + np.transpose([[0, 1]]), + average='samples') == 0.5 + + assert not list(recwarn) + + +def test_multiclass_jaccard_similarity_score(recwarn): y_true = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat', 'bird', 'bird'] y_pred = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird', 'bird', 'cat'] labels = ['ant', 'bird', 'cat'] @@ -1255,18 +1273,22 @@ def test_multiclass_jaccard_similarity_score(): y_pred = np.array([[0, 0], [0, 0], [0, 0]]) assert jaccard_similarity_score(y_true, y_pred, average='weighted') == 0 + assert not list(recwarn) -def test_average_binary_jaccard_similarity_score(): + +def test_average_binary_jaccard_similarity_score(recwarn): # tp=0, fp=0, fn=1, tn=0 - y_true = np.array([1]) - y_pred = np.array([0]) - assert jaccard_similarity_score(y_true, y_pred, average='binary') == 0. + assert jaccard_similarity_score([1], [0], average='binary') == 0. # tp=0, fp=0, fn=0, tn=1 - y_true = np.array([0]) - y_pred = np.array([0]) - assert jaccard_similarity_score(y_true, y_pred, average='binary') == 0. + msg = ('Jaccard is ill-defined and ' + 'being set to 0.0 in labels with no predicted samples.') + assert assert_warns_message(UndefinedMetricWarning, + msg, + jaccard_similarity_score, + [0], [0], + average='binary') == 0. # tp=1, fp=0, fn=0, tn=0 (pos_label=0) - assert jaccard_similarity_score(y_true, y_pred, pos_label=0, + assert jaccard_similarity_score([0], [0], pos_label=0, average='binary') == 1. y_true = np.array([1, 0, 1, 1, 0]) y_pred = np.array([1, 0, 1, 1, 1]) @@ -1276,6 +1298,8 @@ def test_average_binary_jaccard_similarity_score(): average='binary', pos_label=0), 1. 
/ 2) + assert not list(recwarn) + @ignore_warnings def test_precision_recall_f1_score_multilabel_1(): From 28dcca431cad374eba01d01fa10eff639bd1059b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 5 Feb 2019 08:27:03 +1100 Subject: [PATCH 86/88] Clean what's new --- doc/whats_new/v0.21.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 4e5267669372a..99e81bec66963 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -167,8 +167,6 @@ Support for Python 3.4 and below has been officially dropped. multilabel problem. :issue:`10083` by :user:`Gaurav Dhingra ` and `Joel Nothman`_. -:mod:`sklearn.model_selection` -...................... - |Enhancement| Use label `accuracy` instead of `micro-average` on :func:`metrics.classification_report` to avoid confusion. `micro-average` is only shown for multi-label or multi-class with a subset of classes because @@ -176,15 +174,14 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12334` by :user:`Emmanuel Arias `, `Joel Nothman`_ and `Andreas Müller`_ +- |Fix| The metric :func:`metrics.r2_score` is degenerate with a single sample + and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`. + :issue:`12855` by :user:`Pawel Sendyk .` + - |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated in version 0.21 and will be removed in version 0.23. :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra Mitrovic `. ->>>>>>> master - -- |Fix| The metric :func:`metrics.r2_score` is degenerate with a single sample - and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`. - :issue:`12855` by :user:`Pawel Sendyk .` :mod:`sklearn.model_selection` .............................. From 27cf502d291a6a5bc3605f92660ae58e502c9e29 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 5 Feb 2019 23:40:18 +1100 Subject: [PATCH 87/88] FIX coax tests to pass --- sklearn/metrics/classification.py | 6 +++++- sklearn/metrics/tests/test_classification.py | 14 ++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 428250236c1cc..14d520a8b85de 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -696,6 +696,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) if labels is _ALL_ZERO: + warnings.warn('Jaccard is ill-defined and being set to 0.0 with no ' + 'true or predicted samples', UndefinedMetricWarning) return 0. samplewise = average == 'samples' MCM = multilabel_confusion_matrix(y_true, y_pred, @@ -1119,8 +1121,10 @@ def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): The metric, modifier and average arguments are used only for determining an appropriate warning. 
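The `_prf_divide` hunk that follows replaces the bare division with a masked copy of the denominator. A standalone sketch of that guard, with made-up numbers:

import numpy as np

numerator = np.array([2.0, 0.0, 3.0])
denominator = np.array([4.0, 0.0, 3.0])

mask = denominator == 0.0
safe = denominator.copy()       # never mutate the caller's array
safe[mask] = 1.0                # a bare 0/0 would give nan and a RuntimeWarning
result = numerator / safe       # the zero-denominator entry becomes 0/1 == 0.0
print(result)                   # [0.5 0.  1. ]

In the metric itself the numerator is zero wherever the denominator is, so the masked entries are exactly the ill-defined scores that `_prf_divide` then reports with UndefinedMetricWarning.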
""" - result = numerator / denominator mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 + result = numerator / denominator if not np.any(mask): return result diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 145bbde7665fc..24c69f86cf7bd 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1246,8 +1246,8 @@ def test_multilabel_jaccard_similarity_score(recwarn): 'with no true or predicted labels.') assert assert_warns_message(UndefinedMetricWarning, msg, jaccard_similarity_score, - np.transpose([[0, 1]]), - np.transpose([[0, 1]]), + np.array([[0, 0], [1, 1]]), + np.array([[0, 0], [1, 1]]), average='samples') == 0.5 assert not list(recwarn) @@ -1279,7 +1279,9 @@ def test_multiclass_jaccard_similarity_score(recwarn): y_true = np.array([[0, 0], [0, 0], [0, 0]]) y_pred = np.array([[0, 0], [0, 0], [0, 0]]) - assert jaccard_similarity_score(y_true, y_pred, average='weighted') == 0 + with ignore_warnings(): + assert (jaccard_similarity_score(y_true, y_pred, average='weighted') + == 0) assert not list(recwarn) @@ -1288,12 +1290,12 @@ def test_average_binary_jaccard_similarity_score(recwarn): # tp=0, fp=0, fn=1, tn=0 assert jaccard_similarity_score([1], [0], average='binary') == 0. # tp=0, fp=0, fn=0, tn=1 - msg = ('Jaccard is ill-defined and ' - 'being set to 0.0 in labels with no predicted samples.') + msg = ('Jaccard is ill-defined and being set to 0.0 with ' + 'no true or predicted samples') assert assert_warns_message(UndefinedMetricWarning, msg, jaccard_similarity_score, - [0], [0], + [0, 0], [0, 0], average='binary') == 0. # tp=1, fp=0, fn=0, tn=0 (pos_label=0) assert jaccard_similarity_score([0], [0], pos_label=0, From 47776c0c4186986e7774d0043be1770eabfae495 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Feb 2019 10:13:49 +1100 Subject: [PATCH 88/88] Address Adrin's comments --- sklearn/metrics/classification.py | 3 ++- sklearn/metrics/tests/test_classification.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index a58cbf25a8af9..5ae254b6028f7 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -638,7 +638,8 @@ def jaccard_similarity_score(y_true, y_pred, labels=None, pos_label=1, If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard similarity coefficient. ``normalize`` is only applicable when - ``average='samples'``. + ``average='samples'``. The default value 'true-if-samples' behaves like + True, but does not raise an error with other values of `average`. sample_weight : array-like of shape = [n_samples], optional Sample weights. diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 3dcb494fee1e8..c28236e8bf7f2 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1160,8 +1160,6 @@ def test_jaccard_similarity_score_validation(): "classification.") assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, y_pred, average='samples') - assert_raise_message(ValueError, msg3, jaccard_similarity_score, y_true, - y_pred, average='samples', normalize=False) assert_warns_message(UserWarning, "Note that pos_label (set to 3) is ignored when "