From 4f1d87469c7a828d699e6823787dc09dab79087f Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Tue, 17 Sep 2013 13:15:28 +0200
Subject: [PATCH 1/6] FIX #2372: StratifiedKFold less impact on the original
 order of samples.

---
 sklearn/cross_validation.py            | 14 +++++++---
 sklearn/tests/test_cross_validation.py | 36 +++++++++++++++++++++-----
 sklearn/tests/test_naive_bayes.py      |  8 +++---
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index d08f76f975a30..3e32baed243ad 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -386,10 +386,16 @@ def __init__(self, y, n_folds=3, indices=None):
         self.y = y
 
     def _iter_test_indices(self):
-        n_folds = self.n_folds
-        idx = np.argsort(self.y)
-        for i in range(n_folds):
-            yield idx[i::n_folds]
+        idx_cls = []
+        for cls in unique(self.y):
+            idx_cls.append(np.where(self.y == cls)[0])
+
+        for i in range(self.n_folds):
+            idxs = []
+            for idx in idx_cls:
+                len_idx = len(idx) / self.n_folds
+                idxs.extend(idx[i * len_idx:(i + 1) * len_idx])
+            yield sorted(idxs)
 
     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 559bada01b70e..84aec0c7cd0df 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -22,6 +22,7 @@
 from sklearn import cross_validation as cval
 from sklearn.base import BaseEstimator
 from sklearn.datasets import make_regression
+from sklearn.datasets import load_digits
 from sklearn.datasets import load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import f1_score
@@ -379,24 +380,24 @@ def test_cross_val_score_with_score_func_classification():
 
     # Default score (should be the accuracy score)
     scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
-    assert_array_almost_equal(scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # Correct classification score (aka. zero / one score) - should be the
     # same as the default estimator score
     zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="accuracy", cv=5)
-    assert_array_almost_equal(zo_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # F1 score (class are balanced so f1_score should be equal to zero/one
     # score
     f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="f1", cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # also test deprecated old way
     with warnings.catch_warnings(record=True):
         f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                          score_func=f1_score, cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
 
 def test_cross_val_score_with_score_func_regression():
@@ -450,7 +451,7 @@ def test_permutation_score():
     score_label, _, pvalue_label = cval.permutation_test_score(
         svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
         random_state=0)
-    assert_almost_equal(score_label, .95, 2)
+    assert_almost_equal(score_label, .97, 2)
     assert_almost_equal(pvalue_label, 0.01, 3)
 
     # check that we obtain the same results with a sparse representation
@@ -470,14 +471,14 @@ def test_permutation_score():
         scoring="accuracy")
 
     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)
 
     # test with deprecated interface
     with warnings.catch_warnings(record=True):
         score, scores, pvalue = cval.permutation_test_score(
             svm, X, y, score_func=accuracy_score, cv=cv)
     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)
 
 
 def test_cross_val_generator_with_mask():
@@ -634,3 +635,24 @@ def test_cross_indices_exception():
     assert_raises(ValueError, cval.check_cv, skf, X, y)
     assert_raises(ValueError, cval.check_cv, lolo, X, y)
     assert_raises(ValueError, cval.check_cv, lopo, X, y)
+
+
+def test_stratified_kfold_preserve_order():  # see #2372
+    y = np.array([3, 2, 1, 3, 2, 3] * 2)
+    skf = cval.StratifiedKFold(y, n_folds=2)
+    [(train0, test0), (train1, test1)] = tuple(skf)
+    assert_array_equal(train0, np.arange(6, 12))
+    assert_array_equal(test0, np.arange(0, 6))
+    assert_array_equal(train1, np.arange(0, 6))
+    assert_array_equal(test1, np.arange(6, 12))
+
+
+def test_stratified_kfold_preserve_order_with_digits():  # see #2372
+    # A regression test, taken from
+    # http://nbviewer.ipython.org/urls/raw.github.com/ogrisel/notebooks/master/Non%2520IID%2520cross-validation.ipynb
+    digits = load_digits()
+    X, y = digits.data, digits.target
+
+    model = SVC(C=10, gamma=0.005)
+    cv = cval.StratifiedKFold(y, 5)
+    assert cval.cross_val_score(model, X, y, cv=cv, n_jobs=-1).mean() < 0.91
diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
index 77634726a386a..0dfad3f5cf69b 100644
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -338,21 +338,21 @@ def test_check_accuracy_on_digits():
 
     # Multinomial NB
     scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
-    assert_greater(scores.mean(), 0.90)
+    assert_greater(scores.mean(), 0.88)
 
     scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
     assert_greater(scores.mean(), 0.95)
 
     # Bernoulli NB
     scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
-    assert_greater(scores.mean(), 0.85)
+    assert_greater(scores.mean(), 0.84)
 
     scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.94)
+    assert_greater(scores.mean(), 0.93)
 
     # Gaussian NB
     scores = cross_val_score(GaussianNB(), X, y, cv=10)
-    assert_greater(scores.mean(), 0.81)
+    assert_greater(scores.mean(), 0.78)
 
     scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
     assert_greater(scores.mean(), 0.86)

From 9777eae31a64e4ddd9556922edb182676dd0aae0 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Tue, 17 Sep 2013 23:05:21 +0200
Subject: [PATCH 2/6] Fix accidental doctest breakage.

---
 doc/modules/cross_validation.rst                       | 10 +++++-----
 doc/tutorial/statistical_inference/model_selection.rst |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 441fe445c691d..f673089862a45 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -106,13 +106,13 @@ time)::
     ...    clf, iris.data, iris.target, cv=5)
     ...
     >>> scores                                              # doctest: +ELLIPSIS
-    array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])
+    array([ 0.9666...,  1.        ,  0.9666...,  0.9666...,  1.        ])
 
 The mean score and the standard deviation of the score estimate are hence given
 by::
 
     >>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
-    Accuracy: 0.97 (+/- 0.07)
+    Accuracy: 0.98 (+/- 0.03)
 
 By default, the score computed at each CV iteration is the ``score``
 method of the estimator. It is possible to change this by using the
@@ -122,7 +122,7 @@ scoring parameter::
     >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
     ...                                  scoring='f1')
     ...                                                     # doctest: +ELLIPSIS
-    array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.  ...])
+    array([ 0.9665...,  1.        ,  0.9665...,  0.9665...,  1.        ])
 
 See :ref:`scoring_parameter` for details. In the case of the Iris
 dataset, the samples are balanced across target
@@ -206,8 +206,8 @@ two unbalanced classes::
     >>> skf = StratifiedKFold(labels, 2)
     >>> for train, test in skf:
     ...     print("%s %s" % (train, test))
-    [1 4 6] [0 2 3 5]
-    [0 2 3 5] [1 4 6]
+    [2 4 5 6] [0 1 3]
+    [0 1 3 5] [2 4 6]
 
 
 Leave-One-Out - LOO
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index b069c31d5ec69..8200bf0b9ecf2 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -146,7 +146,7 @@ estimator during the construction and exposes an estimator API::
     >>> clf.fit(X_digits[:1000], y_digits[:1000])        # doctest: +ELLIPSIS
     GridSearchCV(cv=None,...
     >>> clf.best_score_                                  # doctest: +ELLIPSIS
-    0.9889...
+    0.9272...
     >>> clf.best_estimator_.gamma
     9.9999999999999995e-07
 
@@ -164,7 +164,7 @@ a stratified 3-fold.
     ::
 
        >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-       array([ 0.97996661,  0.98163606,  0.98330551])
+       array([ 0.93456376,  0.95805369,  0.93624161])
 
 Two cross-validation loops are performed in parallel: one by the
 :class:`GridSearchCV` estimator to set `gamma` and the other one by

From 535829421930a87c126400b9d3dff888e788bc68 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 14:43:20 +0200
Subject: [PATCH 3/6] Instead of linking to NB, explain the problem inside the
 test itself.

I'm using a smaller number of examples from the digits dataset because
that cuts down test execution time for me from 17s to 4s, and still
yields very similar results.
---
 sklearn/tests/test_cross_validation.py | 35 ++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 84aec0c7cd0df..45d986a7ce524 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -648,11 +648,36 @@ def test_stratified_kfold_preserve_order():  # see #2372
 
 
 def test_stratified_kfold_preserve_order_with_digits():  # see #2372
-    # A regression test, taken from
-    # http://nbviewer.ipython.org/urls/raw.github.com/ogrisel/notebooks/master/Non%2520IID%2520cross-validation.ipynb
-    digits = load_digits()
-    X, y = digits.data, digits.target
+    # The digits samples are dependent as they are apparently grouped
+    # by authors although we don't have any information on the groups
+    # segment locations for this data. We can highlight this fact by
+    # computing k-fold cross-validation with and without shuffling: we
+    # observe that the shuffling case makes the IID assumption and is
+    # therefore too optimistic: it estimates a much higher accuracy
+    # (around 0.965) than the non-shuffling variant (around
+    # 0.905).
+
+    digits = load_digits()
+    X, y = digits.data[:800], digits.target[:800]
 
     model = SVC(C=10, gamma=0.005)
+    n = len(y)
+
+    cv = cval.KFold(n, 5, shuffle=False)
+    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=2)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    # Similarly, StratifiedKFold should try to shuffle the data as few
+    # as possible (while respecting the balanced class constraints)
+    # and thus be able to detect the dependency by not overestimating
+    # the CV score either:
     cv = cval.StratifiedKFold(y, 5)
-    assert cval.cross_val_score(model, X, y, cv=cv, n_jobs=-1).mean() < 0.91
+    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())

From d9fa475af7ba36bbc0fac374ce733dbcb3c6c3ea Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 16:45:06 +0200
Subject: [PATCH 4/6] Avoid list, preallocate a numpy array for indices
 instead.

It appears that the sorted() call is unnecessary here.
---
 sklearn/cross_validation.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 3e32baed243ad..02c85e0566575 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -386,16 +386,18 @@ def __init__(self, y, n_folds=3, indices=None):
         self.y = y
 
     def _iter_test_indices(self):
-        idx_cls = []
-        for cls in unique(self.y):
-            idx_cls.append(np.where(self.y == cls)[0])
+        idx_per_label = []
+        for label in unique(self.y):
+            idx_per_label.append(np.where(self.y == label)[0])
 
+        idxs = np.empty(len(self.y) / self.n_folds, dtype=np.int)
         for i in range(self.n_folds):
-            idxs = []
-            for idx in idx_cls:
+            j = 0
+            for idx in idx_per_label:
                 len_idx = len(idx) / self.n_folds
-                idxs.extend(idx[i * len_idx:(i + 1) * len_idx])
-            yield sorted(idxs)
+                idxs[j:j + len_idx] = idx[i * len_idx:(i + 1) * len_idx]
+                j += len_idx
+            yield idxs[:j]
 
     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (

From 5a084bd0243380072e307f9111e54b262d478f1e Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 17:05:22 +0200
Subject: [PATCH 5/6] Update comment with numbers for when we run with 800
 samples.

---
 sklearn/tests/test_cross_validation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 45d986a7ce524..024a8e7b143f8 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -654,8 +654,7 @@ def test_stratified_kfold_preserve_order_with_digits():  # see #2372
     # computing k-fold cross-validation with and without shuffling: we
     # observe that the shuffling case makes the IID assumption and is
     # therefore too optimistic: it estimates a much higher accuracy
-    # (around 0.965) than the non-shuffling variant (around
-    # 0.905).
+    # (around 0.96) than the non-shuffling variant (around 0.86).
 
     digits = load_digits()
     X, y = digits.data[:800], digits.target[:800]

From f01dc1835ed1e24df0a9aca944757c34ef86a448 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 20 Sep 2013 12:02:35 +0200
Subject: [PATCH 6/6] Add entry for #2372 to whats_new.rst

---
 doc/whats_new.rst | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b641540c5fdbb..20d4194088e6c 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -41,6 +41,10 @@ Changelog
    - Memory improvements of extra trees and random forest by
      `Arnaud Joly`_.
 
+   - Changed :class:`cross_validation.StratifiedKFold` to try and
+     preserve as much of the original ordering of samples as possible.
+     By `Daniel Nouri`_ and `Olivier Grisel`_.
+
 API changes summary
 -------------------
 
@@ -778,7 +782,7 @@ List of contributors for release 0.13 by number of commits.
  * 17  `Fabian Pedregosa`_
  * 17  Nelle Varoquaux
  * 16  `Christian Osendorfer`_
- * 14  Daniel Nouri
+ * 14  `Daniel Nouri`_
  * 13  `Virgile Fritsch`_
  * 13  syhw
  * 12  `Satrajit Ghosh`_
@@ -2285,3 +2289,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 
 .. _Kyle Kastner: http://kastnerkyle.github.io
 .. _@FedericoV: https://github.com/FedericoV/
+
+.. _Daniel Nouri: http://danielnouri.org
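
For readers who want to experiment with the splitting strategy these patches
introduce, the per-class chunking can be sketched as standalone NumPy code.
This is a simplified illustration, not the library implementation: the helper
name ``stratified_test_folds`` is invented for this sketch, there is no input
validation, and, as in the patched slicing, each class contributes only
``count // n_folds`` samples per fold (remainders are dropped from the test
side)::

    import numpy as np


    def stratified_test_folds(y, n_folds):
        # Collect each class's sample indices in their original order.
        y = np.asarray(y)
        idx_per_label = [np.where(y == label)[0] for label in np.unique(y)]

        for i in range(n_folds):
            fold = []
            for idx in idx_per_label:
                # The i-th fold takes the i-th contiguous chunk of each
                # class's indices; floor division drops any remainder.
                len_idx = len(idx) // n_folds
                fold.extend(idx[i * len_idx:(i + 1) * len_idx])
            # Sorted here for readable output; the patched class skips
            # this step, which PATCH 4 deems unnecessary in context.
            yield np.sort(fold)

With the array from the regression test above, ``y = np.array([3, 2, 1, 3, 2,
3] * 2)`` and two folds, the sketch yields the test folds ``[0, 1, 2, 3, 4,
5]`` and ``[6, 7, 8, 9, 10, 11]``, matching
``test_stratified_kfold_preserve_order``.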