[MRG] Fix #2372: StratifiedKFold less impact on the original order of samples. #2450

Closed
10 changes: 5 additions & 5 deletions doc/modules/cross_validation.rst
@@ -106,13 +106,13 @@ time)::
... clf, iris.data, iris.target, cv=5)
...
>>> scores # doctest: +ELLIPSIS
- array([ 1. ..., 0.96..., 0.9 ..., 0.96..., 1. ])
+ array([ 0.9666..., 1. , 0.9666..., 0.9666..., 1. ])

The mean score and the standard deviation of the score estimate are hence given
by::

>>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
- Accuracy: 0.97 (+/- 0.07)
+ Accuracy: 0.98 (+/- 0.03)

By default, the score computed at each CV iteration is the ``score``
method of the estimator. It is possible to change this by using the
@@ -122,7 +122,7 @@ scoring parameter::
>>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
... scoring='f1')
... # doctest: +ELLIPSIS
- array([ 1. ..., 0.96..., 0.89..., 0.96..., 1. ])
+ array([ 0.9665..., 1. , 0.9665..., 0.9665..., 1. ])

See :ref:`scoring_parameter` for details.
In the case of the Iris dataset, the samples are balanced across target
@@ -206,8 +206,8 @@ two unbalanced classes::
>>> skf = StratifiedKFold(labels, 2)
>>> for train, test in skf:
... print("%s %s" % (train, test))
- [1 4 6] [0 2 3 5]
- [0 2 3 5] [1 4 6]
+ [2 4 5 6] [0 1 3]
+ [0 1 3 5] [2 4 6]
Review comment (Member): This is a regression: sample #5 is never part of the test set. I will try to come up with another way to do this.
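The regression is easy to check directly (a minimal sketch against this PR's 0.14-era cross_validation API; the labels array below is an assumption, chosen to be consistent with the fold output shown above):

    import numpy as np
    from sklearn.cross_validation import StratifiedKFold  # 0.14-era import path

    labels = np.array([0, 0, 0, 1, 0, 0, 1])  # assumed two-class example

    # Collect every index that ever appears in a test fold.
    tested = np.concatenate([test for _, test in StratifiedKFold(labels, 2)])
    print(sorted(set(range(len(labels))) - set(tested)))  # [5] -- never tested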



Leave-One-Out - LOO
4 changes: 2 additions & 2 deletions doc/tutorial/statistical_inference/model_selection.rst
@@ -146,7 +146,7 @@ estimator during the construction and exposes an estimator API::
>>> clf.fit(X_digits[:1000], y_digits[:1000]) # doctest: +ELLIPSIS
GridSearchCV(cv=None,...
>>> clf.best_score_ # doctest: +ELLIPSIS
- 0.9889...
+ 0.9272...
>>> clf.best_estimator_.gamma
9.9999999999999995e-07

Expand All @@ -164,7 +164,7 @@ a stratified 3-fold.
::

>>> cross_validation.cross_val_score(clf, X_digits, y_digits)
- array([ 0.97996661, 0.98163606, 0.98330551])
+ array([ 0.93456376, 0.95805369, 0.93624161])

Two cross-validation loops are performed in parallel: one by the
:class:`GridSearchCV` estimator to set `gamma` and the other one by
8 changes: 7 additions & 1 deletion doc/whats_new.rst
@@ -41,6 +41,10 @@ Changelog
- Memory improvements of extra trees and random forest by
  `Arnaud Joly`_.

- Changed :class:`cross_validation.StratifiedKFold` to try and
  preserve as much of the original ordering of samples as possible.
  By `Daniel Nouri`_ and `Olivier Grisel`_.


API changes summary
-------------------
@@ -778,7 +782,7 @@ List of contributors for release 0.13 by number of commits.
* 17 `Fabian Pedregosa`_
* 17 Nelle Varoquaux
* 16 `Christian Osendorfer`_
- * 14 Daniel Nouri
+ * 14 `Daniel Nouri`_
* 13 `Virgile Fritsch`_
* 13 syhw
* 12 `Satrajit Ghosh`_
@@ -2285,3 +2289,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
.. _Kyle Kastner: http://kastnerkyle.github.io

.. _@FedericoV: https://github.com/FedericoV/

.. _Daniel Nouri: http://danielnouri.org
16 changes: 12 additions & 4 deletions sklearn/cross_validation.py
@@ -386,10 +386,18 @@ def __init__(self, y, n_folds=3, indices=None):
        self.y = y

    def _iter_test_indices(self):
-        n_folds = self.n_folds
-        idx = np.argsort(self.y)
-        for i in range(n_folds):
-            yield idx[i::n_folds]
+        idx_per_label = []
+        for label in unique(self.y):
+            idx_per_label.append(np.where(self.y == label)[0])
+
+        idxs = np.empty(len(self.y) / self.n_folds, dtype=np.int)
+        for i in range(self.n_folds):
+            j = 0
+            for idx in idx_per_label:
+                len_idx = len(idx) / self.n_folds
+                idxs[j:j + len_idx] = idx[i * len_idx:(i + 1) * len_idx]
+                j += len_idx
+            yield idxs[:j]

    def __repr__(self):
        return '%s.%s(labels=%s, n_folds=%i)' % (
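For readers following along, the patched logic is easy to study in isolation. Below is a standalone re-implementation (a hypothetical sketch, not part of the patch; the original targets Python 2, where / on ints already floors). The floor division is what silently drops per-label remainder samples from every test fold -- the regression flagged in the review comment above:

    import numpy as np

    def stratified_test_folds(y, n_folds):
        # Group sample indices by class label, preserving original order.
        y = np.asarray(y)
        idx_per_label = [np.where(y == label)[0] for label in np.unique(y)]
        for i in range(n_folds):
            fold = []
            for idx in idx_per_label:
                # Floor division: any per-label remainder never reaches a fold.
                len_idx = len(idx) // n_folds
                fold.extend(idx[i * len_idx:(i + 1) * len_idx])
            yield np.asarray(fold, dtype=int)

    y = [0, 0, 0, 1, 0, 0, 1]  # the assumed example from above
    print([list(t) for t in stratified_test_folds(y, 2)])
    # [[0, 1, 3], [2, 4, 6]] -- index 5 never lands in a test fold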
60 changes: 53 additions & 7 deletions sklearn/tests/test_cross_validation.py
@@ -22,6 +22,7 @@
from sklearn import cross_validation as cval
from sklearn.base import BaseEstimator
from sklearn.datasets import make_regression
from sklearn.datasets import load_digits
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
@@ -379,24 +380,24 @@ def test_cross_val_score_with_score_func_classification():

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
-    assert_array_almost_equal(scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
-    assert_array_almost_equal(zo_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (class are balanced so f1_score should be equal to zero/one
    # score
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1", cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
    # also test deprecated old way
    with warnings.catch_warnings(record=True):
        f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                         score_func=f1_score, cv=5)
-        assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+        assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)


def test_cross_val_score_with_score_func_regression():
@@ -450,7 +451,7 @@ def test_permutation_score():
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
-    assert_almost_equal(score_label, .95, 2)
+    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

# check that we obtain the same results with a sparse representation
@@ -470,14 +471,14 @@ def test_permutation_score():
scoring="accuracy")

assert_less(score, 0.5)
assert_greater(pvalue, 0.4)
assert_greater(pvalue, 0.2)

# test with deprecated interface
with warnings.catch_warnings(record=True):
score, scores, pvalue = cval.permutation_test_score(
svm, X, y, score_func=accuracy_score, cv=cv)
assert_less(score, 0.5)
assert_greater(pvalue, 0.4)
assert_greater(pvalue, 0.2)


def test_cross_val_generator_with_mask():
@@ -634,3 +635,48 @@ def test_cross_indices_exception():
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y)


def test_stratified_kfold_preserve_order():  # see #2372
    y = np.array([3, 2, 1, 3, 2, 3] * 2)
    skf = cval.StratifiedKFold(y, n_folds=2)
    [(train0, test0), (train1, test1)] = tuple(skf)
    assert_array_equal(train0, np.arange(6, 12))
    assert_array_equal(test0, np.arange(0, 6))
    assert_array_equal(train1, np.arange(0, 6))
    assert_array_equal(test1, np.arange(6, 12))


def test_stratified_kfold_preserve_order_with_digits():  # see #2372
    # The digits samples are dependent as they are apparently grouped
    # by authors although we don't have any information on the group
    # segment locations for this data. We can highlight this fact by
    # computing k-fold cross-validation with and without shuffling: we
    # observe that the shuffling case makes the IID assumption and is
    # therefore too optimistic: it estimates a much higher accuracy
    # (around 0.96) than the non-shuffling variant (around 0.86).

Review comment (Member): I think we should remove the link to the notebook and explain the problem directly in the comment of this test:

    The digits samples are dependent: they are apparently grouped by authors,
    although we don't have any information on the group segment locations for
    this data. We can highlight this fact by computing k-fold cross-validation
    with and without shuffling: we observe that the shuffling case makes the
    IID assumption and is therefore too optimistic: it estimates a much higher
    accuracy (around 0.965) than the non-shuffling variant (around 0.905).

    cv = cval.KFold(5, shuffle=False)
    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())

    cv = cval.KFold(5, shuffle=True, random_state=0)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

    cv = cval.KFold(5, shuffle=True, random_state=1)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

    cv = cval.KFold(5, shuffle=True, random_state=2)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

Similarly, StratifiedKFold should try to shuffle the data as little as possible (while respecting the balanced class constraints) and thus be able to detect the dependency by not overestimating the CV score either:

    cv = cval.StratifiedKFold(y, 5)
    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())

Review comment (Member): BTW, you can import assert_greater with:

    from sklearn.utils.testing import assert_greater

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=2)
    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either:

    cv = cval.StratifiedKFold(y, 5)
    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())
8 changes: 4 additions & 4 deletions sklearn/tests/test_naive_bayes.py
@@ -338,21 +338,21 @@ def test_check_accuracy_on_digits():

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
-    assert_greater(scores.mean(), 0.90)
+    assert_greater(scores.mean(), 0.88)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.95)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
-    assert_greater(scores.mean(), 0.85)
+    assert_greater(scores.mean(), 0.84)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.94)
+    assert_greater(scores.mean(), 0.93)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
-    assert_greater(scores.mean(), 0.81)
+    assert_greater(scores.mean(), 0.78)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86)