scikit-learn · ogrisel · Sep 25, 2013 · Sep 20, 2013
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
@@ -105,24 +105,24 @@ time)::
   >>> scores = cross_validation.cross_val_score(
   ...    clf, iris.data, iris.target, cv=5)
   ...
-  >>> scores                                            # doctest: +ELLIPSIS
-  array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])
+  >>> scores                                              # doctest: +ELLIPSIS
+  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
 
 The mean score and the standard deviation of the score estimate are hence given
 by::
 
   >>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
-  Accuracy: 0.97 (+/- 0.07)
+  Accuracy: 0.98 (+/- 0.03)
 
 By default, the score computed at each CV iteration is the ``score``
 method of the estimator. It is possible to change this by using the
 scoring parameter::
 
   >>> from sklearn import metrics
-  >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
-  ...     scoring='f1')
-  ...                                                     # doctest: +ELLIPSIS
-  array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.        ])
+  >>> scores = cross_validation.cross_val_score(clf, iris.data, iris.target,
+  ...     cv=5, scoring='f1')
+  >>> scores                                              # doctest: +ELLIPSIS
+  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
 
 See :ref:`scoring_parameter` for details.
 In the case of the Iris dataset, the samples are balanced across target
@@ -197,17 +197,18 @@ Stratified k-fold
 folds: each set contains approximately the same percentage of samples of each
 target class as the complete set.
 
-Example of stratified 2-fold cross-validation on a dataset with 7 samples from
-two unbalanced classes::
+Example of stratified 3-fold cross-validation on a dataset with 10 samples from
+two slightly unbalanced classes::
 
   >>> from sklearn.cross_validation import StratifiedKFold
 
-  >>> labels = [0, 0, 0, 1, 1, 1, 0]
-  >>> skf = StratifiedKFold(labels, 2)
+  >>> labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+  >>> skf = StratifiedKFold(labels, 3)
   >>> for train, test in skf:
   ...     print("%s %s" % (train, test))
-  [1 4 6] [0 2 3 5]
-  [0 2 3 5] [1 4 6]
+  [2 3 6 7 8 9] [0 1 4 5]
+  [0 1 3 4 5 8 9] [2 6 7]
+  [0 1 2 4 5 6 7] [3 8 9]
 
 
 Leave-One-Out - LOO

diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
@@ -143,12 +143,12 @@ estimator during the construction and exposes an estimator API::
     >>> gammas = np.logspace(-6, -1, 10)
     >>> clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),
     ...                    n_jobs=-1)
-    >>> clf.fit(X_digits[:1000], y_digits[:1000]) # doctest: +ELLIPSIS
+    >>> clf.fit(X_digits[:1000], y_digits[:1000])        # doctest: +ELLIPSIS
     GridSearchCV(cv=None,...
-    >>> clf.best_score_   # doctest: +ELLIPSIS
-    0.9889...
-    >>> clf.best_estimator_.gamma
-    9.9999999999999995e-07
+    >>> clf.best_score_                                  # doctest: +ELLIPSIS
+    0.924...
+    >>> clf.best_estimator_.gamma == 1e-6
+    True
 
     >>> # Prediction performance on test set is not as good as on train set
     >>> clf.score(X_digits[1000:], y_digits[1000:])
@@ -163,8 +163,9 @@ a stratified 3-fold.
 
     ::
 
-        >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-	array([ 0.97996661,  0.98163606,  0.98330551])
+        >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
+        ...                                              # doctest: +ELLIPSIS
+        array([ 0.935...,  0.958...,  0.937...])
 
     Two cross-validation loops are performed in parallel: one by the
     :class:`GridSearchCV` estimator to set `gamma` and the other one by

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -44,6 +44,11 @@ Changelog
    - Memory improvements of extra trees and random forest by
      `Arnaud Joly`_.
 
+   - Changed :class:`cross_validation.StratifiedKFold` to try and
+     preserve as much of the original ordering of samples as possible so as
+     not to hide overfitting on datasets with a non-negligible level of
+     dependency between samples.
+     By `Daniel Nouri`_ and `Olivier Grisel`_.
 
 API changes summary
 -------------------
@@ -781,7 +786,7 @@ List of contributors for release 0.13 by number of commits.
  *  17  `Fabian Pedregosa`_
  *  17  Nelle Varoquaux
  *  16  `Christian Osendorfer`_
- *  14  Daniel Nouri
+ *  14  `Daniel Nouri`_
  *  13  `Virgile Fritsch`_
  *  13  syhw
  *  12  `Satrajit Ghosh`_
@@ -2288,3 +2293,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Kyle Kastner: http://kastnerkyle.github.io
 
 .. _@FedericoV: https://github.com/FedericoV/
+
+.. _Daniel Nouri: http://danielnouri.org
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -9,6 +9,7 @@
 # License: BSD 3 clause
 
 from __future__ import print_function
+from __future__ import division
 
 import warnings
 from itertools import chain, combinations
@@ -375,21 +376,42 @@ class StratifiedKFold(_BaseKFold):
     def __init__(self, y, n_folds=3, indices=None):
         super(StratifiedKFold, self).__init__(len(y), n_folds, indices)
         y = np.asarray(y)
-        _, y_sorted = unique(y, return_inverse=True)
-        min_labels = np.min(np.bincount(y_sorted))
+        n_samples = y.shape[0]
+        unique_labels, y_inversed = unique(y, return_inverse=True)
+        label_counts = np.bincount(y_inversed)
+        min_labels = np.min(label_counts)
         if self.n_folds > min_labels:
             warnings.warn(("The least populated class in y has only %d"
                           " members, which is too few. The minimum"
                           " number of labels for any class cannot"
                           " be less than n_folds=%d."
                           % (min_labels, self.n_folds)), Warning)
+
+        # pre-assign each sample to a test fold index using individual KFold
+        # splitting strategies for each label so as to respect the
+        # balance of labels
+        per_label_cvs = [KFold(max(c, self.n_folds), self.n_folds)
+                         for c in label_counts]
+        test_folds = np.zeros(n_samples, dtype=np.int)
+        for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
+            for label, (_, test_split) in zip(unique_labels, per_label_splits):
+                label_test_folds = test_folds[y == label]
+                # the test split can be too big because we used
+                # KFold(max(c, self.n_folds), self.n_folds) instead of
+                # KFold(c, self.n_folds) to make it possible to not crash even
+                # if the data is not 100% stratifiable for all the labels
+                # (we use a warning instead of raising an exception)
+                # If this is the case, let's trim it:
+                test_split = test_split[test_split < len(label_test_folds)]
+                label_test_folds[test_split] = test_fold_idx
+                test_folds[y == label] = label_test_folds
+
+        self.test_folds = test_folds
         self.y = y
 
-    def _iter_test_indices(self):
-        n_folds = self.n_folds
-        idx = np.argsort(self.y)
-        for i in range(n_folds):
-            yield idx[i::n_folds]
+    def _iter_test_masks(self):
+        for i in range(self.n_folds):
+            yield self.test_folds == i
 
     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (

diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
@@ -69,39 +69,35 @@ def test_rfecv():
     y = list(iris.target)   # regression test: list should be supported
 
     # Test using the score function
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
     rfecv.fit(X, y)
     # non-regression test for missing worst feature:
     assert_equal(len(rfecv.grid_scores_), X.shape[1])
     assert_equal(len(rfecv.ranking_), X.shape[1])
     X_r = rfecv.transform(X)
 
+    # All the noisy variables were filtered out
+    assert_array_equal(X_r, iris.data)
+
     # same in sparse
-    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
+    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
     X_sparse = sparse.csr_matrix(X)
     rfecv_sparse.fit(X_sparse, y)
     X_r_sparse = rfecv_sparse.transform(X_sparse)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
-    assert_array_almost_equal(X_r_sparse.toarray(), X_r)
+    assert_array_equal(X_r_sparse.toarray(), iris.data)
 
     # Test using a customized loss function
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                   loss_func=zero_one_loss)
     with warnings.catch_warnings(record=True):
         rfecv.fit(X, y)
     X_r = rfecv.transform(X)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
+    assert_array_equal(X_r, iris.data)
 
     # Test using a scorer
     scorer = SCORERS['accuracy']
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                   scoring=scorer)
     rfecv.fit(X, y)
     X_r = rfecv.transform(X)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
+    assert_array_equal(X_r, iris.data)