From e3002c5efc2a34defe65e017e2a0cc3b6f250c1a Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Sep 2013 17:59:14 +0200
Subject: [PATCH] FIX #2372: non-shuffling StratifiedKFold implementation and
 updated tests

---
 doc/modules/cross_validation.rst                  |  27 ++-
 .../statistical_inference/model_selection.rst     |  15 +-
 doc/whats_new.rst                                 |   9 +-
 sklearn/cross_validation.py                       |  36 ++-
 sklearn/feature_selection/tests/test_rfe.py       |  24 +-
 sklearn/tests/test_cross_validation.py            | 222 +++++++++++++++---
 sklearn/tests/test_naive_bayes.py                 |  14 +-
 7 files changed, 261 insertions(+), 86 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 441fe445c691d..5f40613a0fa97 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -105,24 +105,24 @@ time)::
   >>> scores = cross_validation.cross_val_score(
   ...    clf, iris.data, iris.target, cv=5)
   ...
-  >>> scores  # doctest: +ELLIPSIS
-  array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])
+  >>> scores  # doctest: +ELLIPSIS
+  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])

 The mean score and the standard deviation of the score estimate are hence given
 by::

   >>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
-  Accuracy: 0.97 (+/- 0.07)
+  Accuracy: 0.98 (+/- 0.03)

 By default, the score computed at each CV iteration is the ``score``
 method of the estimator. It is possible to change this by using the
 scoring parameter::

   >>> from sklearn import metrics
-  >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
-  ...     scoring='f1')
-  ...  # doctest: +ELLIPSIS
-  array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.        ])
+  >>> scores = cross_validation.cross_val_score(clf, iris.data, iris.target,
+  ...     cv=5, scoring='f1')
+  >>> scores  # doctest: +ELLIPSIS
+  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])

 See :ref:`scoring_parameter` for details. In the case of the Iris dataset, the
 samples are balanced across target
@@ -197,17 +197,18 @@ Stratified k-fold
 folds: each set contains approximately the same percentage of samples of each
 target class as the complete set.

-Example of stratified 2-fold cross-validation on a dataset with 7 samples from
-two unbalanced classes::
+Example of stratified 3-fold cross-validation on a dataset with 10 samples from
+two slightly unbalanced classes::

   >>> from sklearn.cross_validation import StratifiedKFold
-  >>> labels = [0, 0, 0, 1, 1, 1, 0]
-  >>> skf = StratifiedKFold(labels, 2)
+  >>> labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+  >>> skf = StratifiedKFold(labels, 3)
   >>> for train, test in skf:
   ...     print("%s %s" % (train, test))
-  [1 4 6] [0 2 3 5]
-  [0 2 3 5] [1 4 6]
+  [2 3 6 7 8 9] [0 1 4 5]
+  [0 1 3 4 5 8 9] [2 6 7]
+  [0 1 2 4 5 6 7] [3 8 9]


 Leave-One-Out - LOO
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index b069c31d5ec69..e7c898a6d9e46 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -143,12 +143,12 @@ estimator during the construction and exposes an estimator API::
     >>> gammas = np.logspace(-6, -1, 10)
     >>> clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),
     ...                    n_jobs=-1)
-    >>> clf.fit(X_digits[:1000], y_digits[:1000])  # doctest: +ELLIPSIS
+    >>> clf.fit(X_digits[:1000], y_digits[:1000])  # doctest: +ELLIPSIS
     GridSearchCV(cv=None,...
-    >>> clf.best_score_  # doctest: +ELLIPSIS
-    0.9889...
-    >>> clf.best_estimator_.gamma
-    9.9999999999999995e-07
+    >>> clf.best_score_  # doctest: +ELLIPSIS
+    0.924...
+    >>> clf.best_estimator_.gamma == 1e-6
+    True

     >>> # Prediction performance on test set is not as good as on train set
     >>> clf.score(X_digits[1000:], y_digits[1000:])
@@ -163,8 +163,9 @@ a stratified 3-fold.

 ::

-    >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-    array([ 0.97996661,  0.98163606,  0.98330551])
+    >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
+    ...  # doctest: +ELLIPSIS
+    array([ 0.935...,  0.958...,  0.937...])

 Two cross-validation loops are performed in parallel: one by the
 :class:`GridSearchCV` estimator to set `gamma` and the other one by
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 39a55d31353ce..620adef0edc24 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -44,6 +44,11 @@ Changelog
    - Memory improvements of extra trees and random forest by `Arnaud Joly`_.

+   - Changed :class:`cross_validation.StratifiedKFold` to try and
+     preserve as much of the original ordering of samples as possible so as
+     not to hide overfitting on datasets with a non-negligible level of
+     samples dependency.
+     By `Daniel Nouri`_ and `Olivier Grisel`_.

 API changes summary
 -------------------
@@ -781,7 +786,7 @@ List of contributors for release 0.13 by number of commits.
  * 17 `Fabian Pedregosa`_
  * 17 Nelle Varoquaux
  * 16 `Christian Osendorfer`_
- * 14 Daniel Nouri
+ * 14 `Daniel Nouri`_
  * 13 `Virgile Fritsch`_
  * 13 syhw
  * 12 `Satrajit Ghosh`_
@@ -2288,3 +2293,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.

 .. _Kyle Kastner: http://kastnerkyle.github.io
 .. _@FedericoV: https://github.com/FedericoV/
+
+.. _Daniel Nouri: http://danielnouri.org
\ No newline at end of file
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index d08f76f975a30..a4e1763a6684f 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -9,6 +9,7 @@
 # License: BSD 3 clause

 from __future__ import print_function
+from __future__ import division

 import warnings
 from itertools import chain, combinations
@@ -375,21 +376,42 @@ class StratifiedKFold(_BaseKFold):

     def __init__(self, y, n_folds=3, indices=None):
         super(StratifiedKFold, self).__init__(len(y), n_folds, indices)
         y = np.asarray(y)
-        _, y_sorted = unique(y, return_inverse=True)
-        min_labels = np.min(np.bincount(y_sorted))
+        n_samples = y.shape[0]
+        unique_labels, y_inversed = unique(y, return_inverse=True)
+        label_counts = np.bincount(y_inversed)
+        min_labels = np.min(label_counts)
         if self.n_folds > min_labels:
             warnings.warn(("The least populated class in y has only %d"
                            " members, which is too few. The minimum"
                            " number of labels for any class cannot"
                            " be less than n_folds=%d."
                           % (min_labels, self.n_folds)), Warning)
+
+        # pre-assign each sample to a test fold index using individual KFold
+        # splitting strategies for each label so as to respect the
+        # balance of labels
+        per_label_cvs = [KFold(max(c, self.n_folds), self.n_folds)
+                         for c in label_counts]
+        test_folds = np.zeros(n_samples, dtype=np.int)
+        for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
+            for label, (_, test_split) in zip(unique_labels, per_label_splits):
+                label_test_folds = test_folds[y == label]
+                # the test split can be too big because we used
+                # KFold(max(c, self.n_folds), self.n_folds) instead of
+                # KFold(c, self.n_folds) to make it possible to not crash even
+                # if the data is not 100% stratifiable for all the labels
+                # (we use a warning instead of raising an exception)
+                # If this is the case, let's trim it:
+                test_split = test_split[test_split < len(label_test_folds)]
+                label_test_folds[test_split] = test_fold_idx
+                test_folds[y == label] = label_test_folds
+
+        self.test_folds = test_folds
         self.y = y

-    def _iter_test_indices(self):
-        n_folds = self.n_folds
-        idx = np.argsort(self.y)
-        for i in range(n_folds):
-            yield idx[i::n_folds]
+    def _iter_test_masks(self):
+        for i in range(self.n_folds):
+            yield self.test_folds == i

     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (
diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
index 853634ca3c700..220e667406ee4 100644
--- a/sklearn/feature_selection/tests/test_rfe.py
+++ b/sklearn/feature_selection/tests/test_rfe.py
@@ -69,39 +69,35 @@ def test_rfecv():
     y = list(iris.target)   # regression test: list should be supported

     # Test using the score function
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
     rfecv.fit(X, y)
     # non-regression test for missing worst feature:
     assert_equal(len(rfecv.grid_scores_), X.shape[1])
     assert_equal(len(rfecv.ranking_), X.shape[1])
     X_r = rfecv.transform(X)

+    # All the noisy variables were filtered out
+    assert_array_equal(X_r, iris.data)
+
     # same in sparse
-    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3)
+    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
     X_sparse = sparse.csr_matrix(X)
     rfecv_sparse.fit(X_sparse, y)
     X_r_sparse = rfecv_sparse.transform(X_sparse)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
-    assert_array_almost_equal(X_r_sparse.toarray(), X_r)
+    assert_array_equal(X_r_sparse.toarray(), iris.data)

     # Test using a customized loss function
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                   loss_func=zero_one_loss)
     with warnings.catch_warnings(record=True):
         rfecv.fit(X, y)
     X_r = rfecv.transform(X)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
+    assert_array_equal(X_r, iris.data)

     # Test using a scorer
     scorer = SCORERS['accuracy']
-    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=3,
+    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                   scoring=scorer)
     rfecv.fit(X, y)
     X_r = rfecv.transform(X)
-
-    assert_equal(X_r.shape, iris.data.shape)
-    assert_array_almost_equal(X_r[:10], iris.data[:10])
+    assert_array_equal(X_r, iris.data)
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 559bada01b70e..986624137f461 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1,5 +1,5 @@
 """Test the cross_validation module"""
-
+from __future__ import division
 import warnings

 import numpy as np
@@ -22,6 +22,7 @@ from sklearn import cross_validation as cval
 from sklearn.base import BaseEstimator
 from sklearn.datasets import make_regression
+from sklearn.datasets import load_digits
 from sklearn.datasets import load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import f1_score
@@ -91,6 +92,37 @@ def score(self, X=None, Y=None):
 ##############################################################################
 # Tests

+def check_valid_split(train, test, n_samples=None):
+    # Use python sets to get more informative assertion failure messages
+    train, test = set(train), set(test)
+
+    # Train and test split should not overlap
+    assert_equal(train.intersection(test), set())
+
+    if n_samples is not None:
+        # Check that the union of train and test split covers all the indices
+        assert_equal(train.union(test), set(range(n_samples)))
+
+
+def check_cv_coverage(cv, expected_n_iter=None, n_samples=None):
+    # Check that all the samples appear at least once in a test fold
+    if expected_n_iter is not None:
+        assert_equal(len(cv), expected_n_iter)
+    else:
+        expected_n_iter = len(cv)
+
+    collected_test_samples = set()
+    iterations = 0
+    for train, test in cv:
+        check_valid_split(train, test, n_samples=n_samples)
+        iterations += 1
+        collected_test_samples.update(test)
+
+    # Check that the accumulated test samples cover the whole dataset
+    assert_equal(iterations, expected_n_iter)
+    if n_samples is not None:
+        assert_equal(collected_test_samples, set(range(n_samples)))
+

 def test_kfold_valueerrors():
     # Check that errors are raised if there is not enough samples
@@ -100,8 +132,8 @@ def test_kfold_valueerrors():
     # members.
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter('always')
-        y = [0, 0, 1, 1, 2]
-        cval.StratifiedKFold(y, 3)
+        y = [3, 3, -1, -1, 2]
+        cv = cval.StratifiedKFold(y, 3)
     # checking there was only one warning.
     assert_equal(len(w), 1)
     # checking it has the right type
@@ -110,6 +142,11 @@
     # a characteristic of the code and not a behavior
     assert_true("The least populated class" in str(w[0]))

+    # Check that despite the warning the folds are still computed even
+    # though all the classes are not necessarily represented on each
+    # side of each split
+    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))
+
     # Error when number of folds is <= 1
     assert_raises(ValueError, cval.KFold, 2, 0)
     assert_raises(ValueError, cval.KFold, 2, 1)
@@ -127,15 +164,72 @@ def test_kfold_valueerrors():

 def test_kfold_indices():
     # Check all indices are returned in the test folds
     kf = cval.KFold(300, 3)
-    all_folds = None
-    for train, test in kf:
-        if all_folds is None:
-            all_folds = test.copy()
-        else:
-            all_folds = np.concatenate((all_folds, test))
-
-    all_folds.sort()
-    assert_array_equal(all_folds, np.arange(300))
+    check_cv_coverage(kf, expected_n_iter=3, n_samples=300)
+
+    # Check all indices are returned in the test folds even when equal-sized
+    # folds are not possible
+    kf = cval.KFold(17, 3)
+    check_cv_coverage(kf, expected_n_iter=3, n_samples=17)
+
+
+def test_kfold_no_shuffle():
+    # Manually check that KFold preserves the data ordering on toy datasets
+    splits = iter(cval.KFold(4, 2))
+    train, test = splits.next()
+    assert_array_equal(test, [0, 1])
+    assert_array_equal(train, [2, 3])
+
+    train, test = splits.next()
+    assert_array_equal(test, [2, 3])
+    assert_array_equal(train, [0, 1])
+
+    splits = iter(cval.KFold(5, 2))
+    train, test = splits.next()
+    assert_array_equal(test, [0, 1, 2])
+    assert_array_equal(train, [3, 4])
+
+    train, test = splits.next()
+    assert_array_equal(test, [3, 4])
+    assert_array_equal(train, [0, 1, 2])
+
+
+def test_stratified_kfold_no_shuffle():
+    # Manually check that StratifiedKFold preserves the data ordering as much
+    # as possible on toy datasets in order to avoid hiding sample dependencies
+    # when possible
+    splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
+    train, test = splits.next()
+    assert_array_equal(test, [0, 2])
+    assert_array_equal(train, [1, 3])
+
+    train, test = splits.next()
+    assert_array_equal(test, [1, 3])
+    assert_array_equal(train, [0, 2])
+
+    splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
+    train, test = splits.next()
+    assert_array_equal(test, [0, 1, 3, 4])
+    assert_array_equal(train, [2, 5, 6])
+
+    train, test = splits.next()
+    assert_array_equal(test, [2, 5, 6])
+    assert_array_equal(train, [0, 1, 3, 4])
+
+
+def test_stratified_kfold_ratios():
+    # Check that stratified kfold preserves label ratios in individual splits
+    n_samples = 1000
+    labels = np.array([4] * int(0.10 * n_samples) +
+                      [0] * int(0.89 * n_samples) +
+                      [1] * int(0.01 * n_samples))
+
+    for train, test in cval.StratifiedKFold(labels, 5):
+        assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10, 2)
+        assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89, 2)
+        assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01, 2)
+        assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
+        assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
+        assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2)


 def test_kfold_balance():
@@ -149,30 +243,84 @@ def test_kfold_balance():
     assert_equal(np.sum(sizes), kf.n)


-@ignore_warnings
+def test_stratifiedkfold_balance():
+    # Check that StratifiedKFold returns folds with balanced sizes (only
+    # when stratification is possible)
+    labels = [0] * 3 + [1] * 14
+    for skf in [cval.StratifiedKFold(labels[:i], 3) for i in range(11, 17)]:
+        sizes = []
+        for _, test in skf:
+            sizes.append(len(test))
+
+        assert_true((np.max(sizes) - np.min(sizes)) <= 1)
+        assert_equal(np.sum(sizes), skf.n)
+
+
 def test_shuffle_kfold():
     # Check the indices are shuffled properly, and that all indices are
     # returned in the different test folds
-    kf1 = cval.KFold(300, 3, shuffle=True, random_state=0, indices=True)
-    kf2 = cval.KFold(300, 3, shuffle=True, random_state=0, indices=False)
+    kf = cval.KFold(300, 3, shuffle=True, random_state=0)
     ind = np.arange(300)

-    for kf in (kf1, kf2):
-        all_folds = None
-        for train, test in kf:
-            sorted_array = np.arange(100)
-            assert_true(np.any(sorted_array != ind[train]))
-            sorted_array = np.arange(101, 200)
-            assert_true(np.any(sorted_array != ind[train]))
-            sorted_array = np.arange(201, 300)
-            assert_true(np.any(sorted_array != ind[train]))
-            if all_folds is None:
-                all_folds = ind[test].copy()
-            else:
-                all_folds = np.concatenate((all_folds, ind[test]))
+    all_folds = None
+    for train, test in kf:
+        sorted_array = np.arange(100)
+        assert_true(np.any(sorted_array != ind[train]))
+        sorted_array = np.arange(101, 200)
+        assert_true(np.any(sorted_array != ind[train]))
+        sorted_array = np.arange(201, 300)
+        assert_true(np.any(sorted_array != ind[train]))
+        if all_folds is None:
+            all_folds = ind[test].copy()
+        else:
+            all_folds = np.concatenate((all_folds, ind[test]))
+
+    all_folds.sort()
+    assert_array_equal(all_folds, ind)
+
+
+def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
+    # The digits samples are dependent: they are apparently grouped by authors
+    # although we don't have any information on the groups segment locations
+    # for this data. We can highlight this fact by computing k-fold cross-
+    # validation with and without shuffling: we observe that the shuffling case
+    # wrongly makes the IID assumption and is therefore too optimistic: it
+    # estimates a much higher accuracy (around 0.96) than the
+    # non-shuffling variant (around 0.86).
+
+    digits = load_digits()
+    X, y = digits.data[:800], digits.target[:800]
+    model = SVC(C=10, gamma=0.005)
+    n = len(y)
+
+    cv = cval.KFold(n, 5, shuffle=False)
+    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
+    assert_greater(0.88, mean_score)
+    assert_greater(mean_score, 0.85)
+
+    # Shuffling the data artificially breaks the dependency and hides the
+    # overfitting of the model w.r.t. the writing style of the authors
+    # by yielding a seriously overestimated score:
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
+    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
+    assert_greater(mean_score, 0.95)
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
+    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
+    assert_greater(mean_score, 0.95)
+
+    # Similarly, StratifiedKFold should try to shuffle the data as little
+    # as possible (while respecting the balanced class constraints)
+    # and thus be able to detect the dependency by not overestimating
+    # the CV score either. As the digits dataset is approximately balanced
+    # the estimated mean score is close to the score measured with
+    # non-shuffled KFold

-        all_folds.sort()
-        assert_array_equal(all_folds, ind)
+    cv = cval.StratifiedKFold(y, 5)
+    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
+    assert_greater(0.88, mean_score)
+    assert_greater(mean_score, 0.85)


 def test_shuffle_split():
@@ -379,24 +527,24 @@ def test_cross_val_score_with_score_func_classification():

     # Default score (should be the accuracy score)
     scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
-    assert_array_almost_equal(scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

     # Correct classification score (aka. zero / one score) - should be the
     # same as the default estimator score
     zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="accuracy", cv=5)
-    assert_array_almost_equal(zo_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

     # F1 score (class are balanced so f1_score should be equal to zero/one
     # score
     f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="f1", cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

     # also test deprecated old way
     with warnings.catch_warnings(record=True):
         f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                          score_func=f1_score, cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)


 def test_cross_val_score_with_score_func_regression():
@@ -450,7 +598,7 @@ def test_permutation_score():
     score_label, _, pvalue_label = cval.permutation_test_score(
         svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
         random_state=0)
-    assert_almost_equal(score_label, .95, 2)
+    assert_almost_equal(score_label, .97, 2)
     assert_almost_equal(pvalue_label, 0.01, 3)

     # check that we obtain the same results with a sparse representation
@@ -470,14 +618,14 @@ def test_permutation_score():
                                                      scoring="accuracy")

     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)

     # test with deprecated interface
     with warnings.catch_warnings(record=True):
         score, scores, pvalue = cval.permutation_test_score(
             svm, X, y, score_func=accuracy_score, cv=cv)
     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)


 def test_cross_val_generator_with_mask():
diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
index 77634726a386a..5ff2953896091 100644
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -329,8 +329,8 @@ def test_coef_intercept_shape():

 def test_check_accuracy_on_digits():
     # Non regression test to make sure that any further refactoring / optim
-    # of the NB models do not harm the performance on a non linearly separable
-    # dataset
+    # of the NB models do not harm the performance on a slightly non-linearly
+    # separable dataset
     digits = load_digits()
     X, y = digits.data, digits.target
     binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
@@ -338,21 +338,21 @@ def test_check_accuracy_on_digits():
     X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

     # Multinomial NB
     scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
-    assert_greater(scores.mean(), 0.90)
+    assert_greater(scores.mean(), 0.86)

     scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.95)
+    assert_greater(scores.mean(), 0.94)

     # Bernoulli NB
     scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
-    assert_greater(scores.mean(), 0.85)
+    assert_greater(scores.mean(), 0.83)

     scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.94)
+    assert_greater(scores.mean(), 0.92)

     # Gaussian NB
     scores = cross_val_score(GaussianNB(), X, y, cv=10)
-    assert_greater(scores.mean(), 0.81)
+    assert_greater(scores.mean(), 0.77)

     scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
     assert_greater(scores.mean(), 0.86)
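For readers skimming the patch, the heart of the new non-shuffling StratifiedKFold is the
pre-assignment of a test fold index to every sample: each class is cut into n_folds contiguous
chunks by a per-label KFold, and the chunk number becomes the sample's test fold, so the original
ordering of the data is preserved instead of being sorted by label. The snippet below is a minimal
standalone sketch of that idea, assuming every class has at least n_folds members; it is not the
scikit-learn implementation, and the helper name stratified_test_folds is made up for illustration.
On the labels example from the updated doc/modules/cross_validation.rst it reproduces the three
train/test splits shown there.

    # Minimal sketch only -- assumes each class has at least n_folds members;
    # not the scikit-learn code, names are illustrative.
    import numpy as np


    def stratified_test_folds(y, n_folds=3):
        """Map each sample to a test fold index while preserving sample order."""
        y = np.asarray(y)
        test_folds = np.empty(len(y), dtype=int)
        for label in np.unique(y):
            # Positions of this class, in their original order.
            label_positions = np.where(y == label)[0]
            # Cut them into n_folds contiguous chunks (what an unshuffled
            # per-label KFold would do) and record the chunk index.
            chunks = np.array_split(label_positions, n_folds)
            for fold_idx, chunk in enumerate(chunks):
                test_folds[chunk] = fold_idx
        return test_folds


    if __name__ == '__main__':
        labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        test_folds = stratified_test_folds(labels, n_folds=3)
        for fold_idx in range(3):
            test = np.where(test_folds == fold_idx)[0]
            train = np.where(test_folds != fold_idx)[0]
            print("%s %s" % (train, test))

Because the per-class chunks stay contiguous, the resulting cross-validation score still reflects
sample dependency in ordered datasets such as digits, which is exactly what the new
test_kfold_can_detect_dependent_samples_on_digits test above exercises.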