From 4f1d87469c7a828d699e6823787dc09dab79087f Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Tue, 17 Sep 2013 13:15:28 +0200
Subject: [PATCH 1/6] FIX #2372: StratifiedKFold less impact on the original
 order of samples.

---
 sklearn/cross_validation.py            | 14 +++++++---
 sklearn/tests/test_cross_validation.py | 36 +++++++++++++++++++++-----
 sklearn/tests/test_naive_bayes.py      |  8 +++---
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index d08f76f975a30..3e32baed243ad 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -386,10 +386,16 @@ def __init__(self, y, n_folds=3, indices=None):
         self.y = y
 
     def _iter_test_indices(self):
-        n_folds = self.n_folds
-        idx = np.argsort(self.y)
-        for i in range(n_folds):
-            yield idx[i::n_folds]
+        idx_cls = []
+        for cls in unique(self.y):
+            idx_cls.append(np.where(self.y == cls)[0])
+
+        for i in range(self.n_folds):
+            idxs = []
+            for idx in idx_cls:
+                len_idx = len(idx) / self.n_folds
+                idxs.extend(idx[i * len_idx:(i + 1) * len_idx])
+            yield sorted(idxs)
 
     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 559bada01b70e..84aec0c7cd0df 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -22,6 +22,7 @@
 from sklearn import cross_validation as cval
 from sklearn.base import BaseEstimator
 from sklearn.datasets import make_regression
+from sklearn.datasets import load_digits
 from sklearn.datasets import load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import f1_score
@@ -379,24 +380,24 @@ def test_cross_val_score_with_score_func_classification():
 
     # Default score (should be the accuracy score)
     scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
-    assert_array_almost_equal(scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # Correct classification score (aka. zero / one score) - should be the
     # same as the default estimator score
     zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="accuracy", cv=5)
-    assert_array_almost_equal(zo_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # F1 score (class are balanced so f1_score should be equal to zero/one
     # score
     f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                      scoring="f1", cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
     # also test deprecated old way
     with warnings.catch_warnings(record=True):
         f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                          score_func=f1_score, cv=5)
-    assert_array_almost_equal(f1_scores, [1., 0.97, 0.90, 0.97, 1.], 2)
+    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
 
 
 def test_cross_val_score_with_score_func_regression():
@@ -450,7 +451,7 @@ def test_permutation_score():
     score_label, _, pvalue_label = cval.permutation_test_score(
         svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
         random_state=0)
-    assert_almost_equal(score_label, .95, 2)
+    assert_almost_equal(score_label, .97, 2)
     assert_almost_equal(pvalue_label, 0.01, 3)
 
     # check that we obtain the same results with a sparse representation
@@ -470,14 +471,14 @@ def test_permutation_score():
         scoring="accuracy")
 
     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)
 
     # test with deprecated interface
     with warnings.catch_warnings(record=True):
         score, scores, pvalue = cval.permutation_test_score(
             svm, X, y, score_func=accuracy_score, cv=cv)
     assert_less(score, 0.5)
-    assert_greater(pvalue, 0.4)
+    assert_greater(pvalue, 0.2)
 
 
 def test_cross_val_generator_with_mask():
@@ -634,3 +635,24 @@ def test_cross_indices_exception():
     assert_raises(ValueError, cval.check_cv, skf, X, y)
     assert_raises(ValueError, cval.check_cv, lolo, X, y)
     assert_raises(ValueError, cval.check_cv, lopo, X, y)
+
+
+def test_stratified_kfold_preserve_order():  # see #2372
+    y = np.array([3, 2, 1, 3, 2, 3] * 2)
+    skf = cval.StratifiedKFold(y, n_folds=2)
+    [(train0, test0), (train1, test1)] = tuple(skf)
+    assert_array_equal(train0, np.arange(6, 12))
+    assert_array_equal(test0, np.arange(0, 6))
+    assert_array_equal(train1, np.arange(0, 6))
+    assert_array_equal(test1, np.arange(6, 12))
+
+
+def test_stratified_kfold_preserve_order_with_digits():  # see #2372
+    # A regression test, taken from
+    # http://nbviewer.ipython.org/urls/raw.github.com/ogrisel/notebooks/master/Non%2520IID%2520cross-validation.ipynb
+    digits = load_digits()
+    X, y = digits.data, digits.target
+
+    model = SVC(C=10, gamma=0.005)
+    cv = cval.StratifiedKFold(y, 5)
+    assert cval.cross_val_score(model, X, y, cv=cv, n_jobs=-1).mean() < 0.91
diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
index 77634726a386a..0dfad3f5cf69b 100644
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -338,21 +338,21 @@ def test_check_accuracy_on_digits():
 
     # Multinomial NB
     scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
-    assert_greater(scores.mean(), 0.90)
+    assert_greater(scores.mean(), 0.88)
 
     scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
     assert_greater(scores.mean(), 0.95)
 
     # Bernoulli NB
     scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
-    assert_greater(scores.mean(), 0.85)
+    assert_greater(scores.mean(), 0.84)
 
     scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.94)
+    assert_greater(scores.mean(), 0.93)
 
     # Gaussian NB
     scores = cross_val_score(GaussianNB(), X, y, cv=10)
-    assert_greater(scores.mean(), 0.81)
+    assert_greater(scores.mean(), 0.78)
 
     scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
     assert_greater(scores.mean(), 0.86)

From 9777eae31a64e4ddd9556922edb182676dd0aae0 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Tue, 17 Sep 2013 23:05:21 +0200
Subject: [PATCH 2/6] Fix accidental doctest breakage.

---
 doc/modules/cross_validation.rst                       | 10 +++++-----
 doc/tutorial/statistical_inference/model_selection.rst |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 441fe445c691d..f673089862a45 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -106,13 +106,13 @@ time)::
     ...    clf, iris.data, iris.target, cv=5)
     ...
     >>> scores                                              # doctest: +ELLIPSIS
-    array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])
+    array([ 0.9666...,  1.        ,  0.9666...,  0.9666...,  1.        ])
 
 The mean score and the standard deviation of the score estimate are hence given
 by::
 
     >>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
-    Accuracy: 0.97 (+/- 0.07)
+    Accuracy: 0.98 (+/- 0.03)
 
 By default, the score computed at each CV iteration is the ``score``
 method of the estimator. It is possible to change this by using the
@@ -122,7 +122,7 @@ scoring parameter::
     >>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
     ...                                  scoring='f1')
     ...                                                     # doctest: +ELLIPSIS
-    array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.  ...])
+    array([ 0.9665...,  1.        ,  0.9665...,  0.9665...,  1.        ])
 
 See :ref:`scoring_parameter` for details. In the case of the Iris
 dataset, the samples are balanced across target
@@ -206,8 +206,8 @@ two unbalanced classes::
     >>> skf = StratifiedKFold(labels, 2)
     >>> for train, test in skf:
     ...     print("%s %s" % (train, test))
-    [1 4 6] [0 2 3 5]
-    [0 2 3 5] [1 4 6]
+    [2 4 5 6] [0 1 3]
+    [0 1 3 5] [2 4 6]
 
 
 Leave-One-Out - LOO
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index b069c31d5ec69..8200bf0b9ecf2 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -146,7 +146,7 @@ estimator during the construction and exposes an estimator API::
     >>> clf.fit(X_digits[:1000], y_digits[:1000])        # doctest: +ELLIPSIS
     GridSearchCV(cv=None,...
     >>> clf.best_score_                                  # doctest: +ELLIPSIS
-    0.9889...
+    0.9272...
     >>> clf.best_estimator_.gamma
     9.9999999999999995e-07
 
@@ -164,7 +164,7 @@ a stratified 3-fold.
     ::
 
        >>> cross_validation.cross_val_score(clf, X_digits, y_digits)
-       array([ 0.97996661,  0.98163606,  0.98330551])
+       array([ 0.93456376,  0.95805369,  0.93624161])
 
 Two cross-validation loops are performed in parallel: one by the
 :class:`GridSearchCV` estimator to set `gamma` and the other one by

From 535829421930a87c126400b9d3dff888e788bc68 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 14:43:20 +0200
Subject: [PATCH 3/6] Instead of linking to NB, explain the problem inside the
 test itself.

I'm using a smaller number of examples from the digits dataset because
that cuts down test execution time for me from 17s to 4s, and still
yields very similar results.
---
 sklearn/tests/test_cross_validation.py | 35 ++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 84aec0c7cd0df..45d986a7ce524 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -648,11 +648,36 @@ def test_stratified_kfold_preserve_order():  # see #2372
 
 
 def test_stratified_kfold_preserve_order_with_digits():  # see #2372
-    # A regression test, taken from
-    # http://nbviewer.ipython.org/urls/raw.github.com/ogrisel/notebooks/master/Non%2520IID%2520cross-validation.ipynb
-    digits = load_digits()
-    X, y = digits.data, digits.target
+    # The digits samples are dependent as they are apparently grouped
+    # by authors although we don't have any information on the groups
+    # segment locations for this data. We can highlight this fact by
+    # computing k-fold cross-validation with and without shuffling: we
+    # observe that the shuffling case makes the IID assumption and is
+    # therefore too optimistic: it estimates a much higher accuracy
+    # (around 0.965) than the non-shuffling variant (around
+    # 0.905).
+
+    digits = load_digits()
+    X, y = digits.data[:800], digits.target[:800]
 
     model = SVC(C=10, gamma=0.005)
+    n = len(y)
+
+    cv = cval.KFold(n, 5, shuffle=False)
+    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    cv = cval.KFold(n, 5, shuffle=True, random_state=2)
+    assert_greater(cval.cross_val_score(model, X, y, cv=cv).mean(), 0.95)
+
+    # Similarly, StratifiedKFold should try to shuffle the data as few
+    # as possible (while respecting the balanced class constraints)
+    # and thus be able to detect the dependency by not overestimating
+    # the CV score either:
     cv = cval.StratifiedKFold(y, 5)
-    assert cval.cross_val_score(model, X, y, cv=cv, n_jobs=-1).mean() < 0.91
+    assert_greater(0.91, cval.cross_val_score(model, X, y, cv=cv).mean())

From d9fa475af7ba36bbc0fac374ce733dbcb3c6c3ea Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 16:45:06 +0200
Subject: [PATCH 4/6] Avoid list, preallocate a numpy array for indices
 instead.

It appears that the sorted() call is unnecessary here.
---
 sklearn/cross_validation.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 3e32baed243ad..02c85e0566575 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -386,16 +386,18 @@ def __init__(self, y, n_folds=3, indices=None):
         self.y = y
 
     def _iter_test_indices(self):
-        idx_cls = []
-        for cls in unique(self.y):
-            idx_cls.append(np.where(self.y == cls)[0])
+        idx_per_label = []
+        for label in unique(self.y):
+            idx_per_label.append(np.where(self.y == label)[0])
 
+        idxs = np.empty(len(self.y) / self.n_folds, dtype=np.int)
         for i in range(self.n_folds):
-            idxs = []
-            for idx in idx_cls:
+            j = 0
+            for idx in idx_per_label:
                 len_idx = len(idx) / self.n_folds
-                idxs.extend(idx[i * len_idx:(i + 1) * len_idx])
-            yield sorted(idxs)
+                idxs[j:j + len_idx] = idx[i * len_idx:(i + 1) * len_idx]
+                j += len_idx
+            yield idxs[:j]
 
     def __repr__(self):
         return '%s.%s(labels=%s, n_folds=%i)' % (

From 5a084bd0243380072e307f9111e54b262d478f1e Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Thu, 19 Sep 2013 17:05:22 +0200
Subject: [PATCH 5/6] Update comment with numbers for when we run with 800
 samples.

---
 sklearn/tests/test_cross_validation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 45d986a7ce524..024a8e7b143f8 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -654,8 +654,7 @@ def test_stratified_kfold_preserve_order_with_digits():  # see #2372
     # computing k-fold cross-validation with and without shuffling: we
     # observe that the shuffling case makes the IID assumption and is
     # therefore too optimistic: it estimates a much higher accuracy
-    # (around 0.965) than the non-shuffling variant (around
-    # 0.905).
+    # (around 0.96) than the non-shuffling variant (around 0.86).
 
     digits = load_digits()
     X, y = digits.data[:800], digits.target[:800]

From f01dc1835ed1e24df0a9aca944757c34ef86a448 Mon Sep 17 00:00:00 2001
From: Daniel Nouri
Date: Fri, 20 Sep 2013 12:02:35 +0200
Subject: [PATCH 6/6] Add entry for #2372 to whats_new.rst

---
 doc/whats_new.rst | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b641540c5fdbb..20d4194088e6c 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -41,6 +41,10 @@ Changelog
    - Memory improvements of extra trees and random forest by
      `Arnaud Joly`_.
 
+   - Changed :class:`cross_validation.StratifiedKFold` to try and
+     preserve as much of the original ordering of samples as possible.
+     By `Daniel Nouri`_ and `Olivier Grisel`_.
+
 API changes summary
 -------------------
 
@@ -778,7 +782,7 @@ List of contributors for release 0.13 by number of commits.
  * 17  `Fabian Pedregosa`_
  * 17  Nelle Varoquaux
  * 16  `Christian Osendorfer`_
- * 14  Daniel Nouri
+ * 14  `Daniel Nouri`_
  * 13  `Virgile Fritsch`_
  * 13  syhw
  * 12  `Satrajit Ghosh`_
@@ -2285,3 +2289,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 
 .. _Kyle Kastner: http://kastnerkyle.github.io
 .. _@FedericoV: https://github.com/FedericoV/
+
+.. _Daniel Nouri: http://danielnouri.org
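
For readers who want to experiment with the splitting strategy these patches
introduce, the per-class chunking can be sketched as standalone NumPy code.
This is a simplified illustration, not the library implementation: the helper
name ``stratified_test_folds`` is invented for this sketch, there is no input
validation, and, as in the patched slicing, each class contributes only
``count // n_folds`` samples per fold (remainders are dropped from the test
side)::

    import numpy as np


    def stratified_test_folds(y, n_folds):
        # Collect each class's sample indices in their original order.
        y = np.asarray(y)
        idx_per_label = [np.where(y == label)[0] for label in np.unique(y)]

        for i in range(n_folds):
            fold = []
            for idx in idx_per_label:
                # The i-th fold takes the i-th contiguous chunk of each
                # class's indices; floor division drops any remainder.
                len_idx = len(idx) // n_folds
                fold.extend(idx[i * len_idx:(i + 1) * len_idx])
            # Sorted here for readable output; the patched class skips
            # this step, which PATCH 4 deems unnecessary in context.
            yield np.sort(fold)

With the array from the regression test above, ``y = np.array([3, 2, 1, 3, 2,
3] * 2)`` and two folds, the sketch yields the test folds ``[0, 1, 2, 3, 4,
5]`` and ``[6, 7, 8, 9, 10, 11]``, matching
``test_stratified_kfold_preserve_order``.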