From dea5a0e361472cec203124292e02d8280bf9b73e Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Thu, 3 Mar 2016 14:58:06 -0500
Subject: [PATCH 01/13] fix sampling in stratified shuffle split, break tests
 that test sampling.

---
 sklearn/model_selection/_split.py | 39 ++++++++++++++-----------------
 sklearn/utils/random.py           |  6 ++---
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 5989edd30b109..146bc36971c57 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1098,6 +1098,21 @@ def _iter_indices(self, X, y, labels):
             yield train, test
 
 
+def _approximate_mode(class_counts, n_draws):
+    # this computes a bad approximation to the mode of the
+    # multivariate hypergeometric given by class_counts and n_draws
+    continuous = n_draws * class_counts / class_counts.sum()
+    # floored means we don't overshoot n_samples, but probably undershoot
+    floored = np.floor(continuous)
+    # we add samples according to how much "left over" probability
+    # they had, until we arrive at n_samples
+    remainder = continuous - floored
+    sorting = np.argsort(remainder)[::-1]
+    need_to_add = int(n_draws - floored.sum())
+    floored[sorting[:need_to_add]] += 1
+    return floored.astype(np.int)
+
+
 class StratifiedShuffleSplit(BaseShuffleSplit):
     """Stratified ShuffleSplit cross-validator
 
@@ -1181,10 +1196,9 @@ def _iter_indices(self, X, y, labels=None):
                              (n_test, n_classes))
 
         rng = check_random_state(self.random_state)
-        p_i = class_counts / float(n_samples)
-        n_i = np.round(n_train * p_i).astype(int)
-        t_i = np.minimum(class_counts - n_i,
-                         np.round(n_test * p_i).astype(int))
+        n_i = _approximate_mode(class_counts, n_train)
+        class_counts_remaining = class_counts - n_i
+        t_i = _approximate_mode(class_counts_remaining, n_test)
 
         for _ in range(self.n_splits):
             train = []
@@ -1196,23 +1210,6 @@ def _iter_indices(self, X, y, labels=None):
 
                 train.extend(perm_indices_class_i[:n_i[i]])
                 test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
-
-            # Because of rounding issues (as n_train and n_test are not
-            # dividers of the number of elements per class), we may end
-            # up here with less samples in train and test than asked for.
-            if len(train) + len(test) < n_train + n_test:
-                # We complete by affecting randomly the missing indexes
-                missing_indices = np.where(bincount(train + test,
-                                                    minlength=len(y)) == 0)[0]
-                missing_indices = rng.permutation(missing_indices)
-                n_missing_train = n_train - len(train)
-                n_missing_test = n_test - len(test)
-
-                if n_missing_train > 0:
-                    train.extend(missing_indices[:n_missing_train])
-                if n_missing_test > 0:
-                    test.extend(missing_indices[-n_missing_test:])
-
             train = rng.permutation(train)
             test = rng.permutation(test)
 
diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py
index 34738d8653b74..5805f9be2c8fa 100644
--- a/sklearn/utils/random.py
+++ b/sklearn/utils/random.py
@@ -123,7 +123,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None):
         if pop_size is 0:
             raise ValueError("a must be non-empty")
 
-    if None != p:
+    if p is not None:
         p = np.array(p, dtype=np.double, ndmin=1, copy=False)
         if p.ndim != 1:
             raise ValueError("p must be 1-dimensional")
@@ -142,7 +142,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None):
 
     # Actual sampling
     if replace:
-        if None != p:
+        if p is not None:
             cdf = p.cumsum()
             cdf /= cdf[-1]
             uniform_samples = random_state.random_sample(shape)
@@ -156,7 +156,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None):
             raise ValueError("Cannot take a larger sample than "
                              "population when 'replace=False'")
 
-        if None != p:
+        if p is not None:
             if np.sum(p > 0) < size:
                 raise ValueError("Fewer non-zero entries in p than size")
             n_uniq = 0

From 3d13d3767ea3c8d8a4878d7dfea81aa4ce4cad88 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Fri, 29 Jul 2016 15:42:10 -0400
Subject: [PATCH 02/13] get rid of rounding issues

---
 sklearn/model_selection/tests/test_split.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index d28148efe6956..c1e3ddb4e8196 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -538,7 +538,7 @@ def test_stratified_shuffle_split_init():
 def test_stratified_shuffle_split_iter():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
-          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
           np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
           np.array([-1] * 800 + [1] * 50)
           ]
@@ -594,8 +594,8 @@ def assert_counts_are_ok(idx_counts, p):
         assert_equal(n_splits_actual, n_splits)
 
         n_train, n_test = _validate_shuffle_split(n_samples,
-                                                  test_size=1./n_folds,
-                                                  train_size=1.-(1./n_folds))
+                                                  test_size=1. / n_folds,
+                                                  train_size=1. - (1. / n_folds))
 
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)
@@ -656,7 +656,7 @@ def test_label_shuffle_split():
     for l in labels:
         X = y = np.ones(len(l))
         n_splits = 6
-        test_size = 1./3
+        test_size = 1. / 3
         slo = LabelShuffleSplit(n_splits, test_size=test_size, random_state=0)
 
         # Make sure the repr works

From 2644a6e30d2f454ca48b7b2dfb595e239ec7f309 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Fri, 26 Aug 2016 11:57:58 -0400
Subject: [PATCH 03/13] fixed the randomization to make everything nice and iid
 etc

---
 sklearn/model_selection/_split.py           | 20 +++++++++++++-------
 sklearn/model_selection/tests/test_split.py |  4 ++--
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 146bc36971c57..3e74e538ddd2b 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1098,7 +1098,7 @@ def _iter_indices(self, X, y, labels):
             yield train, test
 
 
-def _approximate_mode(class_counts, n_draws):
+def _approximate_mode(class_counts, n_draws, rng):
     # this computes a bad approximation to the mode of the
     # multivariate hypergeometric given by class_counts and n_draws
     continuous = n_draws * class_counts / class_counts.sum()
@@ -1106,10 +1106,13 @@ def _approximate_mode(class_counts, n_draws):
     floored = np.floor(continuous)
     # we add samples according to how much "left over" probability
     # they had, until we arrive at n_samples
-    remainder = continuous - floored
-    sorting = np.argsort(remainder)[::-1]
     need_to_add = int(n_draws - floored.sum())
-    floored[sorting[:need_to_add]] += 1
+    remainder = continuous - floored
+    if need_to_add > 0:
+        remainder /= remainder.sum()
+        choices = rng.choice(range(len(class_counts)), size=need_to_add, replace=False, p=remainder)
+        for choice in choices:
+            floored[choice] += 1
     return floored.astype(np.int)
 
 
@@ -1196,11 +1199,14 @@ def _iter_indices(self, X, y, labels=None):
                              (n_test, n_classes))
 
         rng = check_random_state(self.random_state)
-        n_i = _approximate_mode(class_counts, n_train)
-        class_counts_remaining = class_counts - n_i
-        t_i = _approximate_mode(class_counts_remaining, n_test)
 
         for _ in range(self.n_splits):
+            # if there are ties in the class-counts, we want
+            # to make sure to break them anew in each iteration
+            n_i = _approximate_mode(class_counts, n_train, rng)
+            class_counts_remaining = class_counts - n_i
+            t_i = _approximate_mode(class_counts_remaining, n_test, rng)
+
             train = []
             test = []
 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index c1e3ddb4e8196..c92d3672597bc 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -572,8 +572,8 @@ def assert_counts_are_ok(idx_counts, p):
         threshold = 0.05 / n_splits
         bf = stats.binom(n_splits, p)
         for count in idx_counts:
-            p = bf.pmf(count)
-            assert_true(p > threshold,
+            prob = bf.pmf(count)
+            assert_true(prob > threshold,
                         "An index is not drawn with chance corresponding "
                         "to even draws")
 

From 143addff5accd70b97be8e9bf9223d8a794c2703 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Mon, 29 Aug 2016 12:05:15 -0400
Subject: [PATCH 04/13] old numpy choice compatibility.

---
 sklearn/model_selection/_split.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 3e74e538ddd2b..30127a9a7d88f 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -30,6 +30,7 @@
 from ..externals.six.moves import zip
 from ..utils.fixes import bincount
 from ..utils.fixes import signature
+from ..utils.random import choice
 from ..base import _pprint
 from ..gaussian_process.kernels import Kernel as GPKernel
 
@@ -1110,9 +1111,10 @@ def _approximate_mode(class_counts, n_draws, rng):
     remainder = continuous - floored
     if need_to_add > 0:
         remainder /= remainder.sum()
-        choices = rng.choice(range(len(class_counts)), size=need_to_add, replace=False, p=remainder)
-        for choice in choices:
-            floored[choice] += 1
+        choices = choice(range(len(class_counts)), size=need_to_add,
+                         replace=False, p=remainder, random_state=rng)
+        for pick in choices:
+            floored[pick] += 1
     return floored.astype(np.int)
 
 

From f227e8c436b7da7a401d0f6a82e15aa99c4a3c41 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Mon, 29 Aug 2016 15:11:30 -0400
Subject: [PATCH 05/13] don't draw at random, but break ties randomly. This now
 passes all tests but is slightly complicated.

---
 sklearn/model_selection/_split.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 30127a9a7d88f..7e5d55495fea2 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1108,13 +1108,23 @@ def _approximate_mode(class_counts, n_draws, rng):
     # we add samples according to how much "left over" probability
     # they had, until we arrive at n_samples
     need_to_add = int(n_draws - floored.sum())
-    remainder = continuous - floored
     if need_to_add > 0:
-        remainder /= remainder.sum()
-        choices = choice(range(len(class_counts)), size=need_to_add,
-                         replace=False, p=remainder, random_state=rng)
-        for pick in choices:
-            floored[pick] += 1
+        remainder = continuous - floored
+        values = np.sort(np.unique(remainder))[::-1]
+        # add according to remainder, but break ties
+        # randomly to avoid biases
+        for value in values:
+            inds, = np.where(remainder == value)
+            # if fewer than len(inds) samples are still needed,
+            # draw that many at random from inds.
+            # otherwise add all of inds and
+            # go to the next value
+            add_now = min(len(inds), need_to_add)
+            inds = choice(inds, size=add_now, replace=False, random_state=rng)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                    break
     return floored.astype(np.int)
 
 

From 7fdf8527d81e978f28cd8067f31d9a02075eb96f Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Mon, 29 Aug 2016 15:16:52 -0400
Subject: [PATCH 06/13] added docstring to approximate hypergeometric mode
 computation

---
 sklearn/model_selection/_split.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 7e5d55495fea2..93a5f483d9ada 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1100,6 +1100,30 @@ def _iter_indices(self, X, y, labels):
 
 
 def _approximate_mode(class_counts, n_draws, rng):
+    """Computes approximate mode of multivariate hypergemetric.
+
+    This is an approximation to the mode of the multivariate
+    hypergeometric given by class_counts and n_draws.
+    It shouldn't be off by more than one.
+
+    It is the most likely outcome of drawing n_draws many
+    samples from the population given by class_counts.
+
+    Parameters
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : random state
+        Used to break ties.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+    """
     # this computes a bad approximation to the mode of the
     # multivariate hypergeometric given by class_counts and n_draws
     continuous = n_draws * class_counts / class_counts.sum()

From 7c59f2faf423e750c081f2d319670748363fe2de Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Mon, 29 Aug 2016 15:35:43 -0400
Subject: [PATCH 07/13] made test stronger, added a very explicit regression
 test (that fails in current master!!)

---
 sklearn/model_selection/tests/test_split.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index c92d3672597bc..273eeb0e327ff 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -535,6 +535,17 @@ def test_stratified_shuffle_split_init():
                   StratifiedShuffleSplit(test_size=2).split(X, y))
 
 
+def test_stratified_shuffle_split_respects_test_size():
+    y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2])
+    test_size = 5
+    train_size = 10
+    sss = StratifiedShuffleSplit(6, test_size=test_size, train_size=train_size,
+                                 random_state=0).split(np.ones(len(y)), y)
+    for train, test in sss:
+        assert_equal(len(train), train_size)
+        assert_equal(len(test), test_size)
+
+
 def test_stratified_shuffle_split_iter():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
@@ -546,6 +557,9 @@ def test_stratified_shuffle_split_iter():
     for y in ys:
         sss = StratifiedShuffleSplit(6, test_size=0.33,
                                      random_state=0).split(np.ones(len(y)), y)
+        # this is how test-size is computed internally in _validate_shuffle_split
+        test_size = np.ceil(0.33 * len(y))
+        train_size = len(y) - test_size
         for train, test in sss:
             assert_array_equal(np.unique(y[train]), np.unique(y[test]))
             # Checks if folds keep classes proportions
@@ -556,7 +570,9 @@ def test_stratified_shuffle_split_iter():
                                   return_inverse=True)[1]) /
                       float(len(y[test])))
             assert_array_almost_equal(p_train, p_test, 1)
-            assert_equal(y[train].size + y[test].size, y.size)
+            assert_equal(len(train) + len(test), y.size)
+            assert_equal(len(train), train_size)
+            assert_equal(len(test), test_size)
             assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
 
 

From 1903e85ab989a0c8dcc387e544bd032d1ea5e515 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Tue, 6 Sep 2016 13:49:58 -0400
Subject: [PATCH 08/13] backport StratifiedShuffleSplit fix to
 cross_validation.py

---
 sklearn/cross_validation.py            | 101 ++++++++++++++++++-------
 sklearn/tests/test_cross_validation.py |  20 +++--
 2 files changed, 85 insertions(+), 36 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 508b0460ec154..f63124ab3aba7 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -27,6 +27,7 @@
 from .utils.validation import (_is_arraylike, _num_samples,
                                column_or_1d)
 from .utils.multiclass import type_of_target
+from .utils.random import choice
 from .externals.joblib import Parallel, delayed, logger
 from .externals.six import with_metaclass
 from .externals.six.moves import zip
@@ -414,9 +415,9 @@ def __init__(self, labels, n_folds=3):
 
         if n_folds > n_labels:
             raise ValueError(
-                    ("Cannot have number of folds n_folds={0} greater"
-                     " than the number of labels: {1}.").format(n_folds,
-                                                                n_labels))
+                ("Cannot have number of folds n_folds={0} greater"
+                 " than the number of labels: {1}.").format(n_folds,
+                                                            n_labels))
 
         # Weight labels by their number of occurrences
         n_samples_per_label = np.bincount(labels)
@@ -906,6 +907,59 @@ def _validate_shuffle_split(n, test_size, train_size):
     return int(n_train), int(n_test)
 
 
+def _approximate_mode(class_counts, n_draws, rng):
+    """Computes approximate mode of multivariate hypergemetric.
+
+    This is an approximation to the mode of the multivariate
+    hypergeometric given by class_counts and n_draws.
+    It shouldn't be off by more than one.
+
+    It is the most likely outcome of drawing n_draws many
+    samples from the population given by class_counts.
+
+    Parameters
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : random state
+        Used to break ties.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+    """
+    # this computes a bad approximation to the mode of the
+    # multivariate hypergeometric given by class_counts and n_draws
+    continuous = n_draws * class_counts / class_counts.sum()
+    # floored means we don't overshoot n_samples, but probably undershoot
+    floored = np.floor(continuous)
+    # we add samples according to how much "left over" probability
+    # they had, until we arrive at n_samples
+    need_to_add = int(n_draws - floored.sum())
+    if need_to_add > 0:
+        remainder = continuous - floored
+        values = np.sort(np.unique(remainder))[::-1]
+        # add according to remainder, but break ties
+        # randomly to avoid biases
+        for value in values:
+            inds, = np.where(remainder == value)
+            # if fewer than len(inds) samples are still needed,
+            # draw that many at random from inds.
+            # otherwise add all of inds and
+            # go to the next value
+            add_now = min(len(inds), need_to_add)
+            inds = choice(inds, size=add_now, replace=False, random_state=rng)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                    break
+    return floored.astype(np.int)
+
+
 class StratifiedShuffleSplit(BaseShuffleSplit):
     """Stratified ShuffleSplit cross validation iterator
 
@@ -991,39 +1045,28 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
     def _iter_indices(self):
         rng = check_random_state(self.random_state)
         cls_count = bincount(self.y_indices)
-        p_i = cls_count / float(self.n)
-        n_i = np.round(self.n_train * p_i).astype(int)
-        t_i = np.minimum(cls_count - n_i,
-                         np.round(self.n_test * p_i).astype(int))
+        #p_i = cls_count / float(self.n)
+        #n_i = np.round(self.n_train * p_i).astype(int)
+        #t_i = np.minimum(cls_count - n_i,
+        #                 np.round(self.n_test * p_i).astype(int))
 
         for n in range(self.n_iter):
+            # if there are ties in the class-counts, we want
+            # to make sure to break them anew in each iteration
+            n_i = _approximate_mode(cls_count, self.n_train, rng)
+            class_counts_remaining = cls_count - n_i
+            t_i = _approximate_mode(class_counts_remaining, self.n_test, rng)
+
             train = []
             test = []
 
-            for i, cls in enumerate(self.classes):
+            for i, _ in enumerate(self.classes):
                 permutation = rng.permutation(cls_count[i])
-                cls_i = np.where((self.y == cls))[0][permutation]
-
-                train.extend(cls_i[:n_i[i]])
-                test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]])
-
-            # Because of rounding issues (as n_train and n_test are not
-            # dividers of the number of elements per class), we may end
-            # up here with less samples in train and test than asked for.
-            if len(train) + len(test) < self.n_train + self.n_test:
-                # We complete by affecting randomly the missing indexes
-                missing_idx = np.where(bincount(train + test,
-                                                minlength=len(self.y)) == 0,
-                                       )[0]
-                missing_idx = rng.permutation(missing_idx)
-                n_missing_train = self.n_train - len(train)
-                n_missing_test = self.n_test - len(test)
-
-                if n_missing_train > 0:
-                    train.extend(missing_idx[:n_missing_train])
-                if n_missing_test > 0:
-                    test.extend(missing_idx[-n_missing_test:])
+                perm_indices_class_i = np.where(
+                    (i == self.y_indices))[0][permutation]
 
+                train.extend(perm_indices_class_i[:n_i[i]])
+                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
             train = rng.permutation(train)
             test = rng.permutation(test)
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 0e03cad783e53..3a0c0b1ed1424 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -479,7 +479,7 @@ def test_stratified_shuffle_split_init():
 def test_stratified_shuffle_split_iter():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
-          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
           np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
           np.array([-1] * 800 + [1] * 50)
           ]
@@ -487,16 +487,22 @@ def test_stratified_shuffle_split_iter():
     for y in ys:
         sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                           random_state=0)
+        test_size = np.ceil(0.33 * len(y))
+        train_size = len(y) - test_size
         for train, test in sss:
             assert_array_equal(np.unique(y[train]), np.unique(y[test]))
             # Checks if folds keep classes proportions
-            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
-                       / float(len(y[train])))
-            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
-                      / float(len(y[test])))
+            p_train = (np.bincount(np.unique(y[train],
+                                   return_inverse=True)[1]) /
+                       float(len(y[train])))
+            p_test = (np.bincount(np.unique(y[test],
+                                  return_inverse=True)[1]) /
+                      float(len(y[test])))
             assert_array_almost_equal(p_train, p_test, 1)
-            assert_equal(y[train].size + y[test].size, y.size)
-            assert_array_equal(np.intersect1d(train, test), [])
+            assert_equal(len(train) + len(test), y.size)
+            assert_equal(len(train), train_size)
+            assert_equal(len(test), test_size)
+            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
 
 
 def test_stratified_shuffle_split_even():

From ae2015842b38c00cbf8db19d6e0d1070b2e39913 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Tue, 6 Sep 2016 13:54:14 -0400
Subject: [PATCH 09/13] added whatsnew.

---
 doc/whats_new.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 7ac7c5fcc8241..a450c175ae8fc 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -397,6 +397,11 @@ Bug fixes
     - Fix :class:`linear_model.ElasticNet` sparse decision function to match
       output with dense in the multioutput case.
 
+    - Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to
+      return splits of size ``train_size`` and ``test_size`` in all cases
+      (`#6472 <https://github.com/scikit-learn/scikit-learn/pull/6472>`_).
+      By `Andreas Müller`_.
+
 API changes summary
 -------------------
 

From e2883e3e722c901b5fb5340518d76f4377dbeba6 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Tue, 6 Sep 2016 14:48:05 -0400
Subject: [PATCH 10/13] fix pep8

---
 sklearn/cross_validation.py                 | 4 ----
 sklearn/model_selection/tests/test_split.py | 8 ++++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index f63124ab3aba7..c4c95c49dce39 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1045,10 +1045,6 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
     def _iter_indices(self):
         rng = check_random_state(self.random_state)
         cls_count = bincount(self.y_indices)
-        #p_i = cls_count / float(self.n)
-        #n_i = np.round(self.n_train * p_i).astype(int)
-        #t_i = np.minimum(cls_count - n_i,
-        #                 np.round(self.n_test * p_i).astype(int))
 
         for n in range(self.n_iter):
             # if there are ties in the class-counts, we want
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index 273eeb0e327ff..f127ab15464d0 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -557,7 +557,8 @@ def test_stratified_shuffle_split_iter():
     for y in ys:
         sss = StratifiedShuffleSplit(6, test_size=0.33,
                                      random_state=0).split(np.ones(len(y)), y)
-        # this is how test-size is computed internally in _validate_shuffle_split
+        # this is how test-size is computed internally
+        # in _validate_shuffle_split
         test_size = np.ceil(0.33 * len(y))
         train_size = len(y) - test_size
         for train, test in sss:
@@ -609,9 +610,8 @@ def assert_counts_are_ok(idx_counts, p):
                     counter[id] += 1
         assert_equal(n_splits_actual, n_splits)
 
-        n_train, n_test = _validate_shuffle_split(n_samples,
-                                                  test_size=1. / n_folds,
-                                                  train_size=1. - (1. / n_folds))
+        n_train, n_test = _validate_shuffle_split(
+            n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds))
 
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)

From e982ace64e1a3e194d134484096be972f5a29435 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Tue, 6 Sep 2016 17:18:28 -0400
Subject: [PATCH 11/13] fix typo in docstring

---
 sklearn/cross_validation.py       | 2 +-
 sklearn/model_selection/_split.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c4c95c49dce39..010f7106a4870 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -908,7 +908,7 @@ def _validate_shuffle_split(n, test_size, train_size):
 
 
 def _approximate_mode(class_counts, n_draws, rng):
-    """Computes approximate mode of multivariate hypergemetric.
+    """Computes approximate mode of multivariate hypergeometric.
 
     This is an approximation to the mode of the multivariate
     hypergeometric given by class_counts and n_draws.
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 93a5f483d9ada..337e0fe5d5692 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1100,7 +1100,7 @@ def _iter_indices(self, X, y, labels):
 
 
 def _approximate_mode(class_counts, n_draws, rng):
-    """Computes approximate mode of multivariate hypergemetric.
+    """Computes approximate mode of multivariate hypergeometric.
 
     This is an approximation to the mode of the multivariate
     hypergeometric given by class_counts and n_draws.

From c865c83efa4bee283026b9a7fe5f18799ecac61a Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Thu, 8 Sep 2016 10:54:41 -0400
Subject: [PATCH 12/13] added another example to test, indentation

---
 sklearn/model_selection/_split.py           | 2 +-
 sklearn/model_selection/tests/test_split.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 337e0fe5d5692..27e9d6edf529f 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1148,7 +1148,7 @@ def _approximate_mode(class_counts, n_draws, rng):
             floored[inds] += 1
             need_to_add -= add_now
             if need_to_add == 0:
-                    break
+                break
     return floored.astype(np.int)
 
 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index f127ab15464d0..d4130182b0e10 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -551,7 +551,8 @@ def test_stratified_shuffle_split_iter():
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
           np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
           np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
-          np.array([-1] * 800 + [1] * 50)
+          np.array([-1] * 800 + [1] * 50),
+          np.concatenate([[i] * (100 + i) for i in range(11)])
           ]
 
     for y in ys:

From 4bf494cc12509d08cb98b6aa0caae6f2b969c5a7 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Thu, 8 Sep 2016 16:33:28 -0400
Subject: [PATCH 13/13] add doctests to _approximate_mode

---
 sklearn/model_selection/_split.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 27e9d6edf529f..a476265285437 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1123,6 +1123,20 @@ def _approximate_mode(class_counts, n_draws, rng):
     sampled_classes : ndarray of int
         Number of samples drawn from each class.
         np.sum(sampled_classes) == n_draws
+
+    Examples
+    --------
+    >>> from sklearn.model_selection._split import _approximate_mode
+    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
+    array([2, 1])
+    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
+    array([3, 1])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=0)
+    array([0, 1, 1, 0])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=42)
+    array([1, 1, 0, 0])
     """
     # this computes a bad approximation to the mode of the
     # multivariate hypergeometric given by class_counts and n_draws