From dea5a0e361472cec203124292e02d8280bf9b73e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 3 Mar 2016 14:58:06 -0500 Subject: [PATCH 01/13] fix sampling in stratified shuffle split, break tests that test sampling. --- sklearn/model_selection/_split.py | 39 ++++++++++++++----------------- sklearn/utils/random.py | 6 ++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 5989edd30b109..146bc36971c57 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1098,6 +1098,21 @@ def _iter_indices(self, X, y, labels): yield train, test +def _approximate_mode(class_counts, n_draws): + # this computes a bad approximation to the mode of the + # multivariate hypergeometric given by class_counts and n_draws + continuous = n_draws * class_counts / class_counts.sum() + # floored means we don't overshoot n_samples, but probably undershoot + floored = np.floor(continuous) + # we add samples according to how much "left over" probability + # they had, until we arrive at n_samples + remainder = continuous - floored + sorting = np.argsort(remainder)[::-1] + need_to_add = int(n_draws - floored.sum()) + floored[sorting[:need_to_add]] += 1 + return floored.astype(np.int) + + class StratifiedShuffleSplit(BaseShuffleSplit): """Stratified ShuffleSplit cross-validator @@ -1181,10 +1196,9 @@ def _iter_indices(self, X, y, labels=None): (n_test, n_classes)) rng = check_random_state(self.random_state) - p_i = class_counts / float(n_samples) - n_i = np.round(n_train * p_i).astype(int) - t_i = np.minimum(class_counts - n_i, - np.round(n_test * p_i).astype(int)) + n_i = _approximate_mode(class_counts, n_train) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test) for _ in range(self.n_splits): train = [] @@ -1196,23 +1210,6 @@ def _iter_indices(self, X, y, labels=None): train.extend(perm_indices_class_i[:n_i[i]]) test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) - - # Because of rounding issues (as n_train and n_test are not - # dividers of the number of elements per class), we may end - # up here with less samples in train and test than asked for. - if len(train) + len(test) < n_train + n_test: - # We complete by affecting randomly the missing indexes - missing_indices = np.where(bincount(train + test, - minlength=len(y)) == 0)[0] - missing_indices = rng.permutation(missing_indices) - n_missing_train = n_train - len(train) - n_missing_test = n_test - len(test) - - if n_missing_train > 0: - train.extend(missing_indices[:n_missing_train]) - if n_missing_test > 0: - test.extend(missing_indices[-n_missing_test:]) - train = rng.permutation(train) test = rng.permutation(test) diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 34738d8653b74..5805f9be2c8fa 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -123,7 +123,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None): if pop_size is 0: raise ValueError("a must be non-empty") - if None != p: + if p is not None: p = np.array(p, dtype=np.double, ndmin=1, copy=False) if p.ndim != 1: raise ValueError("p must be 1-dimensional") @@ -142,7 +142,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None): # Actual sampling if replace: - if None != p: + if p is not None: cdf = p.cumsum() cdf /= cdf[-1] uniform_samples = random_state.random_sample(shape) @@ -156,7 +156,7 @@ def choice(a, size=None, replace=True, p=None, random_state=None): raise ValueError("Cannot take a larger sample than " "population when 'replace=False'") - if None != p: + if p is not None: if np.sum(p > 0) < size: raise ValueError("Fewer non-zero entries in p than size") n_uniq = 0 From 3d13d3767ea3c8d8a4878d7dfea81aa4ce4cad88 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 29 Jul 2016 15:42:10 -0400 Subject: [PATCH 02/13] get rid of rounding issues --- sklearn/model_selection/tests/test_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d28148efe6956..c1e3ddb4e8196 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -538,7 +538,7 @@ def test_stratified_shuffle_split_init(): def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), np.array([-1] * 800 + [1] * 50) ] @@ -594,8 +594,8 @@ def assert_counts_are_ok(idx_counts, p): assert_equal(n_splits_actual, n_splits) n_train, n_test = _validate_shuffle_split(n_samples, - test_size=1./n_folds, - train_size=1.-(1./n_folds)) + test_size=1. / n_folds, + train_size=1. - (1. / n_folds)) assert_equal(len(train), n_train) assert_equal(len(test), n_test) @@ -656,7 +656,7 @@ def test_label_shuffle_split(): for l in labels: X = y = np.ones(len(l)) n_splits = 6 - test_size = 1./3 + test_size = 1. / 3 slo = LabelShuffleSplit(n_splits, test_size=test_size, random_state=0) # Make sure the repr works From 2644a6e30d2f454ca48b7b2dfb595e239ec7f309 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 26 Aug 2016 11:57:58 -0400 Subject: [PATCH 03/13] fixed the randomization to make everything nice and iid etc --- sklearn/model_selection/_split.py | 20 +++++++++++++------- sklearn/model_selection/tests/test_split.py | 4 ++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 146bc36971c57..3e74e538ddd2b 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1098,7 +1098,7 @@ def _iter_indices(self, X, y, labels): yield train, test -def _approximate_mode(class_counts, n_draws): +def _approximate_mode(class_counts, n_draws, rng): # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws continuous = n_draws * class_counts / class_counts.sum() @@ -1106,10 +1106,13 @@ def _approximate_mode(class_counts, n_draws): floored = np.floor(continuous) # we add samples according to how much "left over" probability # they had, until we arrive at n_samples - remainder = continuous - floored - sorting = np.argsort(remainder)[::-1] need_to_add = int(n_draws - floored.sum()) - floored[sorting[:need_to_add]] += 1 + remainder = continuous - floored + if need_to_add > 0: + remainder /= remainder.sum() + choices = rng.choice(range(len(class_counts)), size=need_to_add, replace=False, p=remainder) + for choice in choices: + floored[choice] += 1 return floored.astype(np.int) @@ -1196,11 +1199,14 @@ def _iter_indices(self, X, y, labels=None): (n_test, n_classes)) rng = check_random_state(self.random_state) - n_i = _approximate_mode(class_counts, n_train) - class_counts_remaining = class_counts - n_i - t_i = _approximate_mode(class_counts_remaining, n_test) for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + train = [] test = [] diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c1e3ddb4e8196..c92d3672597bc 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -572,8 +572,8 @@ def assert_counts_are_ok(idx_counts, p): threshold = 0.05 / n_splits bf = stats.binom(n_splits, p) for count in idx_counts: - p = bf.pmf(count) - assert_true(p > threshold, + prob = bf.pmf(count) + assert_true(prob > threshold, "An index is not drawn with chance corresponding " "to even draws") From 143addff5accd70b97be8e9bf9223d8a794c2703 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 29 Aug 2016 12:05:15 -0400 Subject: [PATCH 04/13] old numpy choice compatibility. --- sklearn/model_selection/_split.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 3e74e538ddd2b..30127a9a7d88f 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -30,6 +30,7 @@ from ..externals.six.moves import zip from ..utils.fixes import bincount from ..utils.fixes import signature +from ..utils.random import choice from ..base import _pprint from ..gaussian_process.kernels import Kernel as GPKernel @@ -1110,9 +1111,10 @@ def _approximate_mode(class_counts, n_draws, rng): remainder = continuous - floored if need_to_add > 0: remainder /= remainder.sum() - choices = rng.choice(range(len(class_counts)), size=need_to_add, replace=False, p=remainder) - for choice in choices: - floored[choice] += 1 + choices = choice(range(len(class_counts)), size=need_to_add, + replace=False, p=remainder, random_state=rng) + for pick in choices: + floored[pick] += 1 return floored.astype(np.int) From f227e8c436b7da7a401d0f6a82e15aa99c4a3c41 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 29 Aug 2016 15:11:30 -0400 Subject: [PATCH 05/13] don't draw at random, but break ties randomly. This now passes all tests but is slightly complicated. --- sklearn/model_selection/_split.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 30127a9a7d88f..7e5d55495fea2 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1108,13 +1108,23 @@ def _approximate_mode(class_counts, n_draws, rng): # we add samples according to how much "left over" probability # they had, until we arrive at n_samples need_to_add = int(n_draws - floored.sum()) - remainder = continuous - floored if need_to_add > 0: - remainder /= remainder.sum() - choices = choice(range(len(class_counts)), size=need_to_add, - replace=False, p=remainder, random_state=rng) - for pick in choices: - floored[pick] += 1 + remainder = continuous - floored + values = np.sort(np.unique(remainder))[::-1] + # add according to remainder, but break ties + # randomly to avoid biases + for value in values: + inds, = np.where(remainder == value) + # if we need_to_add less than what's in inds + # we draw randomly from them. + # if we need to add more, we add them all and + # go to the next value + add_now = min(len(inds), need_to_add) + inds = choice(inds, size=add_now, replace=False, random_state=rng) + floored[inds] += 1 + need_to_add -= add_now + if need_to_add == 0: + break return floored.astype(np.int) From 7fdf8527d81e978f28cd8067f31d9a02075eb96f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 29 Aug 2016 15:16:52 -0400 Subject: [PATCH 06/13] added docstring to approximate hypergeometric mode computation --- sklearn/model_selection/_split.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 7e5d55495fea2..93a5f483d9ada 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1100,6 +1100,30 @@ def _iter_indices(self, X, y, labels): def _approximate_mode(class_counts, n_draws, rng): + """Computes approximate mode of multivariate hypergemetric. + + This is an approximation to the mode of the multivariate + hypergeometric given by class_counts and n_draws. + It shouldn't be off by more than one. + + It is the mostly likely outcome of drawing n_draws many + samples from the population given by class_counts. + + Parameters + ---------- + class_counts : ndarray of int + Population per class. + n_draws : int + Number of draws (samples to draw) from the overall population. + rng : random state + Used to break ties. + + Returns + ------- + sampled_classes : ndarray of int + Number of samples drawn from each class. + np.sum(sampled_classes) == n_draws + """ # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws continuous = n_draws * class_counts / class_counts.sum() From 7c59f2faf423e750c081f2d319670748363fe2de Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 29 Aug 2016 15:35:43 -0400 Subject: [PATCH 07/13] made test stronger, added a very explicit regression test (that fails in current master!!) --- sklearn/model_selection/tests/test_split.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c92d3672597bc..273eeb0e327ff 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -535,6 +535,17 @@ def test_stratified_shuffle_split_init(): StratifiedShuffleSplit(test_size=2).split(X, y)) +def test_stratified_shuffle_split_respects_test_size(): + y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]) + test_size = 5 + train_size = 10 + sss = StratifiedShuffleSplit(6, test_size=test_size, train_size=train_size, + random_state=0).split(np.ones(len(y)), y) + for train, test in sss: + assert_equal(len(train), train_size) + assert_equal(len(test), test_size) + + def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), @@ -546,6 +557,9 @@ def test_stratified_shuffle_split_iter(): for y in ys: sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split(np.ones(len(y)), y) + # this is how test-size is computed internally in _validate_shuffle_split + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size for train, test in sss: assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions @@ -556,7 +570,9 @@ def test_stratified_shuffle_split_iter(): return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) - assert_equal(y[train].size + y[test].size, y.size) + assert_equal(len(train) + len(test), y.size) + assert_equal(len(train), train_size) + assert_equal(len(test), test_size) assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) From 1903e85ab989a0c8dcc387e544bd032d1ea5e515 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 6 Sep 2016 13:49:58 -0400 Subject: [PATCH 08/13] backport StratifiedShuffleSplit fix to cross_validation.py --- sklearn/cross_validation.py | 101 ++++++++++++++++++------- sklearn/tests/test_cross_validation.py | 20 +++-- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 508b0460ec154..f63124ab3aba7 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -27,6 +27,7 @@ from .utils.validation import (_is_arraylike, _num_samples, column_or_1d) from .utils.multiclass import type_of_target +from .utils.random import choice from .externals.joblib import Parallel, delayed, logger from .externals.six import with_metaclass from .externals.six.moves import zip @@ -414,9 +415,9 @@ def __init__(self, labels, n_folds=3): if n_folds > n_labels: raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of labels: {1}.").format(n_folds, - n_labels)) + ("Cannot have number of folds n_folds={0} greater" + " than the number of labels: {1}.").format(n_folds, + n_labels)) # Weight labels by their number of occurrences n_samples_per_label = np.bincount(labels) @@ -906,6 +907,59 @@ def _validate_shuffle_split(n, test_size, train_size): return int(n_train), int(n_test) +def _approximate_mode(class_counts, n_draws, rng): + """Computes approximate mode of multivariate hypergemetric. + + This is an approximation to the mode of the multivariate + hypergeometric given by class_counts and n_draws. + It shouldn't be off by more than one. + + It is the mostly likely outcome of drawing n_draws many + samples from the population given by class_counts. + + Parameters + ---------- + class_counts : ndarray of int + Population per class. + n_draws : int + Number of draws (samples to draw) from the overall population. + rng : random state + Used to break ties. + + Returns + ------- + sampled_classes : ndarray of int + Number of samples drawn from each class. + np.sum(sampled_classes) == n_draws + """ + # this computes a bad approximation to the mode of the + # multivariate hypergeometric given by class_counts and n_draws + continuous = n_draws * class_counts / class_counts.sum() + # floored means we don't overshoot n_samples, but probably undershoot + floored = np.floor(continuous) + # we add samples according to how much "left over" probability + # they had, until we arrive at n_samples + need_to_add = int(n_draws - floored.sum()) + if need_to_add > 0: + remainder = continuous - floored + values = np.sort(np.unique(remainder))[::-1] + # add according to remainder, but break ties + # randomly to avoid biases + for value in values: + inds, = np.where(remainder == value) + # if we need_to_add less than what's in inds + # we draw randomly from them. + # if we need to add more, we add them all and + # go to the next value + add_now = min(len(inds), need_to_add) + inds = choice(inds, size=add_now, replace=False, random_state=rng) + floored[inds] += 1 + need_to_add -= add_now + if need_to_add == 0: + break + return floored.astype(np.int) + + class StratifiedShuffleSplit(BaseShuffleSplit): """Stratified ShuffleSplit cross validation iterator @@ -991,39 +1045,28 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, def _iter_indices(self): rng = check_random_state(self.random_state) cls_count = bincount(self.y_indices) - p_i = cls_count / float(self.n) - n_i = np.round(self.n_train * p_i).astype(int) - t_i = np.minimum(cls_count - n_i, - np.round(self.n_test * p_i).astype(int)) + #p_i = cls_count / float(self.n) + #n_i = np.round(self.n_train * p_i).astype(int) + #t_i = np.minimum(cls_count - n_i, + # np.round(self.n_test * p_i).astype(int)) for n in range(self.n_iter): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(cls_count, self.n_train, rng) + class_counts_remaining = cls_count - n_i + t_i = _approximate_mode(class_counts_remaining, self.n_test, rng) + train = [] test = [] - for i, cls in enumerate(self.classes): + for i, _ in enumerate(self.classes): permutation = rng.permutation(cls_count[i]) - cls_i = np.where((self.y == cls))[0][permutation] - - train.extend(cls_i[:n_i[i]]) - test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]]) - - # Because of rounding issues (as n_train and n_test are not - # dividers of the number of elements per class), we may end - # up here with less samples in train and test than asked for. - if len(train) + len(test) < self.n_train + self.n_test: - # We complete by affecting randomly the missing indexes - missing_idx = np.where(bincount(train + test, - minlength=len(self.y)) == 0, - )[0] - missing_idx = rng.permutation(missing_idx) - n_missing_train = self.n_train - len(train) - n_missing_test = self.n_test - len(test) - - if n_missing_train > 0: - train.extend(missing_idx[:n_missing_train]) - if n_missing_test > 0: - test.extend(missing_idx[-n_missing_test:]) + perm_indices_class_i = np.where( + (i == self.y_indices))[0][permutation] + train.extend(perm_indices_class_i[:n_i[i]]) + test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) train = rng.permutation(train) test = rng.permutation(test) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 0e03cad783e53..3a0c0b1ed1424 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -479,7 +479,7 @@ def test_stratified_shuffle_split_init(): def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), np.array([-1] * 800 + [1] * 50) ] @@ -487,16 +487,22 @@ def test_stratified_shuffle_split_iter(): for y in ys: sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33, random_state=0) + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size for train, test in sss: assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions - p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1]) - / float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) - / float(len(y[test]))) + p_train = (np.bincount(np.unique(y[train], + return_inverse=True)[1]) / + float(len(y[train]))) + p_test = (np.bincount(np.unique(y[test], + return_inverse=True)[1]) / + float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) - assert_equal(y[train].size + y[test].size, y.size) - assert_array_equal(np.intersect1d(train, test), []) + assert_equal(len(train) + len(test), y.size) + assert_equal(len(train), train_size) + assert_equal(len(test), test_size) + assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) def test_stratified_shuffle_split_even(): From ae2015842b38c00cbf8db19d6e0d1070b2e39913 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 6 Sep 2016 13:54:14 -0400 Subject: [PATCH 09/13] added whatsnew. --- doc/whats_new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7ac7c5fcc8241..a450c175ae8fc 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -397,6 +397,11 @@ Bug fixes - Fix :class:`linear_model.ElasticNet` sparse decision function to match output with dense in the multioutput case. + - Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to + return splits of size ``train_size`` and ``test_size`` in all cases + (`#6472 `). + By `Andreas Müller`_. + API changes summary ------------------- From e2883e3e722c901b5fb5340518d76f4377dbeba6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 6 Sep 2016 14:48:05 -0400 Subject: [PATCH 10/13] fix pep8 --- sklearn/cross_validation.py | 4 ---- sklearn/model_selection/tests/test_split.py | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index f63124ab3aba7..c4c95c49dce39 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1045,10 +1045,6 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, def _iter_indices(self): rng = check_random_state(self.random_state) cls_count = bincount(self.y_indices) - #p_i = cls_count / float(self.n) - #n_i = np.round(self.n_train * p_i).astype(int) - #t_i = np.minimum(cls_count - n_i, - # np.round(self.n_test * p_i).astype(int)) for n in range(self.n_iter): # if there are ties in the class-counts, we want diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 273eeb0e327ff..f127ab15464d0 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -557,7 +557,8 @@ def test_stratified_shuffle_split_iter(): for y in ys: sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split(np.ones(len(y)), y) - # this is how test-size is computed internally in _validate_shuffle_split + # this is how test-size is computed internally + # in _validate_shuffle_split test_size = np.ceil(0.33 * len(y)) train_size = len(y) - test_size for train, test in sss: @@ -609,9 +610,8 @@ def assert_counts_are_ok(idx_counts, p): counter[id] += 1 assert_equal(n_splits_actual, n_splits) - n_train, n_test = _validate_shuffle_split(n_samples, - test_size=1. / n_folds, - train_size=1. - (1. / n_folds)) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds)) assert_equal(len(train), n_train) assert_equal(len(test), n_test) From e982ace64e1a3e194d134484096be972f5a29435 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 6 Sep 2016 17:18:28 -0400 Subject: [PATCH 11/13] fix typo in docstring --- sklearn/cross_validation.py | 2 +- sklearn/model_selection/_split.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index c4c95c49dce39..010f7106a4870 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -908,7 +908,7 @@ def _validate_shuffle_split(n, test_size, train_size): def _approximate_mode(class_counts, n_draws, rng): - """Computes approximate mode of multivariate hypergemetric. + """Computes approximate mode of multivariate hypergeometric. This is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 93a5f483d9ada..337e0fe5d5692 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1100,7 +1100,7 @@ def _iter_indices(self, X, y, labels): def _approximate_mode(class_counts, n_draws, rng): - """Computes approximate mode of multivariate hypergemetric. + """Computes approximate mode of multivariate hypergeometric. This is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. From c865c83efa4bee283026b9a7fe5f18799ecac61a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 8 Sep 2016 10:54:41 -0400 Subject: [PATCH 12/13] added another example to test, indentation --- sklearn/model_selection/_split.py | 2 +- sklearn/model_selection/tests/test_split.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 337e0fe5d5692..27e9d6edf529f 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1148,7 +1148,7 @@ def _approximate_mode(class_counts, n_draws, rng): floored[inds] += 1 need_to_add -= add_now if need_to_add == 0: - break + break return floored.astype(np.int) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index f127ab15464d0..d4130182b0e10 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -551,7 +551,8 @@ def test_stratified_shuffle_split_iter(): np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), - np.array([-1] * 800 + [1] * 50) + np.array([-1] * 800 + [1] * 50), + np.concatenate([[i] * (100 + i) for i in range(11)]) ] for y in ys: From 4bf494cc12509d08cb98b6aa0caae6f2b969c5a7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 8 Sep 2016 16:33:28 -0400 Subject: [PATCH 13/13] add doctests to _approximate_mode --- sklearn/model_selection/_split.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 27e9d6edf529f..a476265285437 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1123,6 +1123,20 @@ def _approximate_mode(class_counts, n_draws, rng): sampled_classes : ndarray of int Number of samples drawn from each class. np.sum(sampled_classes) == n_draws + + Examples + -------- + >>> from sklearn.model_selection._split import _approximate_mode + >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) + array([2, 1]) + >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) + array([3, 1]) + >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), + ... n_draws=2, rng=0) + array([0, 1, 1, 0]) + >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), + ... n_draws=2, rng=42) + array([1, 1, 0, 0]) """ # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws