diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index da841fd8dca27..74513c669126d 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -60,17 +60,19 @@ def __init__(self): # see #6304 pass - def split(self, X, y=None, labels=None): + def split(self, X=None, y=None, labels=None): """Generate indices to split data into training and test set. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features), optional Training data, where n_samples is the number of samples and n_features is the number of features. + Can be None if y is specified. - y : array-like, of length n_samples + y : array-like, of length n_samples, optional The target variable for supervised learning problems. + Must be specified if X is None. labels : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into @@ -84,6 +86,7 @@ def split(self, X, y=None, labels=None): test : ndarray The testing set indices for that split. """ + X = _default_split_X(X, y) X, y, labels = indexable(X, y, labels) indices = np.arange(_num_samples(X)) for test_index in self._iter_test_masks(X, y, labels): @@ -285,17 +288,19 @@ def __init__(self, n_folds, shuffle, random_state): self.shuffle = shuffle self.random_state = random_state - def split(self, X, y=None, labels=None): + def split(self, X=None, y=None, labels=None): """Generate indices to split data into training and test set. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features), optional Training data, where n_samples is the number of samples and n_features is the number of features. + Can be None if y is specified. - y : array-like, shape (n_samples,) + y : array-like, of length n_samples, optional The target variable for supervised learning problems. + Must be specified if X is None. 
labels : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into @@ -309,6 +314,7 @@ def split(self, X, y=None, labels=None): test : ndarray The testing set indices for that split. """ + X = _default_split_X(X, y) X, y, labels = indexable(X, y, labels) n_samples = _num_samples(X) if self.n_folds > n_samples: @@ -608,17 +614,19 @@ def _iter_test_masks(self, X, y=None, labels=None): for i in range(self.n_folds): yield test_folds == i - def split(self, X, y, labels=None): + def split(self, X=None, y=None, labels=None): """Generate indices to split data into training and test set. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features), optional Training data, where n_samples is the number of samples and n_features is the number of features. + Can be None if y is specified. - y : array-like, shape (n_samples,) + y : array-like, of length n_samples, optional The target variable for supervised learning problems. + Must be specified if X is None. labels : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into @@ -634,6 +642,7 @@ def split(self, X, y, labels=None): """ return super(StratifiedKFold, self).split(X, y, labels) + class LeaveOneLabelOut(BaseCrossValidator): """Leave One Label Out cross-validator @@ -811,17 +820,19 @@ def __init__(self, n_iter=10, test_size=0.1, train_size=None, self.train_size = train_size self.random_state = random_state - def split(self, X, y=None, labels=None): + def split(self, X=None, y=None, labels=None): """Generate indices to split data into training and test set. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features), optional Training data, where n_samples is the number of samples and n_features is the number of features. + Can be None if y is specified. 
- y : array-like, shape (n_samples,) + y : array-like, of length n_samples, optional The target variable for supervised learning problems. + Must be specified if X is None. labels : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into @@ -835,6 +846,7 @@ def split(self, X, y=None, labels=None): test : ndarray The testing set indices for that split. """ + X = _default_split_X(X, y) X, y, labels = indexable(X, y, labels) for train, test in self._iter_indices(X, y, labels): yield train, test @@ -1125,17 +1137,19 @@ def _iter_indices(self, X, y, labels=None): yield train, test - def split(self, X, y, labels=None): + def split(self, X=None, y=None, labels=None): """Generate indices to split data into training and test set. Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features), optional Training data, where n_samples is the number of samples and n_features is the number of features. + Can be None if y is specified. - y : array-like, shape (n_samples,) + y : array-like, of length n_samples, optional The target variable for supervised learning problems. + Must be specified if X is None. 
labels : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into @@ -1567,6 +1581,14 @@ def _safe_split(estimator, X, y, indices, train_indices=None): return X_subset, y_subset +def _default_split_X(X, y): + if X is None: + if y is None: + raise ValueError("At least X or y should be specified.") + X = np.empty((len(y), 1)) + return X + + def _build_repr(self): # XXX This is copied from BaseEstimator's get_params cls = self.__class__ diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 69749f8e4c0aa..ffec71557f4df 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -242,6 +242,9 @@ def test_kfold_valueerrors(): assert_raises(ValueError, next, skf_3.split(X2, y)) + # Make sure at least X or y is specified + assert_raises(ValueError, next, skf_3.split(X=None, y=None)) + # Error when number of folds is <= 1 assert_raises(ValueError, KFold, 0) assert_raises(ValueError, KFold, 1) @@ -325,6 +328,16 @@ def test_stratified_kfold_no_shuffle(): assert_array_equal(test, [2, 5, 6]) assert_array_equal(train, [0, 1, 3, 4]) + # Ensure that X is not required + splits = StratifiedKFold(2).split(y=y) + train, test = next(splits) + assert_array_equal(test, [0, 1, 3, 4]) + assert_array_equal(train, [2, 5, 6]) + + train, test = next(splits) + assert_array_equal(test, [2, 5, 6]) + assert_array_equal(train, [0, 1, 3, 4]) + # Check if get_n_splits returns the number of folds assert_equal(5, StratifiedKFold(5).get_n_splits(X, y)) @@ -532,6 +545,10 @@ def test_stratified_shuffle_split_init(): assert_raises(ValueError, next, StratifiedShuffleSplit(test_size=2).split(X, y)) + # Test that at least X or y is specified + assert_raises(ValueError, next, + StratifiedShuffleSplit(test_size=2).split(X=None, y=None)) + def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), @@ -542,20 +559,23 @@ def 
test_stratified_shuffle_split_iter(): ] for y in ys: - sss = StratifiedShuffleSplit(6, test_size=0.33, - random_state=0).split(np.ones(len(y)), y) - for train, test in sss: - assert_array_equal(np.unique(y[train]), np.unique(y[test])) - # Checks if folds keep classes proportions - p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / - float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / - float(len(y[test]))) - assert_array_almost_equal(p_train, p_test, 1) - assert_equal(y[train].size + y[test].size, y.size) - assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) + # Split should work if X is specified or not specified + X_ = np.ones(len(y)) + for X in [X_, None]: + sss = StratifiedShuffleSplit(6, test_size=0.33, + random_state=0).split(X, y) + for train, test in sss: + assert_array_equal(np.unique(y[train]), np.unique(y[test])) + # Checks if folds keep classes proportions + p_train = (np.bincount(np.unique(y[train], + return_inverse=True)[1]) / + float(len(y[train]))) + p_test = (np.bincount(np.unique(y[test], + return_inverse=True)[1]) / + float(len(y[test]))) + assert_array_almost_equal(p_train, p_test, 1) + assert_equal(y[train].size + y[test].size, y.size) + assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) def test_stratified_shuffle_split_even(): @@ -806,6 +826,9 @@ def test_shufflesplit_errors(): assert_raises(ValueError, next, ShuffleSplit(test_size=8, train_size=3).split(X)) + # Test that at least X or y is specified + assert_raises(ValueError, next, ShuffleSplit(test_size=5).split(X=None, y=None)) + def test_shufflesplit_reproducible(): # Check that iterating twice on the ShuffleSplit gives the same