Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Changed signature of CV split to accept X=None #7128

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,19 @@ def __init__(self):
# see #6304
pass

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, of length n_samples
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -84,6 +86,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
indices = np.arange(_num_samples(X))
for test_index in self._iter_test_masks(X, y, labels):
Expand Down Expand Up @@ -285,17 +288,19 @@ def __init__(self, n_folds, shuffle, random_state):
self.shuffle = shuffle
self.random_state = random_state

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -309,6 +314,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
n_samples = _num_samples(X)
if self.n_folds > n_samples:
Expand Down Expand Up @@ -608,17 +614,19 @@ def _iter_test_masks(self, X, y=None, labels=None):
for i in range(self.n_folds):
yield test_folds == i

def split(self, X, y, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -634,6 +642,7 @@ def split(self, X, y, labels=None):
"""
return super(StratifiedKFold, self).split(X, y, labels)


class LeaveOneLabelOut(BaseCrossValidator):
"""Leave One Label Out cross-validator

Expand Down Expand Up @@ -811,17 +820,19 @@ def __init__(self, n_iter=10, test_size=0.1, train_size=None,
self.train_size = train_size
self.random_state = random_state

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -835,6 +846,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
for train, test in self._iter_indices(X, y, labels):
yield train, test
Expand Down Expand Up @@ -1125,17 +1137,19 @@ def _iter_indices(self, X, y, labels=None):

yield train, test

def split(self, X, y, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand Down Expand Up @@ -1567,6 +1581,14 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
return X_subset, y_subset


def _default_split_X(X, y):
if X is None:
if y is None:
raise ValueError("At least X or y should be specified.")
X = np.empty((len(y), 1))
return X


def _build_repr(self):
# XXX This is copied from BaseEstimator's get_params
cls = self.__class__
Expand Down
51 changes: 37 additions & 14 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,9 @@ def test_kfold_valueerrors():

assert_raises(ValueError, next, skf_3.split(X2, y))

# Make sure at least X or y is specified
assert_raises(ValueError, next, skf_3.split(X=None, y=None))

# Error when number of folds is <= 1
assert_raises(ValueError, KFold, 0)
assert_raises(ValueError, KFold, 1)
Expand Down Expand Up @@ -325,6 +328,16 @@ def test_stratified_kfold_no_shuffle():
assert_array_equal(test, [2, 5, 6])
assert_array_equal(train, [0, 1, 3, 4])

    # Ensure that X is not required
splits = StratifiedKFold(2).split(y=y)
train, test = next(splits)
assert_array_equal(test, [0, 1, 3, 4])
assert_array_equal(train, [2, 5, 6])

train, test = next(splits)
assert_array_equal(test, [2, 5, 6])
assert_array_equal(train, [0, 1, 3, 4])

# Check if get_n_splits returns the number of folds
assert_equal(5, StratifiedKFold(5).get_n_splits(X, y))

Expand Down Expand Up @@ -532,6 +545,10 @@ def test_stratified_shuffle_split_init():
assert_raises(ValueError, next,
StratifiedShuffleSplit(test_size=2).split(X, y))

    # Test that at least one of X or y is specified
assert_raises(ValueError, next,
StratifiedShuffleSplit(test_size=2).split(X=None, y=None))


def test_stratified_shuffle_split_iter():
ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
Expand All @@ -542,20 +559,23 @@ def test_stratified_shuffle_split_iter():
]

for y in ys:
sss = StratifiedShuffleSplit(6, test_size=0.33,
random_state=0).split(np.ones(len(y)), y)
for train, test in sss:
assert_array_equal(np.unique(y[train]), np.unique(y[test]))
# Checks if folds keep classes proportions
p_train = (np.bincount(np.unique(y[train],
return_inverse=True)[1]) /
float(len(y[train])))
p_test = (np.bincount(np.unique(y[test],
return_inverse=True)[1]) /
float(len(y[test])))
assert_array_almost_equal(p_train, p_test, 1)
assert_equal(y[train].size + y[test].size, y.size)
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
# Split should work if X is specified or not specified
X_ = np.ones(len(y))
for X in [X_, None]:
sss = StratifiedShuffleSplit(6, test_size=0.33,
random_state=0).split(X, y)
for train, test in sss:
assert_array_equal(np.unique(y[train]), np.unique(y[test]))
# Checks if folds keep classes proportions
p_train = (np.bincount(np.unique(y[train],
return_inverse=True)[1]) /
float(len(y[train])))
p_test = (np.bincount(np.unique(y[test],
return_inverse=True)[1]) /
float(len(y[test])))
assert_array_almost_equal(p_train, p_test, 1)
assert_equal(y[train].size + y[test].size, y.size)
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])


def test_stratified_shuffle_split_even():
Expand Down Expand Up @@ -806,6 +826,9 @@ def test_shufflesplit_errors():
assert_raises(ValueError, next, ShuffleSplit(test_size=8,
train_size=3).split(X))

    # Test that at least one of X or y is specified
assert_raises(ValueError, next, ShuffleSplit(test_size=5).split(X=None, y=None))


def test_shufflesplit_reproducible():
# Check that iterating twice on the ShuffleSplit gives the same
Expand Down