Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Changed signature of CV split to accept X=None #7128

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,19 @@ def __init__(self):
# see #6304
pass

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, of length n_samples
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -84,6 +86,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
indices = np.arange(_num_samples(X))
for test_index in self._iter_test_masks(X, y, labels):
Expand Down Expand Up @@ -285,17 +288,19 @@ def __init__(self, n_folds, shuffle, random_state):
self.shuffle = shuffle
self.random_state = random_state

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -309,6 +314,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
n_samples = _num_samples(X)
if self.n_folds > n_samples:
Expand Down Expand Up @@ -608,17 +614,19 @@ def _iter_test_masks(self, X, y=None, labels=None):
for i in range(self.n_folds):
yield test_folds == i

def split(self, X, y, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -634,6 +642,7 @@ def split(self, X, y, labels=None):
"""
return super(StratifiedKFold, self).split(X, y, labels)


class LeaveOneLabelOut(BaseCrossValidator):
"""Leave One Label Out cross-validator

Expand Down Expand Up @@ -811,17 +820,19 @@ def __init__(self, n_iter=10, test_size=0.1, train_size=None,
self.train_size = train_size
self.random_state = random_state

def split(self, X, y=None, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -835,6 +846,7 @@ def split(self, X, y=None, labels=None):
test : ndarray
The testing set indices for that split.
"""
X = _default_split_X(X, y)
X, y, labels = indexable(X, y, labels)
for train, test in self._iter_indices(X, y, labels):
yield train, test
Expand Down Expand Up @@ -1125,17 +1137,19 @@ def _iter_indices(self, X, y, labels=None):

yield train, test

def split(self, X, y, labels=None):
def split(self, X=None, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
X : array-like, shape (n_samples, n_features), optional
Training data, where n_samples is the number of samples
and n_features is the number of features.
Can be None if y is specified.

y : array-like, shape (n_samples,)
y : array-like, of length n_samples, optional
The target variable for supervised learning problems.
Must be specified if X is None.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand Down Expand Up @@ -1567,6 +1581,14 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
return X_subset, y_subset


def _default_split_X(X, y):
if X is None:
if y is None:
raise ValueError("At least X or y should be specified.")
X = np.empty((len(y), 1))
return X


def _build_repr(self):
# XXX This is copied from BaseEstimator's get_params
cls = self.__class__
Expand Down
51 changes: 37 additions & 14 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,9 @@ def test_kfold_valueerrors():

assert_raises(ValueError, next, skf_3.split(X2, y))

# Make sure at least X or y is specified
assert_raises(ValueError, next, skf_3.split(X=None, y=None))

# Error when number of folds is <= 1
assert_raises(ValueError, KFold, 0)
assert_raises(ValueError, KFold, 1)
Expand Down Expand Up @@ -325,6 +328,16 @@ def test_stratified_kfold_no_shuffle():
assert_array_equal(test, [2, 5, 6])
assert_array_equal(train, [0, 1, 3, 4])

    # Ensure that X is not required
splits = StratifiedKFold(2).split(y=y)
train, test = next(splits)
assert_array_equal(test, [0, 1, 3, 4])
assert_array_equal(train, [2, 5, 6])

train, test = next(splits)
assert_array_equal(test, [2, 5, 6])
assert_array_equal(train, [0, 1, 3, 4])

# Check if get_n_splits returns the number of folds
assert_equal(5, StratifiedKFold(5).get_n_splits(X, y))

Expand Down Expand Up @@ -532,6 +545,10 @@ def test_stratified_shuffle_split_init():
assert_raises(ValueError, next,
StratifiedShuffleSplit(test_size=2).split(X, y))

    # Test that at least one of X or y is specified
assert_raises(ValueError, next,
StratifiedShuffleSplit(test_size=2).split(X=None, y=None))


def test_stratified_shuffle_split_iter():
ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
Expand All @@ -542,20 +559,23 @@ def test_stratified_shuffle_split_iter():
]

for y in ys:
sss = StratifiedShuffleSplit(6, test_size=0.33,
random_state=0).split(np.ones(len(y)), y)
for train, test in sss:
assert_array_equal(np.unique(y[train]), np.unique(y[test]))
# Checks if folds keep classes proportions
p_train = (np.bincount(np.unique(y[train],
return_inverse=True)[1]) /
float(len(y[train])))
p_test = (np.bincount(np.unique(y[test],
return_inverse=True)[1]) /
float(len(y[test])))
assert_array_almost_equal(p_train, p_test, 1)
assert_equal(y[train].size + y[test].size, y.size)
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
# Split should work if X is specified or not specified
X_ = np.ones(len(y))
for X in [X_, None]:
sss = StratifiedShuffleSplit(6, test_size=0.33,
random_state=0).split(X, y)
for train, test in sss:
assert_array_equal(np.unique(y[train]), np.unique(y[test]))
# Checks if folds keep classes proportions
p_train = (np.bincount(np.unique(y[train],
return_inverse=True)[1]) /
float(len(y[train])))
p_test = (np.bincount(np.unique(y[test],
return_inverse=True)[1]) /
float(len(y[test])))
assert_array_almost_equal(p_train, p_test, 1)
assert_equal(y[train].size + y[test].size, y.size)
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])


def test_stratified_shuffle_split_even():
Expand Down Expand Up @@ -806,6 +826,9 @@ def test_shufflesplit_errors():
assert_raises(ValueError, next, ShuffleSplit(test_size=8,
train_size=3).split(X))

    # Test that at least one of X or y is specified
assert_raises(ValueError, next, ShuffleSplit(test_size=5).split(X=None, y=None))


def test_shufflesplit_reproducible():
# Check that iterating twice on the ShuffleSplit gives the same
Expand Down