Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 147 additions & 2 deletions sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
'LeavePOut',
'ShuffleSplit',
'StratifiedKFold',
'BinnedStratifiedKFold',
'StratifiedShuffleSplit',
'PredefinedSplit',
'LabelShuffleSplit',
Expand Down Expand Up @@ -230,8 +231,8 @@ def __repr__(self):
)

def __len__(self):
return int(factorial(self.n) / factorial(self.n - self.p)
/ factorial(self.p))
return int(factorial(self.n) / factorial(self.n - self.p) /
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kindly revert this change too...

factorial(self.p))


class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
Expand Down Expand Up @@ -577,6 +578,150 @@ def __len__(self):
return self.n_folds


class BinnedStratifiedKFold(_BaseKFold):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be removed from cross_validation.py, as we are deprecating the whole import path. You can safely implement it inside model_selection alone.

"""Binned Stratified K-Folds cross validation iterator for continuous data

Provides train/test indices to split data in train test sets
based on continuous input `y` of length `len_y`.
The input is binned into `ceil(len_y / n_folds)` classes
with equal number of members, except the middle class,
which receives the remainder of labels (of length `len_y % n_folds`).

This cross-validation object is a variation of KFold that
returns binned stratified folds. The folds are made by preserving
the percentage of samples for each class.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
y : array-like, [n_samples]
Samples to split in K folds.

n_folds : int, default=3
Number of folds. Must be at least 2.

shuffle : boolean, optional
Whether to shuffle each stratification of the data before splitting
into batches.

random_state : None, int or RandomState
When shuffle=True, pseudo-random number generator state used for
shuffling. If None, use default numpy RNG for shuffling.

Examples
--------
>>> from sklearn.cross_validation import BinnedStratifiedKFold
>>> y = np.arange(11.0)
>>> np.random.seed(0)
>>> np.random.shuffle(y)
>>> X = y + 0.1* np.random.randn(len(y))
>>> skf = BinnedStratifiedKFold(y, n_folds=3)
>>> len(skf)
3
>>> print(skf) # doctest: +NORMALIZE_WHITESPACE
sklearn.cross_validation.BinnedStratifiedKFold(n=11, n_folds=3,
shuffle=False, random_state=None)
>>> indarr = np.zeros( len(y), dtype = bool)
>>> for train_index, test_index in skf:
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [ 1 2 3 4 5 8 10] TEST: [0 6 7 9]
TRAIN: [0 2 3 4 6 7 8 9] TEST: [ 1 5 10]
TRAIN: [ 0 1 5 6 7 9 10] TEST: [2 3 4 8]

Notes
-----
All the folds have size floor(n_samples / n_folds) or
floor(n_samples / n_folds) +1,
the length is assigned randomly (even if no shuffling is requested)
to balance the variance between folds.

See also
--------
StratifiedKFold -- stratified k-fold generator for classification data
"""

def __init__(self, y, n_folds=3, shuffle=False,
             random_state=None):
    """Precompute the test indices of every fold from the ranking of `y`.

    The samples are ranked by `y` and laid out in rows of `n_folds`
    consecutive ranks; each fold takes one sample from every row, so
    every fold spans the whole range of `y`.

    Parameters
    ----------
    y : array-like, [n_samples]
        Continuous values used to rank the samples.

    n_folds : int, default=3
        Number of folds.

    shuffle : boolean, optional
        Whether to randomly permute the order of the folds.

    random_state : None, int or RandomState
        Seed or generator passed to ``check_random_state``. It is used
        for the middle rows when ``len(y) >= 2 * n_folds`` (even with
        ``shuffle=False``) and for the fold order when ``shuffle=True``.

    Raises
    ------
    ValueError
        If there are fewer samples than folds.
    """
    self.random_state = random_state
    super(BinnedStratifiedKFold, self).__init__(
        len(y),
        n_folds=n_folds, shuffle=shuffle, random_state=random_state
    )
    len_y = len(y)
    if len_y < n_folds:
        raise ValueError(
            "Cannot have number of folds n_folds=%d greater than the "
            "number of samples: %d." % (n_folds, len_y))

    # Sample indices ordered by increasing value of `y`.
    yinds = np.argsort(y)
    # Bound before branching: the final fold shuffle needs the generator
    # whichever branch built the fold table.
    rng = check_random_state(self.random_state)

    self.n_classes = len_y // n_folds + int(len_y % n_folds != 0)

    if len_y // n_folds > 1:
        n_items_boundary_cls = n_folds * (len_y // n_folds // 2)
        # Lowest and highest ranks fill complete rows of `n_folds` samples.
        lowerclasses = yinds[:n_items_boundary_cls].reshape(-1, n_folds)
        upperclasses = yinds[-n_items_boundary_cls:].reshape(-1, n_folds)
        # The remaining central ranks are padded with -1 up to a whole
        # number of rows (a complete padding row is added when they
        # already fill whole rows; it is stripped again below).
        middleclasses = yinds[n_items_boundary_cls:-n_items_boundary_cls]
        middleclasses = np.hstack([
            middleclasses,
            -np.ones(n_folds - len(middleclasses) % n_folds, dtype=int)
        ])
        middleclasses = middleclasses.reshape(-1, n_folds)
        # Permute the columns of the middle rows in place so that the
        # folds receiving the padding are chosen at random.
        rng.shuffle(middleclasses.T)
        # TODO: middle class rebalancing
        fold_table = np.vstack([
            lowerclasses,
            middleclasses,
            upperclasses]).T
    else:
        # Fewer than two complete rows: one full row of the lowest ranks
        # plus one -1 padded row holding the remaining samples.
        lowerclasses = yinds[:n_folds].reshape(-1, n_folds)
        upperclasses = yinds[n_folds:]
        upperclasses = np.hstack([
            upperclasses,
            -np.ones(n_folds - len(upperclasses) % n_folds, dtype=int)
        ])
        fold_table = np.vstack([lowerclasses, upperclasses]).T

    if shuffle:
        # Rows of `fold_table` are folds: this permutes the fold order.
        rng.shuffle(fold_table)

    # Strip the -1 padding; every entry holds the test indices of a fold.
    self._test_masks = [fold[fold != -1] for fold in fold_table]

def _iter_test_masks(self):
indarr = np.zeros(self.n, dtype=bool)
for mask in self._test_masks:
indarr[:] = False
indarr[mask] = True
yield indarr

def __repr__(self):
return '%s.%s(n=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
self.__class__.__module__,
self.__class__.__name__,
self.n,
self.n_folds,
self.shuffle,
self.random_state,
)

def __len__(self):
return self.n_folds


class LeaveOneLabelOut(_PartitionIterator):
"""Leave-One-Label_Out cross-validation iterator

Expand Down
2 changes: 2 additions & 0 deletions sklearn/model_selection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ._split import KFold
from ._split import LabelKFold
from ._split import StratifiedKFold
from ._split import BinnedStratifiedKFold
from ._split import LeaveOneLabelOut
from ._split import LeaveOneOut
from ._split import LeavePLabelOut
Expand Down Expand Up @@ -40,6 +41,7 @@
'RandomizedSearchCV',
'ShuffleSplit',
'StratifiedKFold',
'BinnedStratifiedKFold',
'StratifiedShuffleSplit',
'check_cv',
'cross_val_predict',
Expand Down
163 changes: 161 additions & 2 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
'ShuffleSplit',
'LabelShuffleSplit',
'StratifiedKFold',
'BinnedStratifiedKFold',
'StratifiedShuffleSplit',
'PredefinedSplit',
'train_test_split',
Expand Down Expand Up @@ -635,6 +636,164 @@ def split(self, X, y, labels=None):
"""
return super(StratifiedKFold, self).split(X, y, labels)


class BinnedStratifiedKFold(_BaseKFold):
    """Binned stratified K-Folds cross-validator for continuous targets

    Provides train/test indices to split data in train/test sets based on
    a continuous target `y`.

    The samples are ranked by `y` and laid out in rows ("bins") of
    `n_folds` consecutive ranks. Every fold receives one sample from each
    bin, so each fold spans the whole range of `y`. The samples that do
    not fill a complete bin are taken from the middle of the ranking when
    ``n_samples >= 2 * n_folds`` and from the top of the ranking otherwise.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_folds : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to randomly permute the order of the folds.

    random_state : None, int or RandomState
        Pseudo-random number generator state. It is used to distribute the
        middle bins over the folds when ``n_samples >= 2 * n_folds`` (even
        with shuffle=False) and to permute the folds when shuffle=True.
        If None, use default numpy RNG.

    Examples
    --------
    >>> from sklearn.model_selection import BinnedStratifiedKFold
    >>> y = np.arange(11.0)
    >>> np.random.seed(0)
    >>> np.random.shuffle(y)
    >>> X = y + 0.1 * np.random.randn(len(y))
    >>> cv = BinnedStratifiedKFold(n_folds=3)
    >>> skf = cv.split(y)
    >>> print(cv)  # doctest: +NORMALIZE_WHITESPACE
    BinnedStratifiedKFold(n_folds=3, random_state=None,
                          shuffle=False)
    >>> indarr = np.zeros(len(y), dtype=bool)
    >>> for train_index, test_index in skf:
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [ 1  2  3  4  5  8 10] TEST: [0 6 7 9]
    TRAIN: [0 2 3 4 6 7 8 9] TEST: [ 1  5 10]
    TRAIN: [ 0  1  5  6  7  9 10] TEST: [2 3 4 8]

    Notes
    -----
    All the folds have size floor(n_samples / n_folds) or
    floor(n_samples / n_folds) + 1.
    When ``n_samples >= 2 * n_folds`` the folds that receive the extra
    sample are chosen randomly (even if no shuffling is requested).

    See also
    --------
    StratifiedKFold -- stratified k-fold generator for classification data
    """

    def __init__(self, n_folds=3, shuffle=False, random_state=None):
        super(BinnedStratifiedKFold, self).__init__(n_folds, shuffle,
                                                    random_state)

    def _make_test_folds(self, X, y=None, labels=None):
        """Return, for every sample, the index of the fold that tests it.

        Parameters
        ----------
        X : array-like
            Only used as the target when `y` is None; it must then expose
            a `shape` that is one-dimensional or has trailing axes of
            length 1.

        y : array-like, shape (n_samples,), optional
            Continuous target used to rank the samples.

        labels : ignored

        Returns
        -------
        test_folds : ndarray of int, shape (n_samples,)

        Raises
        ------
        ValueError
            If no usable target is available, if the target is not
            column-like, or if there are fewer samples than folds.
        """
        if y is None:
            # Fall back to X only when it is effectively one-dimensional.
            shape = getattr(X, "shape", None)
            if not shape or any(dim != 1 for dim in shape[1:]):
                raise ValueError("no y has been supplied; "
                                 "first argument is not a valid y")
            y = X
        y = np.asarray(y)
        if y.ndim != 1:
            if y.ndim == 0 or any(dim != 1 for dim in y.shape[1:]):
                raise ValueError("y should be a 1d array, got an array of "
                                 "shape %s instead." % (y.shape,))
            # Flatten column vectors so that argsort ranks the samples
            # instead of sorting within each length-1 row.
            y = y.reshape(-1)

        n_samples = y.shape[0]
        n_folds = self.n_folds
        if n_samples < n_folds:
            raise ValueError(
                "Cannot have number of folds n_folds=%d greater than the "
                "number of samples: %d." % (n_folds, n_samples))

        # Sample indices ordered by increasing target value.
        order = np.argsort(y)
        # Bound before branching: the final fold shuffle needs the
        # generator whichever branch built the fold table.
        rng = check_random_state(self.random_state)

        n_classes = n_samples // n_folds + int(n_samples % n_folds != 0)

        if n_samples // n_folds > 1:
            n_boundary = n_folds * (n_samples // n_folds // 2)
            # Lowest and highest ranks fill complete rows of `n_folds`.
            lower = order[:n_boundary].reshape(-1, n_folds)
            upper = order[-n_boundary:].reshape(-1, n_folds)
            # The remaining central ranks are padded with -1 up to a whole
            # number of rows (a complete padding row is added when they
            # already fill whole rows; it is stripped again below).
            middle = order[n_boundary:-n_boundary]
            middle = np.hstack([
                middle,
                -np.ones(n_folds - len(middle) % n_folds, dtype=int)
            ])
            middle = middle.reshape(-1, n_folds)
            # Permute the columns of the middle rows in place so that the
            # folds receiving the padding are chosen at random.
            rng.shuffle(middle.T)
            # TODO: middle class rebalancing
            fold_table = np.vstack([lower, middle, upper]).T
        else:
            # Fewer than two complete rows: one full row of the lowest
            # ranks plus one -1 padded row holding the remaining samples.
            lower = order[:n_folds].reshape(-1, n_folds)
            upper = order[n_folds:]
            upper = np.hstack([
                upper,
                -np.ones(n_folds - len(upper) % n_folds, dtype=int)
            ])
            fold_table = np.vstack([lower, upper]).T

        if self.shuffle:
            # Rows of `fold_table` are folds: this permutes the fold order.
            rng.shuffle(fold_table)

        # Strip the -1 padding; every entry holds the test indices of a fold.
        fold_members = [fold[fold != -1] for fold in fold_table]

        # `np.int` was removed from NumPy; the builtin is equivalent.
        test_folds = np.empty(n_samples, dtype=int)
        for fold_id, members in enumerate(fold_members):
            test_folds[members] = fold_id

        # Attributes set by earlier versions of this method, kept for
        # backward compatibility.
        self.n_samples = n_samples
        self.n_classes = n_classes
        self._test_masks = fold_members
        return test_folds

    def _iter_test_masks(self, X, y=None, labels=None):
        """Yield one boolean test mask per fold."""
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_folds):
            yield test_folds == i

    def split(self, X, y=None, labels=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features. When `y` is None
            and `X` is one-dimensional, `X` itself is used as the target.

        y : array-like, shape (n_samples,)
            The continuous target variable used to build the folds.

        labels : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        return super(BinnedStratifiedKFold, self).split(X, y, labels)


class LeaveOneLabelOut(BaseCrossValidator):
"""Leave One Label Out cross-validator

Expand Down Expand Up @@ -1193,8 +1352,8 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
Validation helper to check if the test/test sizes are meaningful wrt to the
size of the data (n_samples)
"""
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'
and test_size >= n_samples):
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
test_size >= n_samples):
raise ValueError('test_size=%d should be smaller than the number of '
'samples %d' % (test_size, n_samples))

Expand Down
Loading