FIX use of labels is not needed in stratified k-fold

raghavrv · raghavrv · commit d1d30b9c1b59 · 2015-08-15T04:52:59.000+05:30
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -63,7 +63,7 @@ def split(self, X, y=None, labels=None):
         y : array-like, shape (n_samples,)
             The target variable for supervised learning problems.
 
-        labels : array-like of int with shape (n_samples,), optional
+        labels : array-like, with shape (n_samples,), optional
             Arbitrary domain-specific stratification of the data to be used
             to draw the splits.
         """
@@ -261,21 +261,19 @@ def split(self, X, y=None, labels=None):
             Training data, where n_samples is the number of samples
             and n_features is the number of features.
 
-        y : array-like, shape (n_samples,)
+        y : array-like, shape (n_samples,), optional
             The target variable for supervised learning problems.
 
-        labels : array-like of int with shape (n_samples,), optional
-            Arbitrary domain-specific stratification of the data to be used
-            to draw the splits.
+        labels : (Ignored, exists for compatibility.)
         """
-        X, y, labels = indexable(X, y, labels)
+        X, y = indexable(X, y)
         n = _num_samples(X)
         if self.n_folds > n:
             raise ValueError(
                 ("Cannot have number of folds n_folds={0} greater"
                  " than the number of samples: {1}.").format(self.n_folds, n))
 
-        for train, test in super(_BaseKFold, self).split(X, y, labels):
+        for train, test in super(_BaseKFold, self).split(X, y):
             yield train, test
 
     def n_splits(self, X=None, y=None, labels=None):
@@ -424,9 +422,9 @@ def _make_test_folds(self, X, y=None, labels=None):
             rng = self.random_state
         y = np.asarray(y)
         n_samples = y.shape[0]
-        unique_labels, y_inversed = np.unique(y, return_inverse=True)
-        label_counts = bincount(y_inversed)
-        min_labels = np.min(label_counts)
+        unique_y, y_inversed = np.unique(y, return_inverse=True)
+        y_counts = bincount(y_inversed)
+        min_labels = np.min(y_counts)
         if self.n_folds > min_labels:
             warnings.warn(("The least populated class in y has only %d"
                            " members, which is too few. The minimum"
@@ -435,33 +433,33 @@ def _make_test_folds(self, X, y=None, labels=None):
                            % (min_labels, self.n_folds)), Warning)
 
         # pre-assign each sample to a test fold index using individual KFold
-        # splitting strategies for each label so as to respect the balance of
-        # labels
-        # NOTE: Passing the data corresponding to ith label say X[y==label_i]
-        # will break when the data is not 100% stratifiable for all labels.
+        # splitting strategies for each class so as to respect the balance of
+        # classes
+        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
+        # will break when the data is not 100% stratifiable for all classes.
         # So we pass np.zeroes(max(c, n_folds)) as data to the KFold
-        per_label_cvs = [
+        per_cls_cvs = [
             KFold(self.n_folds, shuffle=self.shuffle,
                   random_state=rng).split(np.zeros(max(c, self.n_folds)))
-            for c in label_counts]
+            for c in y_counts]
 
         test_folds = np.zeros(n_samples, dtype=np.int)
-        for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
-            for label, (_, test_split) in zip(unique_labels, per_label_splits):
-                label_test_folds = test_folds[y == label]
+        for test_fold_idx, per_cls_splits in enumerate(zip(*per_cls_cvs)):
+            for cls, (_, test_split) in zip(unique_y, per_cls_splits):
+                cls_test_folds = test_folds[y == cls]
                 # the test split can be too big because we used
                 # KFold(...).split(X[:max(c, n_folds)]) when data is not 100%
-                # stratifiable for all the labels
+                # stratifiable for all the classes
                 # (we use a warning instead of raising an exception)
                 # If this is the case, let's trim it:
-                test_split = test_split[test_split < len(label_test_folds)]
-                label_test_folds[test_split] = test_fold_idx
-                test_folds[y == label] = label_test_folds
+                test_split = test_split[test_split < len(cls_test_folds)]
+                cls_test_folds[test_split] = test_fold_idx
+                test_folds[y == cls] = cls_test_folds
 
         return test_folds
 
     def _iter_test_masks(self, X, y=None, labels=None):
-        test_folds = self._make_test_folds(X, y, labels)
+        test_folds = self._make_test_folds(X, y)
         for i in range(self.n_folds):
             yield test_folds == i
 
@@ -520,7 +518,7 @@ def n_splits(self, X, y, labels):
         X : (Ignored, exists for compatibility.)
         y : (Ignored, exists for compatibility.)
 
-        labels : array-like of int with shape (n_samples,)
+        labels : array-like, with shape (n_samples,)
             Arbitrary domain-specific stratification of the data to be used
             to draw the splits.
         """
@@ -598,7 +596,7 @@ def n_splits(self, X, y, labels):
         X : (Ignored, exists for compatibility.)
         y : (Ignored, exists for compatibility.)
 
-        labels : array-like of int with shape (n_samples,)
+        labels : array-like, with shape (n_samples,)
             Arbitrary domain-specific stratification of the data to be used
             to draw the splits.
         """
@@ -628,7 +626,7 @@ def split(self, X, y=None, labels=None):
         y : array-like, shape (n_samples,)
             The target variable for supervised learning problems.
 
-        labels : array-like of int with shape (n_samples,), optional
+        labels : array-like, with shape (n_samples,), optional
             Arbitrary domain-specific stratification of the data to be used
             to draw the splits.
         """