From 39d410dd2b7246c83462ee8a43128f6dcad55beb Mon Sep 17 00:00:00 2001 From: reiinakano Date: Mon, 21 Aug 2017 20:32:42 +0800 Subject: [PATCH 01/35] fix cross_val_predict for binary classification in decision_function --- sklearn/model_selection/_validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index d3e84b3978ceb..176494463408d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -734,7 +734,10 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, n_classes = len(set(y)) predictions_ = np.zeros((X_test.shape[0], n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: - predictions_[:, estimator.classes_[-1]] = predictions + if n_classes == 2: + predictions_ = predictions + else: + predictions_[:, estimator.classes_[-1]] = predictions else: predictions_[:, estimator.classes_] = predictions predictions = predictions_ From fc04ceee749074529f6ad4e7942f39d0331cd06e Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 10:42:22 +0800 Subject: [PATCH 02/35] Add unit tests --- sklearn/model_selection/tests/test_validation.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 5f650cb644079..affa984ca16e1 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -776,6 +776,18 @@ def split(self, X, y=None, groups=None): assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) + X, y = make_classification(n_classes=2, n_samples=50) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='decision_function') + assert_equal(preds.shape, (10,)) + + X, y = make_classification(n_classes=4, n_samples=50) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='decision_function') + assert_equal(preds.shape, (10, 4)) + def test_cross_val_predict_input_types(): iris = load_iris() From 9ea3baa435c4c9ed13401d1d3b8f33c4a3991658 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 10:43:14 +0800 Subject: [PATCH 03/35] Add unit tests --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index affa984ca16e1..c7598c54d45ae 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -780,13 +780,13 @@ def split(self, X, y=None, groups=None): preds = cross_val_predict(LogisticRegression(), X, y, method='decision_function') - assert_equal(preds.shape, (10,)) + assert_equal(preds.shape, (50,)) X, y = make_classification(n_classes=4, n_samples=50) preds = cross_val_predict(LogisticRegression(), X, y, method='decision_function') - assert_equal(preds.shape, (10, 4)) + assert_equal(preds.shape, (50, 4)) def test_cross_val_predict_input_types(): From 115e766322ba2e484a5c6872742803224da9601c Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 10:49:20 +0800 Subject: [PATCH 04/35] Add unit tests --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index c7598c54d45ae..7e9c8b2497c18 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -782,11 +782,11 @@ def split(self, X, y=None, groups=None): method='decision_function') assert_equal(preds.shape, (50,)) - X, y = make_classification(n_classes=4, n_samples=50) + X, y = load_iris(return_X_y=True) preds = cross_val_predict(LogisticRegression(), X, y, method='decision_function') - assert_equal(preds.shape, (50, 4)) + assert_equal(preds.shape, (150, 3)) def test_cross_val_predict_input_types(): From b012790bcddd9e82eb2d96a7db6fe2c13dd00afd Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 18:31:48 +0800 Subject: [PATCH 05/35] better fix --- sklearn/model_selection/_validation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 176494463408d..b934448032afe 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -730,14 +730,12 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - if method in ['decision_function', 'predict_proba', 'predict_log_proba']: - n_classes = len(set(y)) + n_classes = len(set(y)) + if not n_classes == len(estimator.classes_) and method in \ + ['decision_function', 'predict_proba', 'predict_log_proba']: predictions_ = np.zeros((X_test.shape[0], n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: - if n_classes == 2: - predictions_ = predictions - else: - predictions_[:, estimator.classes_[-1]] = predictions + predictions_[:, estimator.classes_[-1]] = predictions else: predictions_[:, estimator.classes_] = predictions predictions = predictions_ From a0acb06aa631c86133e7f4e86a494000d4fe2054 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 18:39:03 +0800 Subject: [PATCH 06/35] fix conflict --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index b934448032afe..9760ed55fcba7 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -733,7 +733,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, n_classes = len(set(y)) if not n_classes == len(estimator.classes_) and method in \ ['decision_function', 'predict_proba', 'predict_log_proba']: - predictions_ = np.zeros((X_test.shape[0], n_classes)) + predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: predictions_[:, estimator.classes_[-1]] = predictions else: From 08bc4f998bb3a35caf37dd89f01e533124ef3847 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 18:56:23 +0800 Subject: [PATCH 07/35] fix broken --- sklearn/model_selection/_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9760ed55fcba7..e1c28452e8b22 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -731,8 +731,8 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, func = getattr(estimator, method) predictions = func(X_test) n_classes = len(set(y)) - if not n_classes == len(estimator.classes_) and method in \ - ['decision_function', 'predict_proba', 'predict_log_proba']: + if method in ['decision_function', 'predict_proba', 'predict_log_proba'] \ + and not n_classes == len(estimator.classes_): predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: predictions_[:, estimator.classes_[-1]] = predictions From bfee76976fb9060af7db2cab60214a5095ceb414 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 19:03:08 +0800 Subject: [PATCH 08/35] only calculate n_classes if one of 'decision_function', 'predict_proba', 'predict_log_proba' --- sklearn/model_selection/_validation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e1c28452e8b22..1a6262416aa92 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -730,15 +730,15 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - n_classes = len(set(y)) - if method in ['decision_function', 'predict_proba', 'predict_log_proba'] \ - and not n_classes == len(estimator.classes_): - predictions_ = np.zeros((_num_samples(X_test), n_classes)) - if method == 'decision_function' and len(estimator.classes_) == 2: - predictions_[:, estimator.classes_[-1]] = predictions - else: - predictions_[:, estimator.classes_] = predictions - predictions = predictions_ + if method in ['decision_function', 'predict_proba', 'predict_log_proba']: + n_classes = len(set(y)) + if not n_classes == len(estimator.classes_): + predictions_ = np.zeros((_num_samples(X_test), n_classes)) + if method == 'decision_function' and len(estimator.classes_) == 2: + predictions_[:, estimator.classes_[-1]] = predictions + else: + predictions_[:, estimator.classes_] = predictions + predictions = predictions_ return predictions, test From 975b5e0bfa5ed3b2b8f92b3a2c9b9128a7e56d87 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 21:40:33 +0800 Subject: [PATCH 09/35] add test for SVC ovo in cross_val_predict --- sklearn/model_selection/_validation.py | 2 +- sklearn/model_selection/tests/test_validation.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 1a6262416aa92..942d7a3a5fb24 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -732,7 +732,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, predictions = func(X_test) if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) - if not n_classes == len(estimator.classes_): + if n_classes != len(estimator.classes_): predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: predictions_[:, estimator.classes_[-1]] = predictions diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 05f37ebb2c5eb..ac0d61a58ae32 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -42,6 +42,7 @@ from sklearn.datasets import make_regression from sklearn.datasets import load_boston from sklearn.datasets import load_iris +from sklearn.datasets import load_digits from sklearn.metrics import explained_variance_score from sklearn.metrics import make_scorer from sklearn.metrics import accuracy_score @@ -788,6 +789,13 @@ def split(self, X, y=None, groups=None): method='decision_function') assert_equal(preds.shape, (150, 3)) + X, y = load_digits(return_X_y=True) + + preds = cross_val_predict(SVC(kernel='linear', decision_function_shape='ovo'), + X, y, + method='decision_function') + assert_equal(preds.shape, (1797, 45)) + def test_cross_val_predict_input_types(): iris = load_iris() From c1d0ef4b90e54ff4713d849c727201e5138b29ed Mon Sep 17 00:00:00 2001 From: reiinakano Date: Tue, 22 Aug 2017 22:19:52 +0800 Subject: [PATCH 10/35] flake8 fix --- sklearn/model_selection/tests/test_validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index ac0d61a58ae32..e328475dbd142 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -791,7 +791,8 @@ def split(self, X, y=None, groups=None): X, y = load_digits(return_X_y=True) - preds = cross_val_predict(SVC(kernel='linear', decision_function_shape='ovo'), + preds = cross_val_predict(SVC(kernel='linear', + decision_function_shape='ovo'), X, y, method='decision_function') assert_equal(preds.shape, (1797, 45)) From a9aa2b29c00f310477dfde60086bd5b03718f876 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 31 Aug 2017 23:18:49 +0800 Subject: [PATCH 11/35] fix case of ovo and imbalanced folds for binary classification --- sklearn/model_selection/_validation.py | 12 +++++++++++- .../model_selection/tests/test_validation.py | 17 ++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 942d7a3a5fb24..6ff671990f549 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -733,11 +733,21 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): + if len(predictions.shape) == 2 and \ + predictions.shape[1] != len(estimator.classes_): + raise ValueError('Shape of output predictions does not ' + 'match number of classes in fold. ' + 'Cannot reconcile different number of ' + 'classes in different folds. To fix this, ' + 'use a cross-validation technique resulting ' + 'in properly stratified folds.') predictions_ = np.zeros((_num_samples(X_test), n_classes)) - if method == 'decision_function' and len(estimator.classes_) == 2: + if method == 'decision_function' and len(estimator.classes_) <= 2: predictions_[:, estimator.classes_[-1]] = predictions else: predictions_[:, estimator.classes_] = predictions + if method == 'decision_function' and n_classes == 2: + predictions_ = predictions_[:, 1] predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index e328475dbd142..a0d0fd3ce862f 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -52,7 +52,7 @@ from sklearn.metrics import r2_score from sklearn.metrics.scorer import check_scoring -from sklearn.linear_model import Ridge, LogisticRegression +from sklearn.linear_model import Ridge, LogisticRegression, RidgeClassifier from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC @@ -789,14 +789,25 @@ def split(self, X, y=None, groups=None): method='decision_function') assert_equal(preds.shape, (150, 3)) + X = X[:100] + y = y[:100] + preds = cross_val_predict(RidgeClassifier(), X, y, + method='decision_function', cv=KFold()) + assert_equal(preds.shape, (100,)) + X, y = load_digits(return_X_y=True) + est = SVC(kernel='linear', decision_function_shape='ovo') - preds = cross_val_predict(SVC(kernel='linear', - decision_function_shape='ovo'), + preds = cross_val_predict(est, X, y, method='decision_function') assert_equal(preds.shape, (1797, 45)) + ind = np.argsort(y) + X, y = X[ind], y[ind] + assert_raises(ValueError, cross_val_predict, est, X, y, + cv=KFold(), method='decision_function') + def test_cross_val_predict_input_types(): iris = load_iris() From b2da0afb92b512eac31b4467f3f93d2bb80bd1ef Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 31 Aug 2017 23:35:03 +0800 Subject: [PATCH 12/35] change assert_raises to assert_raise_message for ovo case --- sklearn/model_selection/tests/test_validation.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 9502ce9ce8b93..5dae2c32bc071 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -805,8 +805,15 @@ def split(self, X, y=None, groups=None): ind = np.argsort(y) X, y = X[ind], y[ind] - assert_raises(ValueError, cross_val_predict, est, X, y, - cv=KFold(), method='decision_function') + assert_raise_message(ValueError, + 'Shape of output predictions does not ' + 'match number of classes in fold. ' + 'Cannot reconcile different number of ' + 'classes in different folds. To fix this, ' + 'use a cross-validation technique resulting ' + 'in properly stratified folds.', + cross_val_predict, est, X, y, + cv=KFold(), method='decision_function') def test_cross_val_predict_input_types(): From dd29e0f15fe532ab6b2e3212d53712257c52fc8d Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 31 Aug 2017 23:38:48 +0800 Subject: [PATCH 13/35] fix flake8 linetoo long --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 5dae2c32bc071..758719698b94e 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -52,8 +52,8 @@ from sklearn.metrics import r2_score from sklearn.metrics.scorer import check_scoring -from sklearn.linear_model import Ridge, LogisticRegression, SGDClassifier, RidgeClassifier -from sklearn.linear_model import PassiveAggressiveClassifier +from sklearn.linear_model import Ridge, LogisticRegression, SGDClassifier +from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.cluster import KMeans From e63b6bc8af9a152ff79cd1e9bf7fb2a1765b7cf6 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 6 Sep 2017 01:33:31 +0800 Subject: [PATCH 14/35] add comments and clearer tests --- sklearn/model_selection/_validation.py | 4 ++++ .../model_selection/tests/test_validation.py | 22 ++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 2971d1d088552..a988cf5e52137 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -730,6 +730,10 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if n_classes != len(estimator.classes_): if len(predictions.shape) == 2 and \ predictions.shape[1] != len(estimator.classes_): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. raise ValueError('Shape of output predictions does not ' 'match number of classes in fold. ' 'Cannot reconcile different number of ' diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 758719698b94e..cdb98f82e570e 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -777,6 +777,8 @@ def split(self, X, y=None, groups=None): assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) + +def test_cross_val_predict_decision_function_shape(): X, y = make_classification(n_classes=2, n_samples=50) preds = cross_val_predict(LogisticRegression(), X, y, @@ -789,10 +791,14 @@ def split(self, X, y=None, groups=None): method='decision_function') assert_equal(preds.shape, (150, 3)) + # This specifically tests imbalanced splits for binary + # classification with decision_function. This is only + # applicable to classifiers that can be fit on a single + # class. X = X[:100] y = y[:100] preds = cross_val_predict(RidgeClassifier(), X, y, - method='decision_function', cv=KFold()) + method='decision_function', cv=KFold(2)) assert_equal(preds.shape, (100,)) X, y = load_digits(return_X_y=True) @@ -816,6 +822,20 @@ def split(self, X, y=None, groups=None): cv=KFold(), method='decision_function') +def test_cross_val_predict_predict_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='predict_proba') + assert_equal(preds.shape, (50, 2)) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='predict_proba') + assert_equal(preds.shape, (150, 3)) + + def test_cross_val_predict_input_types(): iris = load_iris() X, y = iris.data, iris.target From 624ca2c423683e957a773226f5e292b8c1a6b625 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 28 Sep 2017 01:45:31 +0800 Subject: [PATCH 15/35] improve comments and error message for OvO --- sklearn/model_selection/_validation.py | 18 +++++++++++++----- .../model_selection/tests/test_validation.py | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 823d777679161..1b8b8a233f5d2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -728,24 +728,32 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): - if len(predictions.shape) == 2 and \ + if predictions.ndim == 2 and \ predictions.shape[1] != len(estimator.classes_): # This handles the case when the shape of predictions # does not match the number of classes used to train # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. - raise ValueError('Shape of output predictions does not ' - 'match number of classes in fold. ' + raise ValueError('Output shape {} of {} does not ' + 'match number of classes ({}) in fold. ' 'Cannot reconcile different number of ' 'classes in different folds. To fix this, ' - 'use a cross-validation technique resulting ' - 'in properly stratified folds.') + 'use a cross-validation technique ' + 'resulting in properly stratified ' + 'folds.'.format(predictions.shape, method, + len(estimator.classes_))) predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) <= 2: + # In this special case, `predictions` contains a 1D array. + # It must be handled differently to fit inside the + # `predictions_` temporary array. predictions_[:, estimator.classes_[-1]] = predictions else: predictions_[:, estimator.classes_] = predictions + if method == 'decision_function' and n_classes == 2: + # In this special case, the output `predictions` must + # contain a 1D array. predictions_ = predictions_[:, 1] predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index cdb98f82e570e..0d012faa45a57 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -812,8 +812,8 @@ def test_cross_val_predict_decision_function_shape(): ind = np.argsort(y) X, y = X[ind], y[ind] assert_raise_message(ValueError, - 'Shape of output predictions does not ' - 'match number of classes in fold. ' + 'Output shape (599, 21) of decision_function ' + 'does not match number of classes (7) in fold. ' 'Cannot reconcile different number of ' 'classes in different folds. To fix this, ' 'use a cross-validation technique resulting ' From a0c7f678b0b85f1750bfd50c341df693ea761152 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 28 Sep 2017 03:20:59 +0800 Subject: [PATCH 16/35] fix .format error with L --- sklearn/model_selection/_validation.py | 4 ++-- sklearn/model_selection/tests/test_validation.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 1b8b8a233f5d2..a3403517fd5e2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -734,13 +734,13 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, # does not match the number of classes used to train # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. - raise ValueError('Output shape {} of {} does not ' + raise ValueError('Output shape (-1, {}) of {} does not ' 'match number of classes ({}) in fold. ' 'Cannot reconcile different number of ' 'classes in different folds. To fix this, ' 'use a cross-validation technique ' 'resulting in properly stratified ' - 'folds.'.format(predictions.shape, method, + 'folds.'.format(predictions.shape[1], method, len(estimator.classes_))) predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) <= 2: diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 0d012faa45a57..360b57981b1e4 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -812,7 +812,7 @@ def test_cross_val_predict_decision_function_shape(): ind = np.argsort(y) X, y = X[ind], y[ind] assert_raise_message(ValueError, - 'Output shape (599, 21) of decision_function ' + 'Output shape (-1, 21) of decision_function ' 'does not match number of classes (7) in fold. ' 'Cannot reconcile different number of ' 'classes in different folds. To fix this, ' From ed3b6a31a01ad6430625aaff213b83e869bec8aa Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 28 Sep 2017 22:55:42 +0800 Subject: [PATCH 17/35] use assert_raises_regex for better error message --- sklearn/model_selection/_validation.py | 4 ++-- .../model_selection/tests/test_validation.py | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index a3403517fd5e2..1b8b8a233f5d2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -734,13 +734,13 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, # does not match the number of classes used to train # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. - raise ValueError('Output shape (-1, {}) of {} does not ' + raise ValueError('Output shape {} of {} does not ' 'match number of classes ({}) in fold. ' 'Cannot reconcile different number of ' 'classes in different folds. To fix this, ' 'use a cross-validation technique ' 'resulting in properly stratified ' - 'folds.'.format(predictions.shape[1], method, + 'folds.'.format(predictions.shape, method, len(estimator.classes_))) predictions_ = np.zeros((_num_samples(X_test), n_classes)) if method == 'decision_function' and len(estimator.classes_) <= 2: diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 360b57981b1e4..1f0161bcde7b2 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -779,7 +779,7 @@ def split(self, X, y=None, groups=None): def test_cross_val_predict_decision_function_shape(): - X, y = make_classification(n_classes=2, n_samples=50) + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) preds = cross_val_predict(LogisticRegression(), X, y, method='decision_function') @@ -811,15 +811,15 @@ def test_cross_val_predict_decision_function_shape(): ind = np.argsort(y) X, y = X[ind], y[ind] - assert_raise_message(ValueError, - 'Output shape (-1, 21) of decision_function ' - 'does not match number of classes (7) in fold. ' - 'Cannot reconcile different number of ' - 'classes in different folds. To fix this, ' - 'use a cross-validation technique resulting ' - 'in properly stratified folds.', - cross_val_predict, est, X, y, - cv=KFold(), method='decision_function') + assert_raises_regex(ValueError, + 'Output shape \(599L?, 21L?\) of decision_function ' + 'does not match number of classes \(7\) in fold\. ' + 'Cannot reconcile different number of ' + 'classes in different folds\. To fix this, ' + 'use a cross-validation technique resulting ' + 'in properly stratified folds\.', + cross_val_predict, est, X, y, + cv=KFold(), method='decision_function') def test_cross_val_predict_predict_proba_shape(): From 8a71ef3e34b3c40f1b1ee2911c18873a726ec2f6 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 15:43:16 +0800 Subject: [PATCH 18/35] raise error in decision_function special cases. change predict_log_proba missing classes to minimum numpy value --- sklearn/model_selection/_validation.py | 67 ++++++++++++------- .../model_selection/tests/test_validation.py | 26 ++++++- 2 files changed, 65 insertions(+), 28 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 1b8b8a233f5d2..fd0e5a00b873b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -728,33 +728,50 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): - if predictions.ndim == 2 and \ - predictions.shape[1] != len(estimator.classes_): - # This handles the case when the shape of predictions - # does not match the number of classes used to train - # it with. This case is found when sklearn.svm.SVC is - # set to `decision_function_shape='ovo'`. - raise ValueError('Output shape {} of {} does not ' - 'match number of classes ({}) in fold. ' - 'Cannot reconcile different number of ' - 'classes in different folds. To fix this, ' - 'use a cross-validation technique ' - 'resulting in properly stratified ' - 'folds.'.format(predictions.shape, method, - len(estimator.classes_))) - predictions_ = np.zeros((_num_samples(X_test), n_classes)) - if method == 'decision_function' and len(estimator.classes_) <= 2: - # In this special case, `predictions` contains a 1D array. - # It must be handled differently to fit inside the - # `predictions_` temporary array. - predictions_[:, estimator.classes_[-1]] = predictions - else: + if method == 'predict_proba': + # Fill missing classes with zero + predictions_ = np.zeros((_num_samples(X_test), n_classes)) + predictions_[:, estimator.classes_] = predictions + + elif method == 'predict_log_proba': + # Fill missing classes with minimum value + predictions_ = np.full((_num_samples(X_test), n_classes), + np.finfo(X_test.dtype).min) + predictions_[:, estimator.classes_] = predictions + + else: # Special handling logic for decision_function + err_mess = 'Output shape {} of {} does not match ' \ + 'number of classes ({}) in fold. Cannot' \ + ' reconcile different number of classes' \ + ' in different folds. To fix this, use ' \ + 'a cross-validation technique resulting' \ + ' in properly stratified folds' + if predictions.ndim == 2 and \ + predictions.shape[1] != len(estimator.classes_): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. + raise ValueError(err_mess.format(predictions.shape, method, + len(estimator.classes_))) + if len(estimator.classes_) <= 2: + # In this special case, `predictions` contains a 1D array. + raise ValueError(err_mess.format(predictions.shape, method, + len(estimator.classes_))) + if n_classes == 2: + # In this special case, the estimator is trained with + # just one class. + raise ValueError('Cannot reconcile cross-validation' + ' predictions trained using' + ' only one class. To fix this, ' + 'use a cross-validation technique ' + 'resulting in properly stratified ' + 'folds.') + + predictions_ = np.full((_num_samples(X_test), n_classes), + np.finfo(X_test.dtype).min) predictions_[:, estimator.classes_] = predictions - if method == 'decision_function' and n_classes == 2: - # In this special case, the output `predictions` must - # contain a 1D array. - predictions_ = predictions_[:, 1] predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 1f0161bcde7b2..e1ae4ff69bbf6 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -797,9 +797,15 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - preds = cross_val_predict(RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) - assert_equal(preds.shape, (100,)) + assert_raises(ValueError, + 'Cannot reconcile cross-validation' + ' predictions trained using' + ' only one class. To fix this, ' + 'use a cross-validation technique ' + 'resulting in properly stratified ' + 'folds.', + cross_val_predict, RidgeClassifier(), X, y, + method='decision_function', cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -836,6 +842,20 @@ def test_cross_val_predict_predict_proba_shape(): assert_equal(preds.shape, (150, 3)) +def test_cross_val_predict_predict_log_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='predict_log_proba') + assert_equal(preds.shape, (50, 2)) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, + method='predict_log_proba') + assert_equal(preds.shape, (150, 3)) + + def test_cross_val_predict_input_types(): iris = load_iris() X, y = iris.data, iris.target From 4c0bc479866025d161c946127f1e899697677684 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 16:14:36 +0800 Subject: [PATCH 19/35] fix broken tests due to special cases of decision_function --- sklearn/model_selection/_validation.py | 4 +-- .../model_selection/tests/test_validation.py | 33 ++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index fd0e5a00b873b..cc32cabf7b8ed 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -736,7 +736,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, elif method == 'predict_log_proba': # Fill missing classes with minimum value predictions_ = np.full((_num_samples(X_test), n_classes), - np.finfo(X_test.dtype).min) + np.finfo(predictions.dtype).min) predictions_[:, estimator.classes_] = predictions else: # Special handling logic for decision_function @@ -769,7 +769,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, 'folds.') predictions_ = np.full((_num_samples(X_test), n_classes), - np.finfo(X_test.dtype).min) + np.finfo(predictions.dtype).min) predictions_[:, estimator.classes_] = predictions predictions = predictions_ diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index e1ae4ff69bbf6..4c4d7a028987c 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -797,15 +797,15 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - assert_raises(ValueError, - 'Cannot reconcile cross-validation' - ' predictions trained using' - ' only one class. To fix this, ' - 'use a cross-validation technique ' - 'resulting in properly stratified ' - 'folds.', - cross_val_predict, RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) + assert_raise_message(ValueError, + 'Cannot reconcile cross-validation' + ' predictions trained using' + ' only one class. To fix this, ' + 'use a cross-validation technique ' + 'resulting in properly stratified ' + 'folds.', + cross_val_predict, RidgeClassifier(), X, y, + method='decision_function', cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -829,7 +829,7 @@ def test_cross_val_predict_decision_function_shape(): def test_cross_val_predict_predict_proba_shape(): - X, y = make_classification(n_classes=2, n_samples=50) + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) preds = cross_val_predict(LogisticRegression(), X, y, method='predict_proba') @@ -843,7 +843,7 @@ def test_cross_val_predict_predict_proba_shape(): def test_cross_val_predict_predict_log_proba_shape(): - X, y = make_classification(n_classes=2, n_samples=50) + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) preds = cross_val_predict(LogisticRegression(), X, y, method='predict_log_proba') @@ -1296,11 +1296,12 @@ def get_expected_predictions(X, y, cv, classes, est, method): est.fit(X[train], y[train]) expected_predictions_ = func(X[test]) # To avoid 2 dimensional indexing - exp_pred_test = np.zeros((len(test), classes)) - if method is 'decision_function' and len(est.classes_) == 2: - exp_pred_test[:, est.classes_[-1]] = expected_predictions_ + if method is 'predict_proba': + exp_pred_test = np.zeros((len(test), classes)) else: - exp_pred_test[:, est.classes_] = expected_predictions_ + exp_pred_test = np.full((len(test), classes), + np.finfo(expected_predictions.dtype).min) + exp_pred_test[:, est.classes_] = expected_predictions_ expected_predictions[test] = exp_pred_test return expected_predictions @@ -1317,7 +1318,7 @@ def test_cross_val_predict_class_subset(): le = LabelEncoder() - methods = ['decision_function', 'predict_proba', 'predict_log_proba'] + methods = ['predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() From de3c3bd64e38ba4ca266fdf21d0f963693859c57 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 16:39:28 +0800 Subject: [PATCH 20/35] add modified test for decision_function behavior that does not trigger edge cases --- sklearn/model_selection/_validation.py | 9 ------ .../model_selection/tests/test_validation.py | 30 +++++++++++++------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index cc32cabf7b8ed..8d24c3d002e88 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -758,15 +758,6 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, # In this special case, `predictions` contains a 1D array. raise ValueError(err_mess.format(predictions.shape, method, len(estimator.classes_))) - if n_classes == 2: - # In this special case, the estimator is trained with - # just one class. - raise ValueError('Cannot reconcile cross-validation' - ' predictions trained using' - ' only one class. To fix this, ' - 'use a cross-validation technique ' - 'resulting in properly stratified ' - 'folds.') predictions_ = np.full((_num_samples(X_test), n_classes), np.finfo(predictions.dtype).min) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4c4d7a028987c..e4f68630d6bba 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -797,15 +797,15 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - assert_raise_message(ValueError, - 'Cannot reconcile cross-validation' - ' predictions trained using' - ' only one class. To fix this, ' - 'use a cross-validation technique ' - 'resulting in properly stratified ' - 'folds.', - cross_val_predict, RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) + assert_raises_regex(ValueError, + 'Output shape \(50L?,) of decision_function' + ' does not match number of classes \(1\) ' + 'in fold. Cannot reconcile different number' + ' of classes in different folds. To fix ' + 'this, use a cross-validation technique ' + 'resulting in properly stratified folds', + cross_val_predict, RidgeClassifier(), X, y, + method='decision_function', cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -1347,6 +1347,18 @@ def test_cross_val_predict_class_subset(): est, method) assert_array_almost_equal(expected_predictions, predictions) + # Special test for decision_function. This makes sure not to trigger + # any of the numerous edge cases in decision_function + X = np.arange(100).reshape(50, 2) + y = np.array([x//10 for x in range(100)]) + + est = LogisticRegression() + predictions = cross_val_predict(est, X, y, method='decision_function', + cv=kfold3) + expected_predictions = get_expected_predictions(X, y, kfold3, classes, + est, 'decision_function') + assert_array_almost_equal(expected_predictions, predictions) + def test_score_memmap(): # Ensure a scalar score of memmap type is accepted From b3b350a87b9982cc0f0f5b03460b03e4db6296da Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 16:53:25 +0800 Subject: [PATCH 21/35] fix typos --- sklearn/model_selection/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index e4f68630d6bba..4079b4e0d2082 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -819,11 +819,11 @@ def test_cross_val_predict_decision_function_shape(): X, y = X[ind], y[ind] assert_raises_regex(ValueError, 'Output shape \(599L?, 21L?\) of decision_function ' - 'does not match number of classes \(7\) in fold\. ' + 'does not match number of classes \(7\) in fold. ' 'Cannot reconcile different number of ' - 'classes in different folds\. To fix this, ' + 'classes in different folds. To fix this, ' 'use a cross-validation technique resulting ' - 'in properly stratified folds\.', + 'in properly stratified folds.', cross_val_predict, est, X, y, cv=KFold(), method='decision_function') @@ -1349,7 +1349,7 @@ def test_cross_val_predict_class_subset(): # Special test for decision_function. This makes sure not to trigger # any of the numerous edge cases in decision_function - X = np.arange(100).reshape(50, 2) + X = np.arange(200).reshape(100, 2) y = np.array([x//10 for x in range(100)]) est = LogisticRegression() From af5f9ad5827617434407be5726773587b88140fd Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 19:23:53 +0800 Subject: [PATCH 22/35] fix typos --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4079b4e0d2082..cf932f88403b8 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -798,7 +798,7 @@ def test_cross_val_predict_decision_function_shape(): X = X[:100] y = y[:100] assert_raises_regex(ValueError, - 'Output shape \(50L?,) of decision_function' + 'Output shape \(50L?,\) of decision_function' ' does not match number of classes \(1\) ' 'in fold. Cannot reconcile different number' ' of classes in different folds. To fix ' @@ -1355,7 +1355,7 @@ def test_cross_val_predict_class_subset(): est = LogisticRegression() predictions = cross_val_predict(est, X, y, method='decision_function', cv=kfold3) - expected_predictions = get_expected_predictions(X, y, kfold3, classes, + expected_predictions = get_expected_predictions(X, y, kfold3, 10, est, 'decision_function') assert_array_almost_equal(expected_predictions, predictions) From cee405498f5a5c75d9702ca9441e6aede46ae5c1 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 19:50:53 +0800 Subject: [PATCH 23/35] escape regex . --- sklearn/model_selection/tests/test_validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index cf932f88403b8..f9c7ac29953ad 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -819,11 +819,11 @@ def test_cross_val_predict_decision_function_shape(): X, y = X[ind], y[ind] assert_raises_regex(ValueError, 'Output shape \(599L?, 21L?\) of decision_function ' - 'does not match number of classes \(7\) in fold. ' + 'does not match number of classes \(7\) in fold\. ' 'Cannot reconcile different number of ' - 'classes in different folds. To fix this, ' + 'classes in different folds\. To fix this, ' 'use a cross-validation technique resulting ' - 'in properly stratified folds.', + 'in properly stratified folds\.', cross_val_predict, est, X, y, cv=KFold(), method='decision_function') From 7b6545079ca13f9e84988ee2d50d8e7fd0eb3a91 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Wed, 11 Oct 2017 19:56:50 +0800 Subject: [PATCH 24/35] escape regex . --- sklearn/model_selection/tests/test_validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index f9c7ac29953ad..ac3b81a7db9b9 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -819,11 +819,11 @@ def test_cross_val_predict_decision_function_shape(): X, y = X[ind], y[ind] assert_raises_regex(ValueError, 'Output shape \(599L?, 21L?\) of decision_function ' - 'does not match number of classes \(7\) in fold\. ' + 'does not match number of classes \(7\) in fold. ' 'Cannot reconcile different number of ' - 'classes in different folds\. To fix this, ' + 'classes in different folds. To fix this, ' 'use a cross-validation technique resulting ' - 'in properly stratified folds\.', + 'in properly stratified folds', cross_val_predict, est, X, y, cv=KFold(), method='decision_function') From 9f97b89b02783272db11a3b25b9ed4311ad6a365 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 12 Oct 2017 14:51:28 +0800 Subject: [PATCH 25/35] address comments. one unaddressed comment --- sklearn/model_selection/_validation.py | 16 ++++++------ .../model_selection/tests/test_validation.py | 25 ++++++------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8d24c3d002e88..56ec15a24565d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -740,14 +740,14 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, predictions_[:, estimator.classes_] = predictions else: # Special handling logic for decision_function - err_mess = 'Output shape {} of {} does not match ' \ - 'number of classes ({}) in fold. Cannot' \ - ' reconcile different number of classes' \ - ' in different folds. To fix this, use ' \ - 'a cross-validation technique resulting' \ - ' in properly stratified folds' - if predictions.ndim == 2 and \ - predictions.shape[1] != len(estimator.classes_): + err_mess = ('Output shape {} of {} does not match ' + 'number of classes ({}) in fold. Cannot' + ' reconcile different number of classes' + ' in different folds. To fix this, use ' + 'a cross-validation technique resulting' + ' in properly stratified folds') + if (predictions.ndim == 2 and + predictions.shape[1] != len(estimator.classes_)): # This handles the case when the shape of predictions # does not match the number of classes used to train # it with. This case is found when sklearn.svm.SVC is diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index ac3b81a7db9b9..125731aac5cdc 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -825,7 +825,7 @@ def test_cross_val_predict_decision_function_shape(): 'use a cross-validation technique resulting ' 'in properly stratified folds', cross_val_predict, est, X, y, - cv=KFold(), method='decision_function') + cv=KFold(n_splits=3), method='decision_function') def test_cross_val_predict_predict_proba_shape(): @@ -1309,16 +1309,16 @@ def get_expected_predictions(X, y, cv, classes, est, method): def test_cross_val_predict_class_subset(): - X = np.arange(8).reshape(4, 2) - y = np.array([0, 0, 1, 2]) - classes = 3 + X = np.arange(200).reshape(100, 2) + y = np.array([x//10 for x in range(100)]) + classes = 10 kfold3 = KFold(n_splits=3) kfold4 = KFold(n_splits=4) le = LabelEncoder() - methods = ['predict_proba', 'predict_log_proba'] + methods = ['decision_function','predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() @@ -1339,7 +1339,8 @@ def test_cross_val_predict_class_subset(): assert_array_almost_equal(expected_predictions, predictions) # Testing unordered labels - y = [1, 1, -4, 6] + y = np.array([x//10 for x in range(-100, 100, 2)]) + y = shuffle(y, random_state=0) predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) y = le.fit_transform(y) @@ -1347,18 +1348,6 @@ def test_cross_val_predict_class_subset(): est, method) assert_array_almost_equal(expected_predictions, predictions) - # Special test for decision_function. This makes sure not to trigger - # any of the numerous edge cases in decision_function - X = np.arange(200).reshape(100, 2) - y = np.array([x//10 for x in range(100)]) - - est = LogisticRegression() - predictions = cross_val_predict(est, X, y, method='decision_function', - cv=kfold3) - expected_predictions = get_expected_predictions(X, y, kfold3, 10, - est, 'decision_function') - assert_array_almost_equal(expected_predictions, predictions) - def test_score_memmap(): # Ensure a scalar score of memmap type is accepted From d695b9523a37fa7e2cfe7c5c943becebed656b63 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 12 Oct 2017 15:00:51 +0800 Subject: [PATCH 26/35] simplify code --- sklearn/model_selection/_validation.py | 27 ++++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 56ec15a24565d..7dc46cfb3eaa6 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -728,18 +728,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): - if method == 'predict_proba': - # Fill missing classes with zero - predictions_ = np.zeros((_num_samples(X_test), n_classes)) - predictions_[:, estimator.classes_] = predictions - - elif method == 'predict_log_proba': - # Fill missing classes with minimum value - predictions_ = np.full((_num_samples(X_test), n_classes), - np.finfo(predictions.dtype).min) - predictions_[:, estimator.classes_] = predictions - - else: # Special handling logic for decision_function + if method == 'decision_function': err_mess = ('Output shape {} of {} does not match ' 'number of classes ({}) in fold. Cannot' ' reconcile different number of classes' @@ -759,11 +748,15 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, raise ValueError(err_mess.format(predictions.shape, method, len(estimator.classes_))) - predictions_ = np.full((_num_samples(X_test), n_classes), - np.finfo(predictions.dtype).min) - predictions_[:, estimator.classes_] = predictions - - predictions = predictions_ + float_min = np.finfo(predictions.dtype).min + default_values = {'decision_function': float_min, + 'predict_log_proba': float_min, + 'predict_proba': 0} + predictions_for_all_classes = np.full((_num_samples(predictions), + n_classes), + default_values[method]) + predictions_for_all_classes[:, estimator.classes_] = predictions + predictions = predictions_for_all_classes return predictions, test From dc42b27317af79f4a761a93b208c38f084dd5b9c Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 12 Oct 2017 15:01:58 +0800 Subject: [PATCH 27/35] flake --- sklearn/model_selection/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 125731aac5cdc..2e6a51b18ca86 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1318,7 +1318,7 @@ def test_cross_val_predict_class_subset(): le = LabelEncoder() - methods = ['decision_function','predict_proba', 'predict_log_proba'] + methods = ['decision_function', 'predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() From 564bf5ecbc23d4951f0023aea18f5d023a5898a8 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Thu, 12 Oct 2017 15:22:48 +0800 Subject: [PATCH 28/35] wrong classes range --- sklearn/model_selection/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2e6a51b18ca86..7547f66c3139a 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1339,7 +1339,7 @@ def test_cross_val_predict_class_subset(): assert_array_almost_equal(expected_predictions, predictions) # Testing unordered labels - y = np.array([x//10 for x in range(-100, 100, 2)]) + y = np.array([x//20 for x in range(-100, 100, 2)]) y = shuffle(y, random_state=0) predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) From 3b5cb5e347a99b5aecc3e9b16e994ee7048a6d06 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Sat, 14 Oct 2017 02:44:16 +0800 Subject: [PATCH 29/35] address comments. adjust error message --- sklearn/model_selection/_validation.py | 24 +++++++++++-------- .../model_selection/tests/test_validation.py | 20 +++++++--------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 7dc46cfb3eaa6..fc8f8cba30f77 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -729,24 +729,28 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, n_classes = len(set(y)) if n_classes != len(estimator.classes_): if method == 'decision_function': - err_mess = ('Output shape {} of {} does not match ' - 'number of classes ({}) in fold. Cannot' - ' reconcile different number of classes' - ' in different folds. To fix this, use ' - 'a cross-validation technique resulting' - ' in properly stratified folds') + err_mess = ('To fix this, use a cross-validation ' + 'technique resulting in properly stratified folds') if (predictions.ndim == 2 and predictions.shape[1] != len(estimator.classes_)): # This handles the case when the shape of predictions # does not match the number of classes used to train # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. - raise ValueError(err_mess.format(predictions.shape, method, - len(estimator.classes_))) + raise ValueError('Output shape {} of {} does not match ' + 'number of classes ({}) in fold. Cannot' + ' reconcile different number of classes' + ' in different folds. {}'.format( + predictions.shape, method, + len(estimator.classes_), err_mess + )) if len(estimator.classes_) <= 2: # In this special case, `predictions` contains a 1D array. - raise ValueError(err_mess.format(predictions.shape, method, - len(estimator.classes_))) + raise ValueError('Only {} class/es in training fold, this' + ' is not supported for decision_function' + ' with imbalanced folds. {}'.format( + len(estimator.classes_), err_mess + )) float_min = np.finfo(predictions.dtype).min default_values = {'decision_function': float_min, diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 7547f66c3139a..e7752ec307965 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -797,15 +797,14 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - assert_raises_regex(ValueError, - 'Output shape \(50L?,\) of decision_function' - ' does not match number of classes \(1\) ' - 'in fold. Cannot reconcile different number' - ' of classes in different folds. To fix ' - 'this, use a cross-validation technique ' - 'resulting in properly stratified folds', - cross_val_predict, RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) + assert_raise_message(ValueError, + 'Only 1 class/es in training fold, this' + ' is not supported for decision_function' + ' with imbalanced folds. To fix ' + 'this, use a cross-validation technique ' + 'resulting in properly stratified folds', + cross_val_predict, RidgeClassifier(), X, y, + method='decision_function', cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -1339,8 +1338,7 @@ def test_cross_val_predict_class_subset(): assert_array_almost_equal(expected_predictions, predictions) # Testing unordered labels - y = np.array([x//20 for x in range(-100, 100, 2)]) - y = shuffle(y, random_state=0) + y = shuffle(np.repeat(range(10), 10), random_state=0) predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) y = le.fit_transform(y) From e5013bd6b81fd786068fa1eda98c163519716478 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Sat, 14 Oct 2017 02:52:38 +0800 Subject: [PATCH 30/35] add warning --- sklearn/model_selection/_validation.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index fc8f8cba30f77..c11e87ceafb1e 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -728,9 +728,16 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): + err_mess = ('To fix this, use a cross-validation ' + 'technique resulting in properly ' + 'stratified folds') + warnings.warn("Number of classes in training fold ({}) does" + "not match total number of classes ({}). " + "Results may not be appropriate for your use case." + " {}".format(len(estimator.classes_), + n_classes, err_mess) + ) if method == 'decision_function': - err_mess = ('To fix this, use a cross-validation ' - 'technique resulting in properly stratified folds') if (predictions.ndim == 2 and predictions.shape[1] != len(estimator.classes_)): # This handles the case when the shape of predictions @@ -743,14 +750,14 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, ' in different folds. {}'.format( predictions.shape, method, len(estimator.classes_), err_mess - )) + )) if len(estimator.classes_) <= 2: # In this special case, `predictions` contains a 1D array. raise ValueError('Only {} class/es in training fold, this' ' is not supported for decision_function' ' with imbalanced folds. {}'.format( len(estimator.classes_), err_mess - )) + )) float_min = np.finfo(predictions.dtype).min default_values = {'decision_function': float_min, From 75a0c59f8d41e376fd31cd6fd30ac762554ae241 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Sat, 14 Oct 2017 02:58:21 +0800 Subject: [PATCH 31/35] change warning to runtimewarning --- sklearn/model_selection/_validation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index c11e87ceafb1e..40e29dc503bbc 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -735,8 +735,8 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, "not match total number of classes ({}). " "Results may not be appropriate for your use case." " {}".format(len(estimator.classes_), - n_classes, err_mess) - ) + n_classes, err_mess), + RuntimeWarning) if method == 'decision_function': if (predictions.ndim == 2 and predictions.shape[1] != len(estimator.classes_)): @@ -749,15 +749,13 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, ' reconcile different number of classes' ' in different folds. {}'.format( predictions.shape, method, - len(estimator.classes_), err_mess - )) + len(estimator.classes_), err_mess)) if len(estimator.classes_) <= 2: # In this special case, `predictions` contains a 1D array. raise ValueError('Only {} class/es in training fold, this' ' is not supported for decision_function' ' with imbalanced folds. {}'.format( - len(estimator.classes_), err_mess - )) + len(estimator.classes_), err_mess)) float_min = np.finfo(predictions.dtype).min default_values = {'decision_function': float_min, From 042e45fca5a6257e0e205b0144da5e4fbf3d35b1 Mon Sep 17 00:00:00 2001 From: reiinakano Date: Sat, 14 Oct 2017 03:04:19 +0800 Subject: [PATCH 32/35] add test for the warning --- sklearn/model_selection/tests/test_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index e7752ec307965..1161d2d4e4dfe 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -777,6 +777,10 @@ def split(self, X, y=None, groups=None): assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) + X, y = load_iris(return_X_y=True) + assert_warns(RuntimeWarning, cross_val_predict, LogisticRegression(), + X, y, method='predict_proba', cv=KFold(2)) + def test_cross_val_predict_decision_function_shape(): X, y = make_classification(n_classes=2, n_samples=50, random_state=0) From bc405c8ccf41dcae67ade093eb8fe4a70c7178ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Oct 2017 10:10:30 +0200 Subject: [PATCH 33/35] Use assert_warns_message rather than assert_warns Other minor fixes --- sklearn/model_selection/_validation.py | 35 ++++++++++--------- .../model_selection/tests/test_validation.py | 10 ++++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 40e29dc503bbc..8c4cbf2859102 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -728,14 +728,15 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: n_classes = len(set(y)) if n_classes != len(estimator.classes_): - err_mess = ('To fix this, use a cross-validation ' - 'technique resulting in properly ' - 'stratified folds') - warnings.warn("Number of classes in training fold ({}) does" - "not match total number of classes ({}). " - "Results may not be appropriate for your use case." - " {}".format(len(estimator.classes_), - n_classes, err_mess), + recommendation = ( + 'To fix this, use a cross-validation ' + 'technique resulting in properly ' + 'stratified folds') + warnings.warn('Number of classes in training fold ({}) does ' + 'not match total number of classes ({}). ' + 'Results may not be appropriate for your use case. ' + '{}'.format(len(estimator.classes_), + n_classes, recommendation), RuntimeWarning) if method == 'decision_function': if (predictions.ndim == 2 and @@ -745,17 +746,19 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. raise ValueError('Output shape {} of {} does not match ' - 'number of classes ({}) in fold. Cannot' - ' reconcile different number of classes' - ' in different folds. {}'.format( + 'number of classes ({}) in fold. Cannot ' + 'reconcile different number of classes ' + 'in different folds. {}'.format( predictions.shape, method, - len(estimator.classes_), err_mess)) + len(estimator.classes_), + recommendation)) if len(estimator.classes_) <= 2: # In this special case, `predictions` contains a 1D array. - raise ValueError('Only {} class/es in training fold, this' - ' is not supported for decision_function' - ' with imbalanced folds. {}'.format( - len(estimator.classes_), err_mess)) + raise ValueError('Only {} class/es in training fold, this ' + 'is not supported for decision_function ' + 'with imbalanced folds. {}'.format( + len(estimator.classes_), + recommendation)) float_min = np.finfo(predictions.dtype).min default_values = {'decision_function': float_min, diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 1161d2d4e4dfe..412dcf5ac7cbe 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -22,6 +22,7 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import assert_warns_message from sklearn.utils.mocking import CheckingClassifier, MockDataFrame from sklearn.model_selection import cross_val_score @@ -778,8 +779,13 @@ def split(self, X, y=None, groups=None): assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) X, y = load_iris(return_X_y=True) - assert_warns(RuntimeWarning, cross_val_predict, LogisticRegression(), - X, y, method='predict_proba', cv=KFold(2)) + + warning_message = ('Number of classes in training fold (2) does ' + 'not match total number of classes (3). ' + 'Results may not be appropriate for your use case.') + assert_warns_message(RuntimeWarning, warning_message, + cross_val_predict, LogisticRegression(), + X, y, method='predict_proba', cv=KFold(2)) def test_cross_val_predict_decision_function_shape(): From e3b963d8a75e313b1eade657514400245615a16c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 19 Oct 2017 09:06:28 +1100 Subject: [PATCH 34/35] Note on class-absent replacement values --- sklearn/model_selection/_validation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8c4cbf2859102..481e5b1f7a8e1 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -625,6 +625,15 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, predictions : ndarray This is the result of calling ``method`` + Notes + ----- + In the case that one or more classes are absent in a training portion, a + default score needs to be assigned to all instances for that class if + ``method`` produces columns per class, as in {'decision_function', + 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is + 0. In order to ensure finite output, we approximate negative infinity by + the minimum finite float value for the dtype in other cases. + Examples -------- >>> from sklearn import datasets, linear_model From 6639b79ffa0f11f1a62e71fdac7a7a28f68b0e40 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 19 Oct 2017 09:07:13 +1100 Subject: [PATCH 35/35] Improve error message --- sklearn/model_selection/_validation.py | 7 ++++--- sklearn/model_selection/tests/test_validation.py | 5 +---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 481e5b1f7a8e1..62f36c9d02c83 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -755,9 +755,10 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, # it with. This case is found when sklearn.svm.SVC is # set to `decision_function_shape='ovo'`. raise ValueError('Output shape {} of {} does not match ' - 'number of classes ({}) in fold. Cannot ' - 'reconcile different number of classes ' - 'in different folds. {}'.format( + 'number of classes ({}) in fold. ' + 'Irregular decision_function outputs ' + 'are not currently supported by ' + 'cross_val_predict'.format( predictions.shape, method, len(estimator.classes_), recommendation)) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 412dcf5ac7cbe..aeddf31b31889 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -829,10 +829,7 @@ def test_cross_val_predict_decision_function_shape(): assert_raises_regex(ValueError, 'Output shape \(599L?, 21L?\) of decision_function ' 'does not match number of classes \(7\) in fold. ' - 'Cannot reconcile different number of ' - 'classes in different folds. To fix this, ' - 'use a cross-validation technique resulting ' - 'in properly stratified folds', + 'Irregular decision_function .*', cross_val_predict, est, X, y, cv=KFold(n_splits=3), method='decision_function')