From 0832bdac941cfa49bf2319c861775435cdc7294d Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Mon, 31 Oct 2016 18:14:18 +0530
Subject: [PATCH 01/13] label binarizer not used consistently in CalibratedClassifierCV

---
 sklearn/calibration.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index ed3e85b643815..8e475982b6b82 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -178,9 +178,11 @@ def fit(self, X, y, sample_weight=None):
                     this_estimator, method=self.method)
                 if sample_weight is not None:
                     calibrated_classifier.fit(X[test], y[test],
+                                              np.unique(y[train]),
                                               sample_weight[test])
                 else:
-                    calibrated_classifier.fit(X[test], y[test])
+                    calibrated_classifier.fit(X[test], y[test],
+                                              np.unique(y[train]))
                 self.calibrated_classifiers_.append(calibrated_classifier)
 
         return self
@@ -289,7 +291,7 @@ def _preproc(self, X):
 
         return df, idx_pos_class
 
-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, classes=None, sample_weight=None):
         """Calibrate the fitted model
 
         Parameters
@@ -300,6 +302,10 @@ def fit(self, X, y, sample_weight=None):
         y : array-like, shape (n_samples,)
             Target values.
 
+        classes : array-like, shape (n_classes,)
+            Contains unique classes used to fit the base estimator.
+            if None, then classes is extracted from the given target values.
+
         sample_weight : array-like, shape = [n_samples] or None
             Sample weights. If None, then samples are equally weighted.
 
@@ -309,7 +315,11 @@ def fit(self, X, y, sample_weight=None):
             Returns an instance of self.
         """
         lb = LabelBinarizer()
-        Y = lb.fit_transform(y)
+        if classes is None:
+            lb.fit(y)
+        else:
+            lb.fit(classes)
+        Y = lb.transform(y)
         self.classes_ = lb.classes_
 
         df, idx_pos_class = self._preproc(X)

From c57c4f141aa6838211f621c5d38559a835e2fe51 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Mon, 31 Oct 2016 18:37:08 +0530
Subject: [PATCH 02/13] changed position of classes argument to make old tests run

---
 sklearn/calibration.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 8e475982b6b82..2a83ce8e50535 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -178,11 +178,11 @@ def fit(self, X, y, sample_weight=None):
                     this_estimator, method=self.method)
                 if sample_weight is not None:
                     calibrated_classifier.fit(X[test], y[test],
-                                              np.unique(y[train]),
-                                              sample_weight[test])
+                                              sample_weight[test],
+                                              np.unique(y[train]))
                 else:
                     calibrated_classifier.fit(X[test], y[test],
-                                              np.unique(y[train]))
+                                              classes=np.unique(y[train]))
                 self.calibrated_classifiers_.append(calibrated_classifier)
 
         return self
@@ -291,7 +291,7 @@ def _preproc(self, X):
 
         return df, idx_pos_class
 
-    def fit(self, X, y, classes=None, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, classes=None):
         """Calibrate the fitted model
 
         Parameters
@@ -302,13 +302,13 @@ def fit(self, X, y, classes=None, sample_weight=None):
         y : array-like, shape (n_samples,)
             Target values.
 
+        sample_weight : array-like, shape = [n_samples] or None
+            Sample weights. If None, then samples are equally weighted.
+
         classes : array-like, shape (n_classes,)
            Contains unique classes used to fit the base estimator.
            if None, then classes is extracted from the given target values.
 
-        sample_weight : array-like, shape = [n_samples] or None
-            Sample weights. If None, then samples are equally weighted.
-
         Returns
         -------
         self : object

From 2b26c8232042b6b148b7e241734a340b3fd16930 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Mon, 31 Oct 2016 19:30:58 +0530
Subject: [PATCH 03/13] moved parameter to constructor and added test

---
 sklearn/calibration.py            | 29 ++++++++++++++++-------------
 sklearn/tests/test_calibration.py | 16 ++++++++++++++++
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 2a83ce8e50535..1700d4cf5de2f 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -175,14 +175,13 @@ def fit(self, X, y, sample_weight=None):
                 this_estimator.fit(X[train], y[train])
 
                 calibrated_classifier = _CalibratedClassifier(
-                    this_estimator, method=self.method)
+                    this_estimator, method=self.method,
+                    classes=np.unique(y[train]))
                 if sample_weight is not None:
                     calibrated_classifier.fit(X[test], y[test],
-                                              sample_weight[test],
-                                              np.unique(y[train]))
+                                              sample_weight[test])
                 else:
-                    calibrated_classifier.fit(X[test], y[test],
-                                              classes=np.unique(y[train]))
+                    calibrated_classifier.fit(X[test], y[test])
                 self.calibrated_classifiers_.append(calibrated_classifier)
 
         return self
@@ -255,6 +254,11 @@ class _CalibratedClassifier(object):
         corresponds to Platt's method or 'isotonic' which is a
         non-parametric approach based on isotonic regression.
 
+    classes : array-like, shape (n_classes,)
+        Contains unique classes used to fit the base estimator.
+        if None, then classes is extracted from the given target values
+        in fit().
+
     References
     ----------
     .. [1] Obtaining calibrated probability estimates from decision trees
@@ -269,9 +273,10 @@ class _CalibratedClassifier(object):
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005
     """
-    def __init__(self, base_estimator, method='sigmoid'):
+    def __init__(self, base_estimator, method='sigmoid', classes=None):
         self.base_estimator = base_estimator
         self.method = method
+        self.classes = classes
 
     def _preproc(self, X):
         n_classes = len(self.classes_)
@@ -291,7 +296,7 @@ def _preproc(self, X):
 
         return df, idx_pos_class
 
-    def fit(self, X, y, sample_weight=None, classes=None):
+    def fit(self, X, y, sample_weight=None):
         """Calibrate the fitted model
 
         Parameters
@@ -305,20 +310,18 @@ def fit(self, X, y, sample_weight=None, classes=None):
         sample_weight : array-like, shape = [n_samples] or None
             Sample weights. If None, then samples are equally weighted.
 
-        classes : array-like, shape (n_classes,)
-            Contains unique classes used to fit the base estimator.
-            if None, then classes is extracted from the given target values.
-
         Returns
         -------
         self : object
            Returns an instance of self.
""" + lb = LabelBinarizer() - if classes is None: + if self.classes is None: lb.fit(y) else: - lb.fit(classes) + lb.fit(self.classes) + Y = lb.transform(y) self.classes_ = lb.classes_ diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 68a6efb395971..6981fc23190d3 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -3,6 +3,7 @@ import numpy as np from scipy import sparse +from sklearn.model_selection import LeaveOneOut from sklearn.utils.testing import (assert_array_almost_equal, assert_equal, assert_greater, assert_almost_equal, @@ -159,6 +160,7 @@ def test_calibration_multiclass(): def softmax(y_pred): e = np.exp(-y_pred) return e / e.sum(axis=1).reshape(-1, 1) + uncalibrated_log_loss = \ log_loss(y_test, softmax(clf.decision_function(X_test))) calibrated_log_loss = log_loss(y_test, probas) @@ -275,3 +277,17 @@ def test_calibration_nan_imputer(): clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic') clf_c.fit(X, y) clf_c.predict(X) + + +def test_calibration_prob_sum(): + """Test that sum of probabilities is 1""" + num_classes = 2 + X, y = make_classification(n_samples=100, n_features=20, + n_informative=18, n_redundant=2, + n_classes=num_classes) + clf = LinearSVC(C=1.0) + clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut()) + clf_prob.fit(X, y) + + probs = clf_prob.predict_proba(X) + assert_array_equal(probs.sum(axis=1), np.ones(probs.shape[0])) From 693f3a89efe2fd7f3c10822519c996a9e8ee8f58 Mon Sep 17 00:00:00 2001 From: srivatsan-ramesh Date: Tue, 1 Nov 2016 17:21:23 +0530 Subject: [PATCH 04/13] added test where train set doesnt have all classes --- sklearn/calibration.py | 2 +- sklearn/tests/test_calibration.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 1700d4cf5de2f..0148f61a065ed 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -176,7 +176,7 @@ def fit(self, X, y, sample_weight=None): calibrated_classifier = _CalibratedClassifier( this_estimator, method=self.method, - classes=np.unique(y[train])) + classes=self.classes_) if sample_weight is not None: calibrated_classifier.fit(X[test], y[test], sample_weight[test]) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 6981fc23190d3..7aa8a3a46fb81 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -280,14 +280,26 @@ def test_calibration_nan_imputer(): def test_calibration_prob_sum(): - """Test that sum of probabilities is 1""" + # Test that sum of probabilities is 1. 
A non-regression test for + # issue #7796 num_classes = 2 - X, y = make_classification(n_samples=100, n_features=20, - n_informative=18, n_redundant=2, + X, y = make_classification(n_samples=10, n_features=5, n_classes=num_classes) clf = LinearSVC(C=1.0) clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut()) clf_prob.fit(X, y) probs = clf_prob.predict_proba(X) - assert_array_equal(probs.sum(axis=1), np.ones(probs.shape[0])) + assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0])) + + # Test to check calibration works fine when train set in a test-train + # split does not contain all classes + # Since this test uses LOO, at each iteration train set will not contain a + # class label + X = np.random.randn(10, 5) + y = np.arange(10) + clf = LinearSVC(C=1.0) + clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut()) + clf_prob.fit(X, y) + probs = clf_prob.predict_proba(X) + assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0])) From b62cb4ccf1e2ad29d33dd1d0a089a9b2c1aa0bd3 Mon Sep 17 00:00:00 2001 From: srivatsan-ramesh Date: Thu, 3 Nov 2016 01:54:38 +0530 Subject: [PATCH 05/13] CalibratedClassifierCV can now handle cases where train set doesnt contain all labels --- sklearn/calibration.py | 37 ++++++++++++++++++++----------- sklearn/tests/test_calibration.py | 4 +++- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 0148f61a065ed..33ab6eaaaae6a 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -14,9 +14,10 @@ import numpy as np from scipy.optimize import fmin_bfgs +from sklearn.preprocessing import LabelEncoder from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone -from .preprocessing import LabelBinarizer +from .preprocessing import label_binarize, LabelBinarizer from .utils import check_X_y, check_array, indexable, column_or_1d from .utils.validation import check_is_fitted from .utils.fixes import signature @@ -50,7 +51,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's method or 'isotonic' which is a non-parametric approach. It is not advised to use isotonic calibration - with too few calibration samples ``(<<1000)`` since it tends to overfit. + with too few calibration samples ``(<<1000)`` since it tends to + overfit. Use sigmoids (Platt's calibration) in this case. cv : integer, cross-validation generator, iterable or "prefit", optional @@ -64,7 +66,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): For integer/None inputs, if ``y`` is binary or multiclass, :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` - is neither binary nor multiclass, :class:`sklearn.model_selection.KFold` + is neither binary nor multiclass, + :class:`sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various @@ -97,6 +100,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. 
Caruana, ICML 2005 """ + def __init__(self, base_estimator=None, method='sigmoid', cv=3): self.base_estimator = base_estimator self.method = method @@ -124,15 +128,16 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], force_all_finite=False) X, y = indexable(X, y) - lb = LabelBinarizer().fit(y) - self.classes_ = lb.classes_ + le = LabelBinarizer().fit(y) + self.classes_ = le.classes_ # Check that each cross-validation fold can have at least one # example per class n_folds = self.cv if isinstance(self.cv, int) \ else self.cv.n_folds if hasattr(self.cv, "n_folds") else None if n_folds and \ - np.any([np.sum(y == class_) < n_folds for class_ in self.classes_]): + np.any([np.sum(y == class_) < n_folds for class_ in + self.classes_]): raise ValueError("Requesting %d-fold cross-validation but provided" " less than %d examples for at least one class." % (n_folds, n_folds)) @@ -158,7 +163,7 @@ def fit(self, X, y, sample_weight=None): fit_parameters = signature(base_estimator.fit).parameters estimator_name = type(base_estimator).__name__ if (sample_weight is not None - and "sample_weight" not in fit_parameters): + and "sample_weight" not in fit_parameters): warnings.warn("%s does not support sample_weight. Samples" " weights are only used for the calibration" " itself." % estimator_name) @@ -273,6 +278,7 @@ class _CalibratedClassifier(object): .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ + def __init__(self, base_estimator, method='sigmoid', classes=None): self.base_estimator = base_estimator self.method = method @@ -292,7 +298,11 @@ def _preproc(self, X): raise RuntimeError('classifier has no decision_function or ' 'predict_proba method.') - idx_pos_class = np.arange(df.shape[1]) + if hasattr(self.base_estimator, "classes_"): + idx_pos_class = self.label_encoder_. \ + transform(self.base_estimator.classes_) + else: + idx_pos_class = np.arange(df.shape[1]) return df, idx_pos_class @@ -316,14 +326,14 @@ def fit(self, X, y, sample_weight=None): Returns an instance of self. """ - lb = LabelBinarizer() + self.label_encoder_ = LabelEncoder() if self.classes is None: - lb.fit(y) + self.label_encoder_.fit(y) else: - lb.fit(self.classes) + self.label_encoder_.fit(self.classes) - Y = lb.transform(y) - self.classes_ = lb.classes_ + self.classes_ = self.label_encoder_.classes_ + Y = label_binarize(y, self.classes_) df, idx_pos_class = self._preproc(X) self.calibrators_ = [] @@ -460,6 +470,7 @@ class _SigmoidCalibration(BaseEstimator, RegressorMixin): b_ : float The intercept. """ + def fit(self, X, y, sample_weight=None): """Fit the model using X, y as training data. 
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 7aa8a3a46fb81..ff64eef8a4fd5 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -302,4 +302,6 @@ def test_calibration_prob_sum():
     clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
     clf_prob.fit(X, y)
     probs = clf_prob.predict_proba(X)
-    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
+    n_classes = len(y)
+    assert_array_almost_equal(probs, np.full((X.shape[0], n_classes),
+                                             1/n_classes))

From 4910004e1b0131a409f21834724e95ad17bad648 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Thu, 3 Nov 2016 02:01:18 +0530
Subject: [PATCH 06/13] fixing flake error

---
 sklearn/calibration.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 33ab6eaaaae6a..b491afcf7a652 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -51,8 +51,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
         The method to use for calibration. Can be 'sigmoid' which
         corresponds to Platt's method or 'isotonic' which is a
         non-parametric approach. It is not advised to use isotonic calibration
-        with too few calibration samples ``(<<1000)`` since it tends to
-        overfit.
+        with too few calibration samples ``(<<1000)`` since it tends to overfit.
         Use sigmoids (Platt's calibration) in this case.
 
     cv : integer, cross-validation generator, iterable or "prefit", optional
@@ -65,8 +64,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
 
         For integer/None inputs, if ``y`` is binary or multiclass,
         :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y``
-        is neither binary nor multiclass,
-        :class:`sklearn.model_selection.KFold`
+        is neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
         is used.
 
         Refer :ref:`User Guide <cross_validation>` for the various
@@ -98,7 +96,6 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005
     """
-
     def __init__(self, base_estimator=None, method='sigmoid', cv=3):
         self.base_estimator = base_estimator
         self.method = method
@@ -160,7 +157,7 @@ def fit(self, X, y, sample_weight=None):
             fit_parameters = signature(base_estimator.fit).parameters
             estimator_name = type(base_estimator).__name__
             if (sample_weight is not None
-                and "sample_weight" not in fit_parameters):
+                    and "sample_weight" not in fit_parameters):
                 warnings.warn("%s does not support sample_weight. Samples"
                               " weights are only used for the calibration"
                               " itself." % estimator_name)
@@ -278,7 +275,6 @@ class _CalibratedClassifier(object):
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005
     """
-
     def __init__(self, base_estimator, method='sigmoid', classes=None):
         self.base_estimator = base_estimator
         self.method = method
@@ -299,7 +295,7 @@ def _preproc(self, X):
                                'predict_proba method.')
 
         if hasattr(self.base_estimator, "classes_"):
-            idx_pos_class = self.label_encoder_. \
+            idx_pos_class = self.label_encoder_.\
                 transform(self.base_estimator.classes_)
         else:
             idx_pos_class = np.arange(df.shape[1])
@@ -470,7 +466,6 @@ class _SigmoidCalibration(BaseEstimator, RegressorMixin):
     b_ : float
         The intercept.
     """
-
     def fit(self, X, y, sample_weight=None):
         """Fit the model using X, y as training data.
 
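An aside on the mechanism the series has converged on by this point: a minimal sketch, not part of any patch. The array values and variable names below are illustrative only; LabelEncoder and label_binarize are the real scikit-learn utilities the patches import.

    import numpy as np
    from sklearn.preprocessing import LabelEncoder, label_binarize

    all_classes = np.array([0, 1, 2, 3])  # classes passed to the constructor
    fold_classes = np.array([0, 2])       # classes the base estimator saw in one fold

    # The encoder is fitted on the full class set, so the columns of the
    # calibrated output cover every class, even ones missing from a fold ...
    label_encoder = LabelEncoder().fit(all_classes)

    # ... and the base estimator's classes_ map to their column positions,
    # here array([0, 2]); columns for unseen classes keep zero probability.
    idx_pos_class = label_encoder.transform(fold_classes)

    # Targets are binarized against the full class set as well, so the
    # one-vs-rest columns line up with idx_pos_class.
    Y = label_binarize([0, 2, 2, 0], classes=all_classes)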
From ee98a8dd96790ce5cdb54017bc723bb3a7194912 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Thu, 3 Nov 2016 02:22:45 +0530
Subject: [PATCH 07/13] fixing line lengths

---
 sklearn/calibration.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b491afcf7a652..ab63ca2cc5ebe 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -51,7 +51,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
         The method to use for calibration. Can be 'sigmoid' which
         corresponds to Platt's method or 'isotonic' which is a
         non-parametric approach. It is not advised to use isotonic calibration
-        with too few calibration samples ``(<<1000)`` since it tends to overfit.
+        with too few calibration samples ``(<<1000)`` since it tends to
+        overfit.
         Use sigmoids (Platt's calibration) in this case.
 
     cv : integer, cross-validation generator, iterable or "prefit", optional
@@ -64,8 +65,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
         - An iterable yielding train/test splits.
 
         For integer/None inputs, if ``y`` is binary or multiclass,
-        :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y``
-        is neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
+        :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
+        neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
         is used.

From 95ba6ea40b77bfde73c8b3c2095271c5ff7391cb Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Thu, 3 Nov 2016 02:38:55 +0530
Subject: [PATCH 08/13] removing np.full()

---
 sklearn/tests/test_calibration.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index ff64eef8a4fd5..4dab698a412ca 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -303,5 +303,4 @@ def test_calibration_prob_sum():
     clf_prob.fit(X, y)
     probs = clf_prob.predict_proba(X)
     n_classes = len(y)
-    assert_array_almost_equal(probs, np.full((X.shape[0], n_classes),
-                                             1/n_classes))
+    assert_array_almost_equal(probs, 1/n_classes)

From 5ae793cc505df711ea3e4c437ce2236f74a426ba Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Thu, 3 Nov 2016 03:25:12 +0530
Subject: [PATCH 09/13] from __future__ import division for py2.7

---
 sklearn/tests/test_calibration.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 4dab698a412ca..bc92c776592c2 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -1,6 +1,7 @@
 # Authors: Alexandre Gramfort
 # License: BSD 3 clause
 
+from __future__ import division
 import numpy as np
 from scipy import sparse
 from sklearn.model_selection import LeaveOneOut
@@ -304,3 +305,4 @@ def test_calibration_prob_sum():
     probs = clf_prob.predict_proba(X)
     n_classes = len(y)
     assert_array_almost_equal(probs, 1/n_classes)
+test_calibration_prob_sum()
\ No newline at end of file

From 1e50a6c55441ae07bcff91d3a13a5febc7be98d6 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Thu, 3 Nov 2016 03:26:42 +0530
Subject: [PATCH 10/13] change in test file

---
 sklearn/tests/test_calibration.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index bc92c776592c2..e90b0ca81c24d 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -305,4 +305,3 @@ def test_calibration_prob_sum():
     probs = clf_prob.predict_proba(X)
     n_classes = len(y)
     assert_array_almost_equal(probs, 1/n_classes)
-test_calibration_prob_sum()
\ No newline at end of file

From 466e6a06bbdaf2ea9193ba8476cb48403b4300a7 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Sun, 6 Nov 2016 14:16:31 +0530
Subject: [PATCH 11/13] added an extra test and removed a test with Ridge

---
 sklearn/calibration.py            |  7 ++-----
 sklearn/tests/test_calibration.py | 21 +++++++++------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index ab63ca2cc5ebe..1bbec9bac912f 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -295,11 +295,8 @@ def _preproc(self, X):
             raise RuntimeError('classifier has no decision_function or '
                                'predict_proba method.')
 
-        if hasattr(self.base_estimator, "classes_"):
-            idx_pos_class = self.label_encoder_.\
-                transform(self.base_estimator.classes_)
-        else:
-            idx_pos_class = np.arange(df.shape[1])
+        idx_pos_class = self.label_encoder_.\
+            transform(self.base_estimator.classes_)
 
         return df, idx_pos_class
 
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index e90b0ca81c24d..13b131a62b4aa 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -16,7 +16,6 @@
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.svm import LinearSVC
-from sklearn.linear_model import Ridge
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import Imputer
 from sklearn.metrics import brier_score_loss, log_loss
@@ -89,12 +88,6 @@ def test_calibration():
                    brier_score_loss((y_test + 1) % 2,
                                     prob_pos_pc_clf_relabeled))
 
-    # check that calibration can also deal with regressors that have
-    # a decision_function
-    clf_base_regressor = CalibratedClassifierCV(Ridge())
-    clf_base_regressor.fit(X_train, y_train)
-    clf_base_regressor.predict(X_test)
-
     # Check failure cases:
     # only "isotonic" and "sigmoid" should be accepted as methods
     clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
@@ -286,15 +279,19 @@ def test_calibration_prob_sum():
     probs = clf_prob.predict_proba(X)
     assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
 
+
+def test_calibration_less_classes():
     # Test to check calibration works fine when train set in a test-train
     # split does not contain all classes
     # Since this test uses LOO, at each iteration train set will not contain a
     # class label
     X = np.random.randn(10, 5)
     y = np.arange(10)
     clf = LinearSVC(C=1.0)
-    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
-    clf_prob.fit(X, y)
-    probs = clf_prob.predict_proba(X)
-    n_classes = len(y)
-    assert_array_almost_equal(probs, 1/n_classes)
+    cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
+    cal_clf.fit(X, y)
+
+    for i, calibrated_classifier in \
+            enumerate(cal_clf.calibrated_classifiers_):
+        assert_array_equal(calibrated_classifier.predict_proba(X)[:, i],
+                           np.zeros(len(y)))

From 12be4ff14e82f97d38791eb9cfb84c1986169a79 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Sun, 6 Nov 2016 23:03:03 +0530
Subject: [PATCH 12/13] stronger test

---
 sklearn/calibration.py            | 2 +-
 sklearn/tests/test_calibration.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 1bbec9bac912f..b96799f73d13d 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -257,7 +257,7 @@ class _CalibratedClassifier(object):
         corresponds to Platt's method or 'isotonic' which is a
         non-parametric approach based on isotonic regression.
 
-    classes : array-like, shape (n_classes,)
+    classes : array-like, shape (n_classes,), optional
         Contains unique classes used to fit the base estimator.
         if None, then classes is extracted from the given target values
         in fit().
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 13b131a62b4aa..e4499e35d5a67 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -300,5 +300,7 @@ def test_calibration_less_classes():
 
     for i, calibrated_classifier in \
             enumerate(cal_clf.calibrated_classifiers_):
-        assert_array_equal(calibrated_classifier.predict_proba(X)[:, i],
-                           np.zeros(len(y)))
+        proba = calibrated_classifier.predict_proba(X)
+        assert_array_equal(proba[:, i], np.zeros(len(y)))
+        assert_equal(np.all(np.hstack([proba[:, :i],
+                                       proba[:, i + 1:]])), True)

From 6d9b675825fa733c177dfed52226b1c7f3a22180 Mon Sep 17 00:00:00 2001
From: srivatsan-ramesh
Date: Sun, 6 Nov 2016 23:27:23 +0530
Subject: [PATCH 13/13] whats new entry

---
 doc/whats_new.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 1911cc5cbde57..d676312e240de 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -138,6 +138,12 @@ Bug fixes
      ``partial_fit`` was less than the total number of classes in the
      data. :issue:`7786` by `Srivatsan Ramesh`_
 
+   - Fixes issue in :class:`calibration.CalibratedClassifierCV` where
+     the sum of probabilities of each class for a data was not 1, and
+     ``CalibratedClassifierCV`` now handles the case where the training set
+     has less number of classes than the total data. :issue:`7799` by
+     `Srivatsan Ramesh`_
+
 API changes summary
 -------------------
 
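To close, a short usage sketch of the behaviour the series fixes, adapted from the tests added above. It assumes a scikit-learn version that includes this fix; the variable names are illustrative.

    import numpy as np
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.model_selection import LeaveOneOut
    from sklearn.svm import LinearSVC

    # Ten samples, ten distinct classes: every leave-one-out training fold
    # is missing exactly one class, the case that used to break the
    # per-fold calibrators.
    X = np.random.randn(10, 5)
    y = np.arange(10)

    cal_clf = CalibratedClassifierCV(LinearSVC(C=1.0), method="sigmoid",
                                     cv=LeaveOneOut())
    cal_clf.fit(X, y)

    probs = cal_clf.predict_proba(X)
    # Each row is now a proper probability distribution over all ten classes.
    assert np.allclose(probs.sum(axis=1), 1.0)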