From b2ac13c88c0386e05bc65c3b6f1e5781c44cefe1 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 29 Aug 2017 02:08:26 -0400 Subject: [PATCH 01/51] initial commit --- sklearn/utils/tests/test_estimator_checks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 1b3a1ea7e597a..7f88e9e147834 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -251,3 +251,8 @@ def __init__(self): check_no_fit_attributes_set_in_init, 'estimator_name', NonConformantEstimator) + +def test_check_estimator_pairwise(): + # check that check_estimator() works on estimator with _pairwise + # attribute set_random_state + pass From 124622b6c92ee3c434367f0e921e0cf24efdbd0d Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 29 Aug 2017 12:59:09 -0400 Subject: [PATCH 02/51] add test for check_estimator on SVC(kernel='precomputed') --- sklearn/utils/tests/test_estimator_checks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 7f88e9e147834..82afcac04c2e9 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -18,6 +18,7 @@ from sklearn.cluster import MiniBatchKMeans from sklearn.decomposition import NMF from sklearn.linear_model import MultiTaskElasticNet +from sklearn.svm import SVC from sklearn.utils.validation import check_X_y, check_array @@ -254,5 +255,6 @@ def __init__(self): def test_check_estimator_pairwise(): # check that check_estimator() works on estimator with _pairwise - # attribute set_random_state - pass + # attribute set + est = SVC(kernel='precomputed') + check_estimator(est) From 578865e061e8669f1e8e5077db17a735b2f58378 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 6 Sep 2017 18:17:54 -0400 Subject: [PATCH 03/51] change tests to run on 
estimators with _pairwise set to True --- sklearn/base.py | 18 ++++++++ sklearn/utils/estimator_checks.py | 77 ++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index aa4f9f9ce17c1..79e78761657a2 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -578,3 +578,21 @@ def is_regressor(estimator): True if estimator is a regressor and False otherwise. """ return getattr(estimator, "_estimator_type", None) == "regressor" + + +def is_pairwise(estimator): + """Returns True if the given estimator has a _pairwise attribute + set to True. + + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set and True and False otherwise. + """ + return getattr(estimator, "_pairwise", False) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3e7cb198a9d12..63c9536ac0e45 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -36,11 +36,13 @@ from sklearn.base import (clone, TransformerMixin, ClusterMixin, - BaseEstimator, is_classifier, is_regressor) + BaseEstimator, is_classifier, is_regressor, + is_pairwise) from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest +from sklearn.svm import SVC from sklearn.svm.base import BaseLibSVM from sklearn.linear_model.stochastic_gradient import BaseSGD from sklearn.pipeline import make_pipeline @@ -48,6 +50,8 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import SkipTestWarning from sklearn.model_selection import train_test_split +from sklearn.metrics.pairwise import (rbf_kernel, cosine_similarity, + linear_kernel) from sklearn.utils import shuffle from sklearn.utils.fixes import signature @@ -353,10 +357,22 @@ def _is_32bit(): return struct.calcsize('P') * 8 
== 32 +def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): + if is_pairwise(estimator): + return kernel(X, X) + return X + + def check_estimator_sparse_data(name, estimator_orig): + + # Sparse precomputed kernels aren't supported + if getattr(estimator_orig, 'kernel', None) == 'precomputed': + return + rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 + X = gram_matrix_if_pairwise(X, estimator_orig) X_csr = sparse.csr_matrix(X) y = (4 * rng.rand(40)).astype(np.int) # catch deprecation warnings @@ -404,6 +420,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): try: import pandas as pd X = pd.DataFrame([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]]) + X = gram_matrix_if_pairwise(X, estimator_orig) y = pd.Series([1, 1, 1, 2, 2, 2]) weights = pd.Series([1] * 6) try: @@ -424,7 +441,7 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = rnd.uniform(size=(10, 3)) + X = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(estimator, y) sample_weight = [3] * 10 @@ -436,7 +453,8 @@ def check_sample_weights_list(name, estimator_orig): def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible rng = np.random.RandomState(0) - X = rng.rand(40, 10).astype(object) + X = gram_matrix_if_pairwise(rng.rand(40, 10), estimator_orig) + X = X.astype(object) y = (X[:, 0] * 4).astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -483,6 +501,8 @@ def check_dict_unchanged(name, estimator_orig): else: X = 2 * rnd.uniform(size=(20, 3)) + X = gram_matrix_if_pairwise(X, estimator_orig) + y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -520,6 +540,7 @@ def 
check_dont_overwrite_parameters(name, estimator_orig): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) + X = gram_matrix_if_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -566,6 +587,7 @@ def check_fit2d_predict1d(name, estimator_orig): # check by fitting a 2d array and predicting with a 1d array rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) + X = gram_matrix_if_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -798,6 +820,7 @@ def check_pipeline_consistency(name, estimator_orig): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() + X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) set_random_state(estimator) @@ -822,6 +845,7 @@ def check_fit_score_takes_y(name, estimator_orig): # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 3)) + X = gram_matrix_if_pairwise(X, estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -847,6 +871,7 @@ def check_fit_score_takes_y(name, estimator_orig): def check_estimators_dtypes(name, estimator_orig): rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) + X_train_32 = gram_matrix_if_pairwise(X_train_32, estimator_orig) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) @@ -892,7 +917,8 @@ def check_estimators_empty_data_messages(name, estimator_orig): def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. 
rnd = np.random.RandomState(0) - X_train_finite = rnd.uniform(size=(10, 3)) + X_train_finite = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), + estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan X_train_inf = rnd.uniform(size=(10, 3)) @@ -969,6 +995,7 @@ def check_estimators_pickle(name, estimator_orig): # some estimators can't do features less than 0 X -= X.min() + X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) @@ -1119,6 +1146,7 @@ def check_classifiers_train(name, classifier_orig): classifier = clone(classifier_orig) if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB']: X -= X.min() + X = gram_matrix_if_pairwise(X, classifier_orig) set_random_state(classifier) # raises error on malformed input for fit with assert_raises(ValueError, msg="The classifer {} does not" @@ -1140,11 +1168,12 @@ def check_classifiers_train(name, classifier_orig): assert_greater(accuracy_score(y, y_pred), 0.83) # raises error on malformed input for predict - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of features " - "in predict is different from the number of" - " features in fit.".format(name)): - classifier.predict(X.T) + if not is_pairwise(classifier): + with assert_raises(ValueError, msg="The classifier {} does not" + " raise an error when the number of features " + "in predict is different from the number of" + " features in fit.".format(name)): + classifier.predict(X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict @@ -1160,12 +1189,13 @@ def check_classifiers_train(name, classifier_orig): assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function - with assert_raises(ValueError, msg="The classifier {} does" - " not raise an error when the number of " - "features in decision_function is " - "different from the number of features" - " in 
fit.".format(name)): - classifier.decision_function(X.T) + if not is_pairwise(classifier): + with assert_raises(ValueError, msg="The classifier {} does" + " not raise an error when the number of " + "features in decision_function is " + "different from the number of features" + " in fit.".format(name)): + classifier.decision_function(X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): @@ -1194,6 +1224,7 @@ def check_estimators_fit_returns_self(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9, n_features=4) # some want non-negative input X -= X.min() + X = gram_matrix_if_pairwise(X, estimator_orig) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1241,7 +1272,7 @@ def check_supervised_y_2d(name, estimator_orig): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) - X = rnd.uniform(size=(10, 3)) + X = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) set_random_state(estimator) @@ -1275,6 +1306,7 @@ def check_classifiers_classes(name, classifier_orig): # We need to make sure that we have non negative data, for things # like NMF X -= X.min() - .1 + X = gram_matrix_if_pairwise(X, classifier_orig) y_names = np.array(["one", "two", "three"])[y] for y_names in [y_names, y_names.astype('O')]: @@ -1408,6 +1440,11 @@ def check_class_weight_classifiers(name, classifier_orig): X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) + + X_train = gram_matrix_if_pairwise(X_train, classifier_orig, + kernel=rbf_kernel) + X_test = gram_matrix_if_pairwise(X_test, classifier_orig, + kernel=rbf_kernel) n_centers = len(np.unique(y_train)) if n_centers == 2: @@ -1491,6 +1528,7 @@ def check_estimators_overwrite_params(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9) # 
some want non-negative input X -= X.min() + X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1565,6 +1603,7 @@ def check_sparsify_coefficients(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_classifier_data_not_an_array(name, estimator_orig): X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) + X = gram_matrix_if_pairwise(X, estimator_orig) y = [1, 1, 1, 2, 2, 2] y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -1573,6 +1612,7 @@ def check_classifier_data_not_an_array(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_regressor_data_not_an_array(name, estimator_orig): X, y = _boston_subset(n_samples=50) + X = gram_matrix_if_pairwise(X, estimator_orig) y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -1795,3 +1835,8 @@ def check_decision_proba_consistency(name, estimator_orig): a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) assert_array_equal(rankdata(a), rankdata(b)) + + +def check_pairwise_estimator(): + est = SVC(kernel='precomputed') + check_estimator(est) From d6f3c27cc0e67ee426bed74718fcb53c2f3a5c23 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 6 Sep 2017 23:12:08 -0400 Subject: [PATCH 04/51] fix typo in is_pairwise docstring --- sklearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 79e78761657a2..6df41978ce134 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -593,6 +593,6 @@ def is_pairwise(estimator): Returns ------- out : bool - True if _pairwise is set and True and False otherwise. + True if _pairwise is set to True and False otherwise. 
""" return getattr(estimator, "_pairwise", False) From d9fff0ad71eaa64bb875f7ffea2eb1ecdea639d1 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 7 Sep 2017 02:39:54 -0400 Subject: [PATCH 05/51] fix PEP8 issues: line length and unused import --- sklearn/utils/estimator_checks.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 63c9536ac0e45..50a41f7e49bb6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -50,8 +50,7 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import SkipTestWarning from sklearn.model_selection import train_test_split -from sklearn.metrics.pairwise import (rbf_kernel, cosine_similarity, - linear_kernel) +from sklearn.metrics.pairwise import rbf_kernel, linear_kernel from sklearn.utils import shuffle from sklearn.utils.fixes import signature @@ -1191,8 +1190,8 @@ def check_classifiers_train(name, classifier_orig): # raises error on malformed input for decision_function if not is_pairwise(classifier): with assert_raises(ValueError, msg="The classifier {} does" - " not raise an error when the number of " - "features in decision_function is " + " not raise an error when the number " + "of features in decision_function is " "different from the number of features" " in fit.".format(name)): classifier.decision_function(X.T) From e89b9e4e141202041eac60229d282de5216ce80a Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 14 Sep 2017 13:29:19 -0400 Subject: [PATCH 06/51] use is_pairwise() to check for precomputed kernel --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 50a41f7e49bb6..e23b5d434f95b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -365,7 +365,7 @@ def gram_matrix_if_pairwise(X, estimator, 
kernel=linear_kernel): def check_estimator_sparse_data(name, estimator_orig): # Sparse precomputed kernels aren't supported - if getattr(estimator_orig, 'kernel', None) == 'precomputed': + if is_pairwise(estimator_orig): return rng = np.random.RandomState(0) From c9c6a491ec671a3274a5bc8c51b61a7bc4dd87f5 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 02:07:02 -0400 Subject: [PATCH 07/51] fix precomputed test/train matricies for check_class_weight_classifiers --- sklearn/utils/estimator_checks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e23b5d434f95b..55abc859f774c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1440,10 +1440,11 @@ def check_class_weight_classifiers(name, classifier_orig): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) - X_train = gram_matrix_if_pairwise(X_train, classifier_orig, - kernel=rbf_kernel) - X_test = gram_matrix_if_pairwise(X_test, classifier_orig, - kernel=rbf_kernel) + # can't use gram_if_pairwise() here, setting up gram matrix manually + if is_pairwise(classifier_orig): + X_test = rbf_kernel(X_test, X_train) + X_train = rbf_kernel(X_train, X_train) + n_centers = len(np.unique(y_train)) if n_centers == 2: From 7894231472d8da91a07c37c864c0870accb1c61c Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 02:21:13 -0400 Subject: [PATCH 08/51] fix PEP8 issues --- sklearn/base.py | 8 +++----- sklearn/utils/estimator_checks.py | 4 ---- sklearn/utils/tests/test_estimator_checks.py | 1 + 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 98b9426a2a06d..080190693fa13 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -551,7 +551,6 @@ def is_classifier(estimator): def is_regressor(estimator): """Returns True if the given estimator is (probably) a regressor. 
- Parameters ---------- estimator : object @@ -566,9 +565,7 @@ def is_regressor(estimator): def is_pairwise(estimator): - """Returns True if the given estimator has a _pairwise attribute - set to True. - + """Returns True if estimator has a _pairwise attribute set to True. Parameters ---------- @@ -580,4 +577,5 @@ def is_pairwise(estimator): out : bool True if _pairwise is set to True and False otherwise. """ - return getattr(estimator, "_pairwise", False) + return bool(getattr(estimator, "_pairwise", False)) + diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 55abc859f774c..8d9fa76ceae29 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1836,7 +1836,3 @@ def check_decision_proba_consistency(name, estimator_orig): b = estimator.decision_function(X_test) assert_array_equal(rankdata(a), rankdata(b)) - -def check_pairwise_estimator(): - est = SVC(kernel='precomputed') - check_estimator(est) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 82afcac04c2e9..50b4ebd514a30 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -253,6 +253,7 @@ def __init__(self): 'estimator_name', NonConformantEstimator) + def test_check_estimator_pairwise(): # check that check_estimator() works on estimator with _pairwise # attribute set From d3fcb3e01551621000a40aa89f311d117f90d47a Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 03:06:31 -0400 Subject: [PATCH 09/51] add final empty line --- sklearn/utils/tests/test_estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 50b4ebd514a30..df98631c6be48 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -259,3 +259,4 @@ def test_check_estimator_pairwise(): # 
attribute set est = SVC(kernel='precomputed') check_estimator(est) + From ffeb68e95c53c2ec82ef8289f0d95195f30d4c8c Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 12:39:58 -0400 Subject: [PATCH 10/51] ensure check_sample_weights_pandas_series actually operates on pandas dataframes with pairwise kernel --- sklearn/utils/estimator_checks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8d9fa76ceae29..12a0fd68daa56 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -419,7 +419,10 @@ def check_sample_weights_pandas_series(name, estimator_orig): try: import pandas as pd X = pd.DataFrame([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]]) - X = gram_matrix_if_pairwise(X, estimator_orig) + # if _pairwise, feed estimator a pandas dataframe of the gram + # matrix + if is_pairwise(estimator_orig): + X = pd.DataFrame(rbf_kernel(X.values, X.values)) y = pd.Series([1, 1, 1, 2, 2, 2]) weights = pd.Series([1] * 6) try: @@ -1835,4 +1838,3 @@ def check_decision_proba_consistency(name, estimator_orig): a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) assert_array_equal(rankdata(a), rankdata(b)) - From 298fa8498d05727c0dfdcd79043c8421530762e1 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 12:52:16 -0400 Subject: [PATCH 11/51] remove blank lines as end of file, flake8 --- sklearn/base.py | 1 - sklearn/utils/tests/test_estimator_checks.py | 1 - 2 files changed, 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 080190693fa13..135016613f138 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -578,4 +578,3 @@ def is_pairwise(estimator): True if _pairwise is set to True and False otherwise. 
""" return bool(getattr(estimator, "_pairwise", False)) - diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index df98631c6be48..50b4ebd514a30 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -259,4 +259,3 @@ def test_check_estimator_pairwise(): # attribute set est = SVC(kernel='precomputed') check_estimator(est) - From a6319511d4dd7c041c833064f500d3c5059488cf Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 18 Sep 2017 12:55:33 -0400 Subject: [PATCH 12/51] remove unused import --- sklearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 12a0fd68daa56..589438f68e39c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -42,7 +42,6 @@ from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest -from sklearn.svm import SVC from sklearn.svm.base import BaseLibSVM from sklearn.linear_model.stochastic_gradient import BaseSGD from sklearn.pipeline import make_pipeline From b58e6bf4017ba67bf24a6a1fa5bb3066c2b57401 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 19 Sep 2017 16:38:10 -0400 Subject: [PATCH 13/51] add estimator check for estimators that are based on a metric as well as a kernel --- sklearn/utils/tests/test_estimator_checks.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 50b4ebd514a30..f5d8e68486414 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -19,6 +19,7 @@ from sklearn.decomposition import NMF from sklearn.linear_model import MultiTaskElasticNet from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsRegressor from sklearn.utils.validation import check_X_y, check_array 
@@ -259,3 +260,10 @@ def test_check_estimator_pairwise(): # attribute set est = SVC(kernel='precomputed') check_estimator(est) + +def test_check_estimator_metric_and_kernel(): + # check that check_estimator works for estimator that is based on + # a metric as well as a kernel + + est = KNeighborsRegressor() + check_estimator(est) From 273d8ee97ee49dd3cf3f4d205cc6118f16f60828 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 19 Sep 2017 21:10:11 -0400 Subject: [PATCH 14/51] add extra line, PEP8 --- sklearn/utils/tests/test_estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index f5d8e68486414..b15f893cebd85 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -261,6 +261,7 @@ def test_check_estimator_pairwise(): est = SVC(kernel='precomputed') check_estimator(est) + def test_check_estimator_metric_and_kernel(): # check that check_estimator works for estimator that is based on # a metric as well as a kernel From 68bacdb37d279c5be289def718f04075993977e3 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 20 Sep 2017 12:43:47 -0400 Subject: [PATCH 15/51] add check to ensure test_check_estimator_pairwise actually checks a pairwise estimator --- sklearn/utils/tests/test_estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index b15f893cebd85..b5c55eaa09600 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -8,6 +8,7 @@ from sklearn.utils.testing import (assert_raises_regex, assert_true, assert_equal, ignore_warnings) from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.estimator_checks import is_pairwise from sklearn.utils.estimator_checks import set_random_state from sklearn.utils.estimator_checks import 
set_checking_parameters from sklearn.utils.estimator_checks import check_estimators_unfitted @@ -259,6 +260,7 @@ def test_check_estimator_pairwise(): # check that check_estimator() works on estimator with _pairwise # attribute set est = SVC(kernel='precomputed') + assert(is_pairwise(est)) check_estimator(est) From 142eab4e1fdb98308cad42f1d597af0b20ed2578 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 01:54:41 -0400 Subject: [PATCH 16/51] alter gram_matrix_if_pairwise to account for pairwise metrics --- sklearn/base.py | 20 ++++++++++++++++++++ sklearn/utils/estimator_checks.py | 12 ++++++++++-- sklearn/utils/tests/test_estimator_checks.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 135016613f138..fd166d9874efd 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -578,3 +578,23 @@ def is_pairwise(estimator): True if _pairwise is set to True and False otherwise. """ return bool(getattr(estimator, "_pairwise", False)) + + +def is_pairwise_metric(estimator): + """Returns True if estimator has a _pairwise attribute set to True. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set to True and False otherwise. 
+ """ + metric = getattr(estimator, "metric", None) + precomputed_metric = metric == 'precomputed' + pairwise = is_pairwise(estimator) + + return precomputed_metric and pairwise diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3179f8658d29c..3f08b3a72639e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -37,9 +37,10 @@ from sklearn.base import (clone, TransformerMixin, ClusterMixin, BaseEstimator, is_classifier, is_regressor, - is_pairwise) + is_pairwise, is_pairwise_metric) from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score +from sklearn.covariance import LedoitWolf from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest from sklearn.svm.base import BaseLibSVM @@ -50,6 +51,7 @@ from sklearn.exceptions import SkipTestWarning from sklearn.model_selection import train_test_split from sklearn.metrics.pairwise import rbf_kernel, linear_kernel +from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils import shuffle from sklearn.utils.fixes import signature @@ -358,8 +360,13 @@ def _is_32bit(): def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): + + if is_pairwise_metric(estimator): + return pairwise_distances(X, metric='mahalanobis') + if is_pairwise(estimator): return kernel(X, X) + return X @@ -1332,7 +1339,7 @@ def check_classifiers_classes(name, classifier_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_int(name, regressor_orig): X, _ = _boston_subset() - X = X[:50] + X = gram_matrix_if_pairwise(X[:50], regressor_orig) rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) y = multioutput_estimator_convert_y_2d(regressor_orig, y) @@ -1360,6 +1367,7 @@ def check_regressors_int(name, regressor_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_train(name, regressor_orig): X, y = 
_boston_subset() + X = gram_matrix_if_pairwise(X, regressor_orig) y = StandardScaler().fit_transform(y.reshape(-1, 1)) # X is already scaled y = y.ravel() regressor = clone(regressor_orig) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index b5c55eaa09600..005f876462a0d 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -264,9 +264,9 @@ def test_check_estimator_pairwise(): check_estimator(est) -def test_check_estimator_metric_and_kernel(): +def test_check_estimator_pairwise_metric(): # check that check_estimator works for estimator that is based on # a metric as well as a kernel - est = KNeighborsRegressor() + est = KNeighborsRegressor(metric='precomputed') check_estimator(est) From c06e404d47525c6a34aa8d9c9dcbaf3a30e86625 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 02:33:41 -0400 Subject: [PATCH 17/51] make test for 2d y features work --- sklearn/utils/estimator_checks.py | 1 + sklearn/utils/tests/test_estimator_checks.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3f08b3a72639e..d51e5708b34ed 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -651,6 +651,7 @@ def check_fit2d_1feature(name, estimator_orig): # informative message rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) + X = gram_matrix_if_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 005f876462a0d..457d0b4dc6932 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -8,7 +8,7 @@ from sklearn.utils.testing import (assert_raises_regex, assert_true, 
assert_equal, ignore_warnings) from sklearn.utils.estimator_checks import check_estimator -from sklearn.utils.estimator_checks import is_pairwise +from sklearn.utils.estimator_checks import is_pairwise, is_pairwise_metric from sklearn.utils.estimator_checks import set_random_state from sklearn.utils.estimator_checks import set_checking_parameters from sklearn.utils.estimator_checks import check_estimators_unfitted @@ -259,6 +259,7 @@ def __init__(self): def test_check_estimator_pairwise(): # check that check_estimator() works on estimator with _pairwise # attribute set + est = SVC(kernel='precomputed') assert(is_pairwise(est)) check_estimator(est) @@ -269,4 +270,5 @@ def test_check_estimator_pairwise_metric(): # a metric as well as a kernel est = KNeighborsRegressor(metric='precomputed') + assert(is_pairwise_metric(est)) check_estimator(est) From 7c7f3c400b243f47aa45ae496a6e3fb0cbbf5ec8 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 02:36:47 -0400 Subject: [PATCH 18/51] refactor is_pairwise_metric() --- sklearn/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index fd166d9874efd..815c1471149e4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -594,7 +594,6 @@ def is_pairwise_metric(estimator): True if _pairwise is set to True and False otherwise. 
""" metric = getattr(estimator, "metric", None) - precomputed_metric = metric == 'precomputed' - pairwise = is_pairwise(estimator) - return precomputed_metric and pairwise + return metric == 'precomputed' and is_pairwise(estimator) + From 7d53c90233acda31c65aaf3eff688bf85e5de1da Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 02:37:51 -0400 Subject: [PATCH 19/51] remove extra line --- sklearn/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 815c1471149e4..cec252508b622 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -596,4 +596,3 @@ def is_pairwise_metric(estimator): metric = getattr(estimator, "metric", None) return metric == 'precomputed' and is_pairwise(estimator) - From 9fd424e61a3153b453fd22324f26d91ad2c00e28 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 02:53:50 -0400 Subject: [PATCH 20/51] fix grammar in docstring --- sklearn/utils/tests/test_estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 457d0b4dc6932..393a09c438e23 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -267,7 +267,7 @@ def test_check_estimator_pairwise(): def test_check_estimator_pairwise_metric(): # check that check_estimator works for estimator that is based on - # a metric as well as a kernel + # a precomputed metric est = KNeighborsRegressor(metric='precomputed') assert(is_pairwise_metric(est)) From e10fafd5f2707aa7fdf363c34bccf6353829729b Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 28 Sep 2017 16:51:46 -0400 Subject: [PATCH 21/51] fix gram_matrix_if_pairwise to accept flat 1-D X vector in python 3 --- sklearn/utils/estimator_checks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d51e5708b34ed..f9262c0402050 100644 --- 
a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -361,6 +361,9 @@ def _is_32bit(): def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): + if len(X.shape) == 1: + X = X.reshape(-1, 1) + if is_pairwise_metric(estimator): return pairwise_distances(X, metric='mahalanobis') From 85bed8ac7c27d2c865c17e4d663e301d6176ec97 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Sun, 1 Oct 2017 23:26:15 -0400 Subject: [PATCH 22/51] remove extra spaces --- sklearn/base.py | 2 +- sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index cec252508b622..d7d800d945b50 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -595,4 +595,4 @@ def is_pairwise_metric(estimator): """ metric = getattr(estimator, "metric", None) - return metric == 'precomputed' and is_pairwise(estimator) + return metric == 'precomputed' and is_pairwise(estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f9262c0402050..ae49a91ba1adf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -361,7 +361,7 @@ def _is_32bit(): def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): - if len(X.shape) == 1: + if len(X.shape) == 1: X = X.reshape(-1, 1) if is_pairwise_metric(estimator): From 3b9dd25c862a96ee548df1d290cb87827497f7b9 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 6 Oct 2017 14:35:02 -0400 Subject: [PATCH 23/51] remove unused import in utils/estimator_checks.py --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ae49a91ba1adf..320c93940eac6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -40,7 +40,6 @@ is_pairwise, is_pairwise_metric) from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score -from 
sklearn.covariance import LedoitWolf from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest from sklearn.svm.base import BaseLibSVM @@ -654,6 +653,7 @@ def check_fit2d_1feature(name, estimator_orig): # informative message rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) + print(name, X) X = gram_matrix_if_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) From 1e9886ee91558217ea26c94078adff01386e1f7b Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 6 Oct 2017 17:29:17 -0400 Subject: [PATCH 24/51] manually create distance matrix for gram_matrix_if_pairwise() --- sklearn/utils/estimator_checks.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 320c93940eac6..b5284ca262d61 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -364,7 +364,15 @@ def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): X = X.reshape(-1, 1) if is_pairwise_metric(estimator): - return pairwise_distances(X, metric='mahalanobis') + # pairwise_distance() fails for certain versions of scipy + n_obs = X.shape[0] + X_std = (X - X.mean(axis=0)) / X.std(axis=0) + X_out = np.zeros(shape=(n_obs, n_obs)) + for i in range(n_obs): + for j in range(n_obs): + dist = np.sum((X_std[i] - X_std[j]) ** 2) ** .5 + X_out[i,j] = dist + return X_out if is_pairwise(estimator): return kernel(X, X) @@ -653,7 +661,6 @@ def check_fit2d_1feature(name, estimator_orig): # informative message rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) - print(name, X) X = gram_matrix_if_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) From 78654a11d7f2337bd465ad4404b344aa0d59eadb Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 6 Oct 2017 17:49:00 -0400 Subject: [PATCH 25/51] PEP8, add a space --- 
sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b5284ca262d61..99b572d59829d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -371,7 +371,7 @@ def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): for i in range(n_obs): for j in range(n_obs): dist = np.sum((X_std[i] - X_std[j]) ** 2) ** .5 - X_out[i,j] = dist + X_out[i, j] = dist return X_out if is_pairwise(estimator): From 3777255be1d933795d804213bc8591558917a78d Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 6 Oct 2017 18:18:06 -0400 Subject: [PATCH 26/51] remove unused import --- sklearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 99b572d59829d..50c67872aa888 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -50,7 +50,6 @@ from sklearn.exceptions import SkipTestWarning from sklearn.model_selection import train_test_split from sklearn.metrics.pairwise import rbf_kernel, linear_kernel -from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils import shuffle from sklearn.utils.fixes import signature From 9b92c7b2acdf342674cad64928bce019dcd7744a Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 17 Oct 2017 21:10:26 -0400 Subject: [PATCH 27/51] make check_classifiers_train() check shape for pairwise estimators, fix doctring for is_pairwise_metric() --- sklearn/base.py | 2 +- sklearn/utils/estimator_checks.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index d7d800d945b50..193c688c15083 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -581,7 +581,7 @@ def is_pairwise(estimator): def is_pairwise_metric(estimator): - """Returns True if estimator has a _pairwise attribute set to True. 
+ """Returns True if estimator accepts pairwise metric. Parameters ---------- diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 50c67872aa888..e25ed77c42968 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1181,7 +1181,13 @@ def check_classifiers_train(name, classifier_orig): assert_greater(accuracy_score(y, y_pred), 0.83) # raises error on malformed input for predict - if not is_pairwise(classifier): + if is_pairwise(classifier): + with assert_raises(ValueError, msg="The classifier {} does not" + " raise an error when the number of features " + "in predict is not equal to (n_test_samples," + "n_training_samples)".format(name)): + classifier.predict(X.reshape(-1, 1)) + else: with assert_raises(ValueError, msg="The classifier {} does not" " raise an error when the number of features " "in predict is different from the number of" From 687204f1846236e1b3e5f8867379672bd374f7d4 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 17 Oct 2017 21:23:08 -0400 Subject: [PATCH 28/51] use pairwise_distance() to create distance matrix --- sklearn/utils/estimator_checks.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e25ed77c42968..62f68c902a82a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -49,7 +49,8 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import SkipTestWarning from sklearn.model_selection import train_test_split -from sklearn.metrics.pairwise import rbf_kernel, linear_kernel +from sklearn.metrics.pairwise import (rbf_kernel, linear_kernel, + pairwise_distances) from sklearn.utils import shuffle from sklearn.utils.fixes import signature @@ -363,16 +364,7 @@ def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): X = X.reshape(-1, 1) if is_pairwise_metric(estimator): - # pairwise_distance() fails 
for certain versions of scipy - n_obs = X.shape[0] - X_std = (X - X.mean(axis=0)) / X.std(axis=0) - X_out = np.zeros(shape=(n_obs, n_obs)) - for i in range(n_obs): - for j in range(n_obs): - dist = np.sum((X_std[i] - X_std[j]) ** 2) ** .5 - X_out[i, j] = dist - return X_out - + return pairwise_distances(X, metric='mahalanobis') if is_pairwise(estimator): return kernel(X, X) From cb095fc3da94043d578da44ce33464111331ee45 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 18 Oct 2017 18:39:08 -0400 Subject: [PATCH 29/51] rename gram_matrix_if_pairwise() to maybe_pairwise(). refactor check_sample_weights_pandas_series() and is_pairwise_metric() --- sklearn/base.py | 2 +- sklearn/utils/estimator_checks.py | 51 +++++++++++++++---------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 98302af60429a..2640550ee8963 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -594,4 +594,4 @@ def is_pairwise_metric(estimator): """ metric = getattr(estimator, "metric", None) - return metric == 'precomputed' and is_pairwise(estimator) + return bool(metric == 'precomputed') diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2958854b73e74..da030d3f4ea65 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -358,7 +358,7 @@ def _is_32bit(): return struct.calcsize('P') * 8 == 32 -def gram_matrix_if_pairwise(X, estimator, kernel=linear_kernel): +def maybe_pairwise(X, estimator, kernel=linear_kernel): if len(X.shape) == 1: X = X.reshape(-1, 1) @@ -380,7 +380,7 @@ def check_estimator_sparse_data(name, estimator_orig): rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) X_csr = sparse.csr_matrix(X) y = (4 * rng.rand(40)).astype(np.int) # catch deprecation warnings @@ -427,11 +427,8 @@ def check_sample_weights_pandas_series(name, estimator_orig): if 
has_fit_parameter(estimator, "sample_weight"): try: import pandas as pd - X = pd.DataFrame([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]]) - # if _pairwise, feed estimator a pandas dataframe of the gram - # matrix - if is_pairwise(estimator_orig): - X = pd.DataFrame(rbf_kernel(X.values, X.values)) + X = [[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]] + X = pd.DataFrame(maybe_pairwise(X, estimator_orig)) y = pd.Series([1, 1, 1, 2, 2, 2]) weights = pd.Series([1] * 6) try: @@ -452,7 +449,7 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) + X = maybe_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(estimator, y) sample_weight = [3] * 10 @@ -464,7 +461,7 @@ def check_sample_weights_list(name, estimator_orig): def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible rng = np.random.RandomState(0) - X = gram_matrix_if_pairwise(rng.rand(40, 10), estimator_orig) + X = maybe_pairwise(rng.rand(40, 10), estimator_orig) X = X.astype(object) y = (X[:, 0] * 4).astype(np.int) estimator = clone(estimator_orig) @@ -512,7 +509,7 @@ def check_dict_unchanged(name, estimator_orig): else: X = 2 * rnd.uniform(size=(20, 3)) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) @@ -551,7 +548,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -598,7 +595,7 @@ def check_fit2d_predict1d(name, 
estimator_orig): # check by fitting a 2d array and predicting with a 1d array rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -652,7 +649,7 @@ def check_fit2d_1feature(name, estimator_orig): # informative message rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -825,7 +822,7 @@ def check_pipeline_consistency(name, estimator_orig): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() - X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) set_random_state(estimator) @@ -850,7 +847,7 @@ def check_fit_score_takes_y(name, estimator_orig): # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 3)) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -876,7 +873,7 @@ def check_fit_score_takes_y(name, estimator_orig): def check_estimators_dtypes(name, estimator_orig): rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) - X_train_32 = gram_matrix_if_pairwise(X_train_32, estimator_orig) + X_train_32 = maybe_pairwise(X_train_32, estimator_orig) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) 
@@ -922,7 +919,7 @@ def check_estimators_empty_data_messages(name, estimator_orig): def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. rnd = np.random.RandomState(0) - X_train_finite = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), + X_train_finite = maybe_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan @@ -1000,7 +997,7 @@ def check_estimators_pickle(name, estimator_orig): # some estimators can't do features less than 0 X -= X.min() - X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) @@ -1170,7 +1167,7 @@ def check_classifiers_train(name, classifier_orig): classifier = clone(classifier_orig) if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB']: X -= X.min() - X = gram_matrix_if_pairwise(X, classifier_orig) + X = maybe_pairwise(X, classifier_orig) set_random_state(classifier) # raises error on malformed input for fit with assert_raises(ValueError, msg="The classifer {} does not" @@ -1254,7 +1251,7 @@ def check_estimators_fit_returns_self(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9, n_features=4) # some want non-negative input X -= X.min() - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1302,7 +1299,7 @@ def check_supervised_y_2d(name, estimator_orig): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) - X = gram_matrix_if_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) + X = maybe_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) set_random_state(estimator) @@ -1336,7 +1333,7 @@ def check_classifiers_classes(name, classifier_orig): # We need to make sure that we have 
non negative data, for things # like NMF X -= X.min() - .1 - X = gram_matrix_if_pairwise(X, classifier_orig) + X = maybe_pairwise(X, classifier_orig) y_names = np.array(["one", "two", "three"])[y] for y_names in [y_names, y_names.astype('O')]: @@ -1368,7 +1365,7 @@ def check_classifiers_classes(name, classifier_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_int(name, regressor_orig): X, _ = _boston_subset() - X = gram_matrix_if_pairwise(X[:50], regressor_orig) + X = maybe_pairwise(X[:50], regressor_orig) rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) y = multioutput_estimator_convert_y_2d(regressor_orig, y) @@ -1396,7 +1393,7 @@ def check_regressors_int(name, regressor_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_train(name, regressor_orig): X, y = _boston_subset() - X = gram_matrix_if_pairwise(X, regressor_orig) + X = maybe_pairwise(X, regressor_orig) y = StandardScaler().fit_transform(y.reshape(-1, 1)) # X is already scaled y = y.ravel() regressor = clone(regressor_orig) @@ -1562,7 +1559,7 @@ def check_estimators_overwrite_params(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9) # some want non-negative input X -= X.min() - X = gram_matrix_if_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1637,7 +1634,7 @@ def check_sparsify_coefficients(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_classifier_data_not_an_array(name, estimator_orig): X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = [1, 1, 1, 2, 2, 2] y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -1646,7 +1643,7 @@ def 
check_classifier_data_not_an_array(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_regressor_data_not_an_array(name, estimator_orig): X, y = _boston_subset(n_samples=50) - X = gram_matrix_if_pairwise(X, estimator_orig) + X = maybe_pairwise(X, estimator_orig) y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) From eef2accbfc33b148c39c92b97f884e7f1dd544cb Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 19 Oct 2017 00:45:01 -0400 Subject: [PATCH 30/51] cast X to numpy array in check_sample_weights_pandas_series() so maybe_pairwise() has access to shape property --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index da030d3f4ea65..7bbd39fe60d17 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -427,7 +427,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): if has_fit_parameter(estimator, "sample_weight"): try: import pandas as pd - X = [[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]] + X = np.array([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]]) X = pd.DataFrame(maybe_pairwise(X, estimator_orig)) y = pd.Series([1, 1, 1, 2, 2, 2]) weights = pd.Series([1] * 6) From 0bdb9366aaa95a5f8f6ecb15552ffa2f6ea48424 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 19 Oct 2017 01:07:30 -0400 Subject: [PATCH 31/51] PEP8 fix indentation --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7bbd39fe60d17..fdee87b20ee3c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -920,7 +920,7 @@ def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. 
rnd = np.random.RandomState(0) X_train_finite = maybe_pairwise(rnd.uniform(size=(10, 3)), - estimator_orig) + estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan X_train_inf = rnd.uniform(size=(10, 3)) From 84d86156b18e8a4c08109bd269ad9f644f25c9b2 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 20 Oct 2017 16:25:27 -0400 Subject: [PATCH 32/51] make is_pairwise helper functions private. Remove them from base. Consolidate pairwise tests into single test --- sklearn/base.py | 34 ------------- sklearn/utils/estimator_checks.py | 50 ++++++++++++++++---- sklearn/utils/tests/test_estimator_checks.py | 12 ++--- 3 files changed, 44 insertions(+), 52 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 2640550ee8963..4b7055086d7ba 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -561,37 +561,3 @@ def is_regressor(estimator): True if estimator is a regressor and False otherwise. """ return getattr(estimator, "_estimator_type", None) == "regressor" - - -def is_pairwise(estimator): - """Returns True if estimator has a _pairwise attribute set to True. - - Parameters - ---------- - estimator : object - Estimator object to test. - - Returns - ------- - out : bool - True if _pairwise is set to True and False otherwise. - """ - return bool(getattr(estimator, "_pairwise", False)) - - -def is_pairwise_metric(estimator): - """Returns True if estimator accepts pairwise metric. - - Parameters - ---------- - estimator : object - Estimator object to test. - - Returns - ------- - out : bool - True if _pairwise is set to True and False otherwise. 
- """ - metric = getattr(estimator, "metric", None) - - return bool(metric == 'precomputed') diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fdee87b20ee3c..d5dd89c5552ca 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -36,8 +36,8 @@ from sklearn.base import (clone, TransformerMixin, ClusterMixin, - BaseEstimator, is_classifier, is_regressor, - is_pairwise, is_pairwise_metric) + BaseEstimator, is_classifier, is_regressor) + from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score from sklearn.random_projection import BaseRandomProjection @@ -357,15 +357,48 @@ def _is_32bit(): """Detect if process is 32bit Python.""" return struct.calcsize('P') * 8 == 32 +def _is_pairwise(estimator): + """Returns True if estimator has a _pairwise attribute set to True. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set to True and False otherwise. + """ + return bool(getattr(estimator, "_pairwise", False)) + + +def _is_pairwise_metric(estimator): + """Returns True if estimator accepts pairwise metric. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set to True and False otherwise. 
+ """ + metric = getattr(estimator, "metric", None) + + return bool(metric == 'precomputed') def maybe_pairwise(X, estimator, kernel=linear_kernel): if len(X.shape) == 1: X = X.reshape(-1, 1) - if is_pairwise_metric(estimator): + if _is_pairwise_metric(estimator): + # workaround for this function return pairwise_distances(X, metric='mahalanobis') - if is_pairwise(estimator): + if _is_pairwise(estimator): return kernel(X, X) return X @@ -373,8 +406,7 @@ def maybe_pairwise(X, estimator, kernel=linear_kernel): def check_estimator_sparse_data(name, estimator_orig): - # Sparse precomputed kernels aren't supported - if is_pairwise(estimator_orig): + if _is_pairwise(estimator_orig): return rng = np.random.RandomState(0) @@ -1189,7 +1221,7 @@ def check_classifiers_train(name, classifier_orig): assert_greater(accuracy_score(y, y_pred), 0.83) # raises error on malformed input for predict - if is_pairwise(classifier): + if _is_pairwise(classifier): with assert_raises(ValueError, msg="The classifier {} does not" " raise an error when the number of features " "in predict is not equal to (n_test_samples," @@ -1216,7 +1248,7 @@ def check_classifiers_train(name, classifier_orig): assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function - if not is_pairwise(classifier): + if not _is_pairwise(classifier): with assert_raises(ValueError, msg="The classifier {} does" " not raise an error when the number " "of features in decision_function is " @@ -1472,7 +1504,7 @@ def check_class_weight_classifiers(name, classifier_orig): random_state=0) # can't use gram_if_pairwise() here, setting up gram matrix manually - if is_pairwise(classifier_orig): + if _is_pairwise(classifier_orig): X_test = rbf_kernel(X_test, X_train) X_train = rbf_kernel(X_train, X_train) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 393a09c438e23..2323f8a634eb2 100644 --- 
a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -8,7 +8,6 @@ from sklearn.utils.testing import (assert_raises_regex, assert_true, assert_equal, ignore_warnings) from sklearn.utils.estimator_checks import check_estimator -from sklearn.utils.estimator_checks import is_pairwise, is_pairwise_metric from sklearn.utils.estimator_checks import set_random_state from sklearn.utils.estimator_checks import set_checking_parameters from sklearn.utils.estimator_checks import check_estimators_unfitted @@ -258,17 +257,12 @@ def __init__(self): def test_check_estimator_pairwise(): # check that check_estimator() works on estimator with _pairwise - # attribute set + # kernel or metric + # test precomputed kernel est = SVC(kernel='precomputed') - assert(is_pairwise(est)) check_estimator(est) - -def test_check_estimator_pairwise_metric(): - # check that check_estimator works for estimator that is based on - # a precomputed metric - + # test precomputed metric est = KNeighborsRegressor(metric='precomputed') - assert(is_pairwise_metric(est)) check_estimator(est) From f7b76d9a4245ea19c1df32ed72a2fb529b76e7e7 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 23 Oct 2017 15:57:55 -0400 Subject: [PATCH 33/51] make check_estimator_sparse_data() acknowledge 'Sparse' as well as 'sparse' --- sklearn/neighbors/regression.py | 2 +- sklearn/utils/estimator_checks.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index bd2ffb9b82489..461e14b433956 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -139,7 +139,7 @@ def predict(self, X): y : array of int, shape = [n_samples] or [n_samples, n_outputs] Target values """ - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse=False) neigh_dist, neigh_ind = self.kneighbors(X) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py 
index d5dd89c5552ca..d118dec321c92 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -406,9 +406,6 @@ def maybe_pairwise(X, estimator, kernel=linear_kernel): def check_estimator_sparse_data(name, estimator_orig): - if _is_pairwise(estimator_orig): - return - rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 @@ -438,7 +435,7 @@ def check_estimator_sparse_data(name, estimator_orig): probs = estimator.predict_proba(X) assert_equal(probs.shape, (X.shape[0], 4)) except TypeError as e: - if 'sparse' not in repr(e): + if 'sparse' not in repr(e).lower(): print("Estimator %s doesn't seem to fail gracefully on " "sparse data: error message state explicitly that " "sparse input is not supported if this is not the case." From 42a508d27f8b19a3faa516c87e06f0b946b66469 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 24 Oct 2017 10:57:05 -0400 Subject: [PATCH 34/51] remove kneighbors tests for sparse matricies, not supported --- sklearn/neighbors/tests/test_neighbors.py | 43 ----------------------- 1 file changed, 43 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052c83c71d2e7..fc7faac599879 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -484,29 +484,6 @@ def test_RadiusNeighborsClassifier_multioutput(): assert_array_almost_equal(y_pred_mo, y_pred_so) -def test_kneighbors_classifier_sparse(n_samples=40, - n_features=5, - n_test_pts=10, - n_neighbors=5, - random_state=0): - # Test k-NN classifier on sparse matrices - # Like the above, but with various types of sparse matrices - rng = np.random.RandomState(random_state) - X = 2 * rng.rand(n_samples, n_features) - 1 - X *= X > .2 - y = ((X ** 2).sum(axis=1) < .5).astype(np.int) - - for sparsemat in SPARSE_TYPES: - knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, - algorithm='auto') - knn.fit(sparsemat(X), y) - epsilon = 1e-5 * (2 * 
rng.rand(1, n_features) - 1) - for sparsev in SPARSE_TYPES + (np.asarray,): - X_eps = sparsev(X[:n_test_pts] + epsilon) - y_pred = knn.predict(X_eps) - assert_array_equal(y_pred, y[:n_test_pts]) - - def test_KNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) @@ -716,26 +693,6 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40, assert_true(np.all(np.abs(y_pred - y_target) < 0.3)) -def test_kneighbors_regressor_sparse(n_samples=40, - n_features=5, - n_test_pts=10, - n_neighbors=5, - random_state=0): - # Test radius-based regression on sparse matrices - # Like the above, but with various types of sparse matrices - rng = np.random.RandomState(random_state) - X = 2 * rng.rand(n_samples, n_features) - 1 - y = ((X ** 2).sum(axis=1) < .25).astype(np.int) - - for sparsemat in SPARSE_TYPES: - knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, - algorithm='auto') - knn.fit(sparsemat(X), y) - for sparsev in SPARSE_OR_DENSE: - X2 = sparsev(X) - assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) - - def test_neighbors_iris(): # Sanity checks on the iris dataset # Puts three points of each label in the plane and performs a From adf110a76d25bda20ee611b65ee803eab1a64791 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 24 Oct 2017 16:12:37 -0400 Subject: [PATCH 35/51] bring tests for kneighbors on sparse data back, check for ValueError --- sklearn/neighbors/regression.py | 6 ++++ sklearn/neighbors/tests/test_neighbors.py | 44 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 461e14b433956..81628f4877979 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -9,6 +9,7 @@ # License: BSD 3 clause (C) INRIA, University of Amsterdam import numpy as np +from scipy.sparse import issparse from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin from .base import 
RadiusNeighborsMixin, SupervisedFloatMixin @@ -139,6 +140,11 @@ def predict(self, X): y : array of int, shape = [n_samples] or [n_samples, n_outputs] Target values """ + if issparse(X): + raise ValueError( + "Sparse matricies not supported for prediction. " + "Densify your matrix." + ) X = check_array(X, accept_sparse=False) neigh_dist, neigh_ind = self.kneighbors(X) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index fc7faac599879..d4a1fe373cedd 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -484,6 +484,29 @@ def test_RadiusNeighborsClassifier_multioutput(): assert_array_almost_equal(y_pred_mo, y_pred_so) +def test_kneighbors_classifier_sparse(n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0): + # Test k-NN classifier on sparse matrices + # Like the above, but with various types of sparse matrices + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + X *= X > .2 + y = ((X ** 2).sum(axis=1) < .5).astype(np.int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, + algorithm='auto') + knn.fit(sparsemat(X), y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + for sparsev in SPARSE_TYPES + (np.asarray,): + X_eps = sparsev(X[:n_test_pts] + epsilon) + y_pred = knn.predict(X_eps) + assert_array_equal(y_pred, y[:n_test_pts]) + + def test_KNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) @@ -693,6 +716,27 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40, assert_true(np.all(np.abs(y_pred - y_target) < 0.3)) +def test_kneighbors_regressor_sparse(n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0): + # Test radius-based regression on sparse matrices + # Like the above, but with various types of sparse matrices + rng = 
np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = ((X ** 2).sum(axis=1) < .25).astype(np.int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, + algorithm='auto') + knn.fit(sparsemat(X), y) + for sparsev in SPARSE_OR_DENSE: + X2 = sparsev(X) + assert_raises(ValueError, knn.predict, csr_matrix(X2)) + # assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) + + def test_neighbors_iris(): # Sanity checks on the iris dataset # Puts three points of each label in the plane and performs a From 35ae37288bc1c73b7447365d5a155ee0508368b9 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 24 Oct 2017 17:23:08 -0400 Subject: [PATCH 36/51] fix check_estimator_sparse_data() test to account for TypeError and ValueError for sparse matrices --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d118dec321c92..1f4b21b8f0119 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -434,7 +434,7 @@ def check_estimator_sparse_data(name, estimator_orig): if hasattr(estimator, 'predict_proba'): probs = estimator.predict_proba(X) assert_equal(probs.shape, (X.shape[0], 4)) - except TypeError as e: + except (TypeError, ValueError) as e: if 'sparse' not in repr(e).lower(): print("Estimator %s doesn't seem to fail gracefully on " "sparse data: error message state explicitly that " From 688376fb3168d96f8d6ba86b3a8dd2b94b8decb8 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 24 Oct 2017 19:00:48 -0400 Subject: [PATCH 37/51] PEP8 add newlines --- sklearn/utils/estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1f4b21b8f0119..3354fda2ac5fd 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -357,6 +357,7 @@ def
_is_32bit(): """Detect if process is 32bit Python.""" return struct.calcsize('P') * 8 == 32 + def _is_pairwise(estimator): """Returns True if estimator has a _pairwise attribute set to True. @@ -390,6 +391,7 @@ def _is_pairwise_metric(estimator): return bool(metric == 'precomputed') + def maybe_pairwise(X, estimator, kernel=linear_kernel): if len(X.shape) == 1: From a3e7b64c6ad9eb26425bfbcdd06dc844a4919ffa Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 25 Oct 2017 14:25:39 -0400 Subject: [PATCH 38/51] add Y argument (the input vector again) to pairwise_distances() hopefully mitigate weird Travis error for maybe_pairwise() --- sklearn/utils/estimator_checks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3354fda2ac5fd..5ee13455c674b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -398,8 +398,7 @@ def maybe_pairwise(X, estimator, kernel=linear_kernel): X = X.reshape(-1, 1) if _is_pairwise_metric(estimator): - # workaround for this function - return pairwise_distances(X, metric='mahalanobis') + return pairwise_distances(X, X, metric='mahalanobis') if _is_pairwise(estimator): return kernel(X, X) From 2b131f38dbac70c8f25ffdf6e13d0b0fb6802dc4 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Fri, 27 Oct 2017 10:34:55 -0400 Subject: [PATCH 39/51] use euclidean distance in maybe_pairwise() to try and mitigate weird travis ci error --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5ee13455c674b..da5f6dd0b898f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -398,7 +398,7 @@ def maybe_pairwise(X, estimator, kernel=linear_kernel): X = X.reshape(-1, 1) if _is_pairwise_metric(estimator): - return pairwise_distances(X, X, metric='mahalanobis') + return pairwise_distances(X, 
metric='euclidean') if _is_pairwise(estimator): return kernel(X, X) From 834918ca80bf5a1ce6c5df5db86d01a116f274b2 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 30 Oct 2017 09:52:01 -0400 Subject: [PATCH 40/51] change maybe_pairwise() name to pairwise_estimator_convert_X() --- sklearn/utils/estimator_checks.py | 46 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a4b51dc233313..60bfe520b6b33 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -392,7 +392,7 @@ def _is_pairwise_metric(estimator): return bool(metric == 'precomputed') -def maybe_pairwise(X, estimator, kernel=linear_kernel): +def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel): if len(X.shape) == 1: X = X.reshape(-1, 1) @@ -410,7 +410,7 @@ def check_estimator_sparse_data(name, estimator_orig): rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) X_csr = sparse.csr_matrix(X) y = (4 * rng.rand(40)).astype(np.int) # catch deprecation warnings @@ -458,7 +458,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): try: import pandas as pd X = np.array([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]]) - X = pd.DataFrame(maybe_pairwise(X, estimator_orig)) + X = pd.DataFrame(pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 2, 2, 2]) weights = pd.Series([1] * 6) try: @@ -479,7 +479,7 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = maybe_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) + X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(estimator, y) sample_weight = [3] * 10 @@ 
-491,7 +491,7 @@ def check_sample_weights_list(name, estimator_orig): def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible rng = np.random.RandomState(0) - X = maybe_pairwise(rng.rand(40, 10), estimator_orig) + X = pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) y = (X[:, 0] * 4).astype(np.int) estimator = clone(estimator_orig) @@ -539,7 +539,7 @@ def check_dict_unchanged(name, estimator_orig): else: X = 2 * rnd.uniform(size=(20, 3)) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) @@ -578,7 +578,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -625,7 +625,7 @@ def check_fit2d_predict1d(name, estimator_orig): # check by fitting a 2d array and predicting with a 1d array rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -679,7 +679,7 @@ def check_fit2d_1feature(name, estimator_orig): # informative message rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -852,7 +852,7 @@ def check_pipeline_consistency(name, estimator_orig): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= 
X.min() - X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) set_random_state(estimator) @@ -877,7 +877,7 @@ def check_fit_score_takes_y(name, estimator_orig): # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 3)) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -903,7 +903,7 @@ def check_fit_score_takes_y(name, estimator_orig): def check_estimators_dtypes(name, estimator_orig): rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) - X_train_32 = maybe_pairwise(X_train_32, estimator_orig) + X_train_32 = pairwise_estimator_convert_X(X_train_32, estimator_orig) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) @@ -949,7 +949,7 @@ def check_estimators_empty_data_messages(name, estimator_orig): def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. 
rnd = np.random.RandomState(0) - X_train_finite = maybe_pairwise(rnd.uniform(size=(10, 3)), + X_train_finite = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan @@ -1027,7 +1027,7 @@ def check_estimators_pickle(name, estimator_orig): # some estimators can't do features less than 0 X -= X.min() - X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) @@ -1202,7 +1202,7 @@ def check_classifiers_train(name, classifier_orig): classifier = clone(classifier_orig) if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB']: X -= X.min() - X = maybe_pairwise(X, classifier_orig) + X = pairwise_estimator_convert_X(X, classifier_orig) set_random_state(classifier) # raises error on malformed input for fit with assert_raises(ValueError, msg="The classifer {} does not" @@ -1286,7 +1286,7 @@ def check_estimators_fit_returns_self(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9, n_features=4) # some want non-negative input X -= X.min() - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1334,7 +1334,7 @@ def check_supervised_y_2d(name, estimator_orig): # These only work on 2d, so this test makes no sense return rnd = np.random.RandomState(0) - X = maybe_pairwise(rnd.uniform(size=(10, 3)), estimator_orig) + X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) y = np.arange(10) % 3 estimator = clone(estimator_orig) set_random_state(estimator) @@ -1368,7 +1368,7 @@ def check_classifiers_classes(name, classifier_orig): # We need to make sure that we have non negative data, for things # like NMF X -= X.min() - .1 - X = maybe_pairwise(X, classifier_orig) + X = pairwise_estimator_convert_X(X, classifier_orig) y_names = 
np.array(["one", "two", "three"])[y] for y_names in [y_names, y_names.astype('O')]: @@ -1400,7 +1400,7 @@ def check_classifiers_classes(name, classifier_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_int(name, regressor_orig): X, _ = _boston_subset() - X = maybe_pairwise(X[:50], regressor_orig) + X = pairwise_estimator_convert_X(X[:50], regressor_orig) rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) y = multioutput_estimator_convert_y_2d(regressor_orig, y) @@ -1428,7 +1428,7 @@ def check_regressors_int(name, regressor_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_regressors_train(name, regressor_orig): X, y = _boston_subset() - X = maybe_pairwise(X, regressor_orig) + X = pairwise_estimator_convert_X(X, regressor_orig) y = StandardScaler().fit_transform(y.reshape(-1, 1)) # X is already scaled y = y.ravel() regressor = clone(regressor_orig) @@ -1594,7 +1594,7 @@ def check_estimators_overwrite_params(name, estimator_orig): X, y = make_blobs(random_state=0, n_samples=9) # some want non-negative input X -= X.min() - X = maybe_pairwise(X, estimator_orig, kernel=rbf_kernel) + X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) y = multioutput_estimator_convert_y_2d(estimator, y) @@ -1669,7 +1669,7 @@ def check_sparsify_coefficients(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def check_classifier_data_not_an_array(name, estimator_orig): X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = [1, 1, 1, 2, 2, 2] y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -1678,7 +1678,7 @@ def check_classifier_data_not_an_array(name, estimator_orig): @ignore_warnings(category=DeprecationWarning) def 
check_regressor_data_not_an_array(name, estimator_orig): X, y = _boston_subset(n_samples=50) - X = maybe_pairwise(X, estimator_orig) + X = pairwise_estimator_convert_X(X, estimator_orig) y = multioutput_estimator_convert_y_2d(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) From 5032f3591bf7f98900ddba53956105d44b3079b1 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 30 Oct 2017 11:20:09 -0400 Subject: [PATCH 41/51] change test_kneighbors_regressor_sparse() to only check for error on precomputed sparse X. Re-allow KNeighborsRegressor to predict sparse X (that's not precomputed) --- sklearn/neighbors/regression.py | 8 ++++---- sklearn/neighbors/tests/test_neighbors.py | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 81628f4877979..9fc65303dac8d 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -140,12 +140,12 @@ def predict(self, X): y : array of int, shape = [n_samples] or [n_samples, n_outputs] Target values """ - if issparse(X): + if issparse(X) and self.metric == 'precomputed': raise ValueError( - "Sparse matricies not supported for prediction. " - "Densify your matrix." + "Sparse matricies not supported for prediction with " + "precomputed kernels. Densify your matrix." 
) - X = check_array(X, accept_sparse=False) + X = check_array(X, accept_sparse='csr') neigh_dist, neigh_ind = self.kneighbors(X) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d4a1fe373cedd..31e0a8c7e4d39 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -733,8 +733,11 @@ def test_kneighbors_regressor_sparse(n_samples=40, knn.fit(sparsemat(X), y) for sparsev in SPARSE_OR_DENSE: X2 = sparsev(X) - assert_raises(ValueError, knn.predict, csr_matrix(X2)) - # assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) + # sparse precomputed distance matrices not supported for prediction + if knn.metric == 'precomputed': + assert_raises(ValueError, knn.predict, csr_matrix(X2)) + else: + assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) def test_neighbors_iris(): From 892771f3af303d114f12d3a5ad43094546979b78 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 30 Oct 2017 11:22:07 -0400 Subject: [PATCH 42/51] PEP8 fix line length --- sklearn/utils/estimator_checks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 60bfe520b6b33..176daeecc2ebd 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -479,7 +479,8 @@ def check_sample_weights_list(name, estimator_orig): if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) + X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), + estimator_orig) y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(estimator, y) sample_weight = [3] * 10 From ad658394f93fc84b02439965f4468c24a3c56392 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 30 Oct 2017 11:53:55 -0400 Subject: [PATCH 43/51] PEP8 again --- sklearn/utils/estimator_checks.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 176daeecc2ebd..93a903c5997f3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -951,7 +951,7 @@ def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. rnd = np.random.RandomState(0) X_train_finite = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), - estimator_orig) + estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan X_train_inf = rnd.uniform(size=(10, 3)) From b56899b1ed5ffdcd84a1555e83f7c6aeb5e370d8 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 31 Oct 2017 20:18:37 -0400 Subject: [PATCH 44/51] change check_classifiers_train() test to check pairwise estimators for decision_function and predict_proba --- sklearn/utils/estimator_checks.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 93a903c5997f3..d0543afa3e919 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1227,7 +1227,7 @@ def check_classifiers_train(name, classifier_orig): # raises error on malformed input for predict if _is_pairwise(classifier): with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of features " + " raise an error when shape of X" "in predict is not equal to (n_test_samples," "n_training_samples)".format(name)): classifier.predict(X.reshape(-1, 1)) @@ -1252,7 +1252,14 @@ def check_classifiers_train(name, classifier_orig): assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function - if not _is_pairwise(classifier): + if _is_pairwise(classifier): + with assert_raises(ValueError, msg="The classifier {} does" + " not raise an error when the " + "shape of X in
decision_function is " + "not equal to (n_test_samples, " + "n_training_samples) in fit.".format(name)): + classifier.decision_function(X.reshape(-1, 1)) + else: with assert_raises(ValueError, msg="The classifier {} does" " not raise an error when the number " "of features in decision_function is " @@ -1269,11 +1276,19 @@ def check_classifiers_train(name, classifier_orig): # check that probas for all classes sum to one assert_allclose(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input for predict_proba - with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of features " - "in predict_proba is different from the number " - "of features in fit.".format(name)): - classifier.predict_proba(X.T) + if _is_pairwise(classifier_orig): + with assert_raises(ValueError, msg="The classifier {} does not" + " raise an error when the shape of X" + "in predict_proba is not equal to " + "(n_test_samples, n_training_samples)."\ + .format(name)): + classifier.predict_proba(X.reshape(-1, 1)) + else: + with assert_raises(ValueError, msg="The classifier {} does not" + " raise an error when the number of features " + "in predict_proba is different from the number " + "of features in fit.".format(name)): + classifier.predict_proba(X.T) if hasattr(classifier, "predict_log_proba"): # predict_log_proba is a transformation of predict_proba y_log_prob = classifier.predict_log_proba(X) From e055fb83b19c65a4e1aa33959709e14e45fb7fa2 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Tue, 31 Oct 2017 20:59:07 -0400 Subject: [PATCH 45/51] PEP8 line length fix --- sklearn/utils/estimator_checks.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d0543afa3e919..7355f0179b06b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1257,7 +1257,8 @@ def check_classifiers_train(name, classifier_orig): 
" not raise an error when the " "shape of X in decision_function is " "not equal to (n_test_samples, " - "n_training_samples) in fit.".format(name)): + "n_training_samples) in fit." + .format(name)): classifier.decision_function(X.reshape(-1, 1)) else: with assert_raises(ValueError, msg="The classifier {} does" @@ -1280,14 +1281,15 @@ def check_classifiers_train(name, classifier_orig): with assert_raises(ValueError, msg="The classifier {} does not" " raise an error when the shape of X" "in predict_proba is not equal to " - "(n_test_samples, n_training_samples)."\ + "(n_test_samples, n_training_samples)." .format(name)): classifier.predict_proba(X.reshape(-1, 1)) else: with assert_raises(ValueError, msg="The classifier {} does not" - " raise an error when the number of features " - "in predict_proba is different from the number " - "of features in fit.".format(name)): + " raise an error when the number of " + "features in predict_proba is different " + "from the number of features in fit." + .format(name)): classifier.predict_proba(X.T) if hasattr(classifier, "predict_log_proba"): # predict_log_proba is a transformation of predict_proba From efeb0671261bc41c98e2566d5251171b4045dc6b Mon Sep 17 00:00:00 2001 From: gkjohns Date: Wed, 1 Nov 2017 12:21:23 -0400 Subject: [PATCH 46/51] update whats_new with changes to estimator checks --- doc/whats_new/v0.20.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 0897f331ebda0..9faef68812ea7 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -181,3 +181,9 @@ Cluster - Deprecate ``pooling_func`` unused parameter in :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. 
+ +Changes to estimator checks +--------------------------- + +- Pairwise Estimators + From 3116c23d4363c2b7b06d158395769285ec53d362 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Sun, 5 Nov 2017 18:47:43 -0500 Subject: [PATCH 47/51] add change details to whats_new --- doc/whats_new/v0.20.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 9faef68812ea7..812f8daacdcca 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -185,5 +185,6 @@ Cluster Changes to estimator checks --------------------------- -- Pairwise Estimators - +- Allow tests in :func:`estimator_checks.check_estimator` to test functions + that accept pairwise data. + :issue:`9701` by :user:`Andreas Mueller ` From 42fa8f413a886828f1d9ce8fce0ef1055b901ae3 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 6 Nov 2017 11:59:48 -0500 Subject: [PATCH 48/51] remove unused lines in estimator_checks.pairwise_estimator_convert_X() --- sklearn/utils/estimator_checks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7355f0179b06b..40fcb1fdd069f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -394,9 +394,6 @@ def _is_pairwise_metric(estimator): def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel): - if len(X.shape) == 1: - X = X.reshape(-1, 1) - if _is_pairwise_metric(estimator): return pairwise_distances(X, metric='euclidean') if _is_pairwise(estimator): From 69d78764e735723204aa2754fcc7e5fc0695791f Mon Sep 17 00:00:00 2001 From: gkjohns Date: Mon, 6 Nov 2017 12:11:09 -0500 Subject: [PATCH 49/51] remove assert_raises() for precomputed metric in test_kneighbors_regressor_sparse(). 
Already checked using test_check_estimator_pairwise() --- doc/whats_new/v0.20.rst | 2 +- sklearn/neighbors/regression.py | 2 +- sklearn/neighbors/tests/test_neighbors.py | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 812f8daacdcca..13734bb828660 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -187,4 +187,4 @@ Changes to estimator checks - Allow tests in :func:`estimator_checks.check_estimator` to test functions that accept pairwise data. - :issue:`9701` by :user:`Andreas Mueller ` + :issue:`9701` by :user:`Kyle Johnson ` diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 9fc65303dac8d..b13f16cfd399e 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -142,7 +142,7 @@ def predict(self, X): """ if issparse(X) and self.metric == 'precomputed': raise ValueError( - "Sparse matricies not supported for prediction with " + "Sparse matrices not supported for prediction with " "precomputed kernels. Densify your matrix." 
) X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 31e0a8c7e4d39..052c83c71d2e7 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -733,11 +733,7 @@ def test_kneighbors_regressor_sparse(n_samples=40, knn.fit(sparsemat(X), y) for sparsev in SPARSE_OR_DENSE: X2 = sparsev(X) - # sparse precomputed distance matrices not supported for prediction - if knn.metric == 'precomputed': - assert_raises(ValueError, knn.predict, csr_matrix(X2)) - else: - assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) + assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) def test_neighbors_iris(): From 44f4dd657de8d9163a8aa00a730fd7d0007d95b6 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 9 Nov 2017 16:53:43 -0500 Subject: [PATCH 50/51] check if test data is sparse, check for ValueError instead of accuracy if yes --- sklearn/neighbors/tests/test_neighbors.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052c83c71d2e7..89baada19fff4 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -2,7 +2,7 @@ import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, - dok_matrix, lil_matrix) + dok_matrix, lil_matrix, issparse) from sklearn import metrics from sklearn import neighbors, datasets @@ -731,10 +731,22 @@ def test_kneighbors_regressor_sparse(n_samples=40, knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, algorithm='auto') knn.fit(sparsemat(X), y) + + knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, + metric='precomputed') + knn_pre.fit(pairwise_distances(X, metric='euclidean'), y) + for sparsev in SPARSE_OR_DENSE: X2 = sparsev(X) assert_true(np.mean(knn.predict(X2).round() == y) > 0.95) + X2_pre 
= sparsev(pairwise_distances(X, metric='euclidean')) + if issparse(sparsev(X2_pre)): + assert_raises(ValueError, knn_pre.predict, X2_pre) + else: + assert_true(\ + np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95) + def test_neighbors_iris(): # Sanity checks on the iris dataset From b5ede88436d69866ec648448de6a9aba18b0a4f5 Mon Sep 17 00:00:00 2001 From: gkjohns Date: Thu, 9 Nov 2017 16:55:09 -0500 Subject: [PATCH 51/51] remove redundant backslash --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 89baada19fff4..ceb53412018b8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -744,7 +744,7 @@ def test_kneighbors_regressor_sparse(n_samples=40, if issparse(sparsev(X2_pre)): assert_raises(ValueError, knn_pre.predict, X2_pre) else: - assert_true(\ + assert_true( np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)