From 31815ac4e773ed6762ef179d7c04d59c03de2b23 Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Thu, 16 Feb 2023 12:17:33 +0500
Subject: [PATCH 1/4] TST Common test for checking estimator deserialization
 from a read only buffer

---
 sklearn/tests/test_common.py | 67 +++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 6ef0eaa433d20..81f2944ce5110 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -15,9 +15,11 @@
 from itertools import product, chain
 from functools import partial
 
+import joblib
 import pytest
 import numpy as np
 
+from sklearn import clone
 from sklearn.cluster import (
     AffinityPropagation,
     Birch,
@@ -25,7 +27,12 @@
     OPTICS,
     SpectralClustering,
 )
-from sklearn.datasets import make_blobs
+from sklearn.datasets import (
+    make_blobs,
+    make_regression,
+    load_linnerud,
+    make_multilabel_classification,
+)
 from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding
 from sklearn.neighbors import (
     LocalOutlierFactor,
@@ -601,3 +608,61 @@ def test_global_output_transform_pandas(estimator):
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator)
+
+
+rng = np.random.RandomState(0)
+linnerud = load_linnerud()
+X_default, y_default = rng.rand(23, 14), rng.randint(1, 3, size=23)
+X_multi_out, y_multi_out = make_multilabel_classification(n_classes=3, random_state=0)
+X_isotonic, y_isotonic = make_regression(n_samples=10, n_features=1, random_state=0)
+X_square = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
+X_text = [
+    "This is the 1st document in my corpus.",
+    "This document is the 2nd sample.",
+    "And this is the 3rd one.",
+    "Is this the 4th document?",
+]
+X_dict = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
+X_rnd_projection = rng.rand(25, 3000)
+X_tsne = rng.randn(40, 2)
+
+ESTIMATOR_DATA = {
+    "CCA": (linnerud.data, linnerud.target),
+    "ClassifierChain": (X_multi_out, y_multi_out),
+    "CountVectorizer": (X_text, None),
+    "DictVectorizer": (X_dict, None),
+    "GaussianRandomProjection": (X_rnd_projection, None),
+    "HashingVectorizer": (X_text, None),
+    "IsotonicRegression": (X_isotonic, y_isotonic),
+    "KernelCenterer": (X_square, None),
+    "MultiLabelBinarizer": (X_multi_out, y_multi_out),
+    "MultiOutputClassifier": (X_multi_out, y_multi_out),
+    "MultiOutputRegressor": (X_multi_out, y_multi_out),
+    "MultiTaskElasticNet": (X_multi_out, y_multi_out),
+    "MultiTaskElasticNetCV": (X_multi_out, y_multi_out),
+    "MultiTaskLasso": (X_multi_out, y_multi_out),
+    "MultiTaskLassoCV": (X_multi_out, y_multi_out),
+    "PLSCanonical": (linnerud.data, linnerud.target),
+    "PLSSVD": (linnerud.data, linnerud.target),
+    "RegressorChain": (X_multi_out, y_multi_out),
+    "SparseRandomProjection": (X_rnd_projection, None),
+    "TfidfVectorizer": (X_text, None),
+    "TSNE": (X_tsne, None),
+}
+
+
+@pytest.mark.parametrize(
+    "estimator_org", _tested_estimators(), ids=_get_check_estimator_ids
+)
+def test_estimator_deserialization_from_readonly_buffer(estimator_org, tmpdir):
+    pickle_path = str(tmpdir.join("clf.joblib"))
+    estimator = clone(estimator_org)
+    estimator_name = estimator.__class__.__name__
+    default_data = (X_default, y_default)
+    X, y = ESTIMATOR_DATA.get(estimator_name, default_data)
+    if estimator_name in ("LabelBinarizer", "LabelEncoder", "MultiLabelBinarizer"):
+        estimator.fit(y)
+    else:
+        estimator.fit(X, y)
+    joblib.dump(estimator, pickle_path)
+    joblib.load(pickle_path, mmap_mode="r")
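
For reference, a standalone sketch of what this new common test exercises; the estimator, dataset, and file name below are illustrative and not part of the patch:

    import joblib
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(random_state=0)
    est = LogisticRegression().fit(X, y)
    joblib.dump(est, "clf.joblib")
    # mmap_mode="r" memory-maps the stored numpy arrays as read-only buffers;
    # the common test simply checks that deserializing this way does not raise
    reloaded = joblib.load("clf.joblib", mmap_mode="r")
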
From e87384644b85c38e40119858c45f5a9a32f59aab Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Thu, 16 Feb 2023 17:58:04 +0500
Subject: [PATCH 2/4] Adjust the current pickle estimator check with an
 argument of readonly_memmap

---
 sklearn/tests/test_common.py      | 67 +------------------------------
 sklearn/utils/estimator_checks.py | 26 +++++++-----
 2 files changed, 16 insertions(+), 77 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 81f2944ce5110..6ef0eaa433d20 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -15,11 +15,9 @@
 from itertools import product, chain
 from functools import partial
 
-import joblib
 import pytest
 import numpy as np
 
-from sklearn import clone
 from sklearn.cluster import (
     AffinityPropagation,
     Birch,
@@ -27,12 +25,7 @@
     OPTICS,
     SpectralClustering,
 )
-from sklearn.datasets import (
-    make_blobs,
-    make_regression,
-    load_linnerud,
-    make_multilabel_classification,
-)
+from sklearn.datasets import make_blobs
 from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding
 from sklearn.neighbors import (
     LocalOutlierFactor,
@@ -608,61 +601,3 @@ def test_global_output_transform_pandas(estimator):
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator)
-
-
-rng = np.random.RandomState(0)
-linnerud = load_linnerud()
-X_default, y_default = rng.rand(23, 14), rng.randint(1, 3, size=23)
-X_multi_out, y_multi_out = make_multilabel_classification(n_classes=3, random_state=0)
-X_isotonic, y_isotonic = make_regression(n_samples=10, n_features=1, random_state=0)
-X_square = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
-X_text = [
-    "This is the 1st document in my corpus.",
-    "This document is the 2nd sample.",
-    "And this is the 3rd one.",
-    "Is this the 4th document?",
-]
-X_dict = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
-X_rnd_projection = rng.rand(25, 3000)
-X_tsne = rng.randn(40, 2)
-
-ESTIMATOR_DATA = {
-    "CCA": (linnerud.data, linnerud.target),
-    "ClassifierChain": (X_multi_out, y_multi_out),
-    "CountVectorizer": (X_text, None),
-    "DictVectorizer": (X_dict, None),
-    "GaussianRandomProjection": (X_rnd_projection, None),
-    "HashingVectorizer": (X_text, None),
-    "IsotonicRegression": (X_isotonic, y_isotonic),
-    "KernelCenterer": (X_square, None),
-    "MultiLabelBinarizer": (X_multi_out, y_multi_out),
-    "MultiOutputClassifier": (X_multi_out, y_multi_out),
-    "MultiOutputRegressor": (X_multi_out, y_multi_out),
-    "MultiTaskElasticNet": (X_multi_out, y_multi_out),
-    "MultiTaskElasticNetCV": (X_multi_out, y_multi_out),
-    "MultiTaskLasso": (X_multi_out, y_multi_out),
-    "MultiTaskLassoCV": (X_multi_out, y_multi_out),
-    "PLSCanonical": (linnerud.data, linnerud.target),
-    "PLSSVD": (linnerud.data, linnerud.target),
-    "RegressorChain": (X_multi_out, y_multi_out),
-    "SparseRandomProjection": (X_rnd_projection, None),
-    "TfidfVectorizer": (X_text, None),
-    "TSNE": (X_tsne, None),
-}
-
-
-@pytest.mark.parametrize(
-    "estimator_org", _tested_estimators(), ids=_get_check_estimator_ids
-)
-def test_estimator_deserialization_from_readonly_buffer(estimator_org, tmpdir):
-    pickle_path = str(tmpdir.join("clf.joblib"))
-    estimator = clone(estimator_org)
-    estimator_name = estimator.__class__.__name__
-    default_data = (X_default, y_default)
-    X, y = ESTIMATOR_DATA.get(estimator_name, default_data)
-    if estimator_name in ("LabelBinarizer", "LabelEncoder", "MultiLabelBinarizer"):
-        estimator.fit(y)
-    else:
-        estimator.fit(X, y)
-    joblib.dump(estimator, pickle_path)
-    joblib.load(pickle_path, mmap_mode="r")
"MultiLabelBinarizer"): - estimator.fit(y) - else: - estimator.fit(X, y) - joblib.dump(estimator, pickle_path) - joblib.load(pickle_path, mmap_mode="r") diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bbfd12ad39b9c..9eb666c68984d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -129,6 +129,7 @@ def _yield_checks(estimator): # Test that estimators can be pickled, and once pickled # give the same answer as before. yield check_estimators_pickle + yield partial(check_estimators_pickle, readonly_memmap=True) yield check_estimator_get_tags_default_keys @@ -1870,7 +1871,7 @@ def check_nonsquare_error(name, estimator_orig): @ignore_warnings -def check_estimators_pickle(name, estimator_orig): +def check_estimators_pickle(name, estimator_orig, readonly_memmap=False): """Test that we can pickle all estimators.""" check_methods = ["predict", "transform", "decision_function", "predict_proba"] @@ -1899,16 +1900,19 @@ def check_estimators_pickle(name, estimator_orig): set_random_state(estimator) estimator.fit(X, y) - # pickle and unpickle! - pickled_estimator = pickle.dumps(estimator) - module_name = estimator.__module__ - if module_name.startswith("sklearn.") and not ( - "test_" in module_name or module_name.endswith("_testing") - ): - # strict check for sklearn estimators that are not implemented in test - # modules. - assert b"version" in pickled_estimator - unpickled_estimator = pickle.loads(pickled_estimator) + if readonly_memmap: + unpickled_estimator = create_memmap_backed_data(estimator) + else: + # pickle and unpickle! + pickled_estimator = pickle.dumps(estimator) + module_name = estimator.__module__ + if module_name.startswith("sklearn.") and not ( + "test_" in module_name or module_name.endswith("_testing") + ): + # strict check for sklearn estimators that are not implemented in test + # modules. 
From 9751d1dee9171413ba20c0ad0872fec051c312b1 Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Fri, 17 Feb 2023 12:36:15 +0500
Subject: [PATCH 3/4] * Fix error in hist_gradient_boosting by using const
 memory views

* Mark check_estimators_pickle as xfail for SplineTransformer
---
 .../ensemble/_hist_gradient_boosting/_predictor.pyx |  4 ++--
 sklearn/preprocessing/_polynomial.py                | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
index dab18bdd1d49c..85f6b20490db0 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -14,7 +14,7 @@ from ._bitset cimport in_bitset_2d_memoryview
 
 
 def _predict_from_raw_data( # raw data = non-binned data
-        node_struct [:] nodes,
+        const node_struct [:] nodes,
         const X_DTYPE_C [:, :] numeric_data,
         const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
         const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
@@ -34,7 +34,7 @@ def _predict_from_raw_data( # raw data = non-binned data
 
 
 cdef inline Y_DTYPE_C _predict_one_from_raw_data(
-        node_struct [:] nodes,
+        const node_struct [:] nodes,
         const X_DTYPE_C [:, :] numeric_data,
         const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
         const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
index 46be5d2c13bbc..9e09715a50718 100644
--- a/sklearn/preprocessing/_polynomial.py
+++ b/sklearn/preprocessing/_polynomial.py
@@ -936,3 +936,13 @@ def transform(self, X):
         # We chose the last one.
         indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
         return XBS[:, indices]
+
+    def _more_tags(self):
+        return {
+            "_xfail_checks": {
+                "check_estimators_pickle": (
+                    "Current Scipy implementation of _bsplines does not"
+                    " support const memory views."
+                ),
+            }
+        }
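
The `_xfail_checks` entry added here is the standard estimator-tag mechanism the common tests use to mark one specific check as expected to fail for an estimator, together with a human-readable reason. A quick way to inspect it (this relies on the private `_get_tags` API, so treat it as an illustration only):

    from sklearn.preprocessing import SplineTransformer

    # maps the name of the xfailed check to the reason string defined above
    tags = SplineTransformer()._get_tags()
    print(tags.get("_xfail_checks", {}))
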
+ ), + } + } From 732760b1e52bc27aedac2bc0ee06e71bb6854770 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 17 Feb 2023 17:22:42 +0500 Subject: [PATCH 4/4] Revert the changes to predictor.pyx and instead add _xfail_checks --- .../ensemble/_hist_gradient_boosting/_predictor.pyx | 4 ++-- .../_hist_gradient_boosting/gradient_boosting.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 85f6b20490db0..dab18bdd1d49c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -14,7 +14,7 @@ from ._bitset cimport in_bitset_2d_memoryview def _predict_from_raw_data( # raw data = non-binned data - const node_struct [:] nodes, + node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, @@ -34,7 +34,7 @@ def _predict_from_raw_data( # raw data = non-binned data cdef inline Y_DTYPE_C _predict_one_from_raw_data( - const node_struct [:] nodes, + node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 31069fe14ee41..23e76b4307b6f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1165,7 +1165,15 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions def _more_tags(self): - return {"allow_nan": True} + return { + "allow_nan": True, + "_xfail_checks": { + "check_estimators_pickle": ( + "The memory views of the nodes parameter need to be defined" + "as read only in the Cython implementation." + ), + }, + } @abstractmethod def _get_loss(self, sample_weight):