From 31815ac4e773ed6762ef179d7c04d59c03de2b23 Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Thu, 16 Feb 2023 12:17:33 +0500
Subject: [PATCH 1/4] TST Common test for checking estimator deserialization
 from a read only buffer

---
 sklearn/tests/test_common.py | 67 +++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 6ef0eaa433d20..81f2944ce5110 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -15,9 +15,11 @@
 from itertools import product, chain
 from functools import partial
 
+import joblib
 import pytest
 import numpy as np
 
+from sklearn import clone
 from sklearn.cluster import (
     AffinityPropagation,
     Birch,
@@ -25,7 +27,12 @@
     OPTICS,
     SpectralClustering,
 )
-from sklearn.datasets import make_blobs
+from sklearn.datasets import (
+    make_blobs,
+    make_regression,
+    load_linnerud,
+    make_multilabel_classification,
+)
 from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding
 from sklearn.neighbors import (
     LocalOutlierFactor,
@@ -601,3 +608,61 @@ def test_global_output_transform_pandas(estimator):
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator)
+
+
+rng = np.random.RandomState(0)
+linnerud = load_linnerud()
+X_default, y_default = rng.rand(23, 14), rng.randint(1, 3, size=23)
+X_multi_out, y_multi_out = make_multilabel_classification(n_classes=3, random_state=0)
+X_isotonic, y_isotonic = make_regression(n_samples=10, n_features=1, random_state=0)
+X_square = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
+X_text = [
+    "This is the 1st document in my corpus.",
+    "This document is the 2nd sample.",
+    "And this is the 3rd one.",
+    "Is this the 4th document?",
+]
+X_dict = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
+X_rnd_projection = rng.rand(25, 3000)
+X_tsne = rng.randn(40, 2)
+
+ESTIMATOR_DATA = {
+    "CCA": (linnerud.data, linnerud.target),
+    "ClassifierChain": (X_multi_out, y_multi_out),
+    "CountVectorizer": (X_text, None),
+    "DictVectorizer": (X_dict, None),
+    "GaussianRandomProjection": (X_rnd_projection, None),
+    "HashingVectorizer": (X_text, None),
+    "IsotonicRegression": (X_isotonic, y_isotonic),
+    "KernelCenterer": (X_square, None),
+    "MultiLabelBinarizer": (X_multi_out, y_multi_out),
+    "MultiOutputClassifier": (X_multi_out, y_multi_out),
+    "MultiOutputRegressor": (X_multi_out, y_multi_out),
+    "MultiTaskElasticNet": (X_multi_out, y_multi_out),
+    "MultiTaskElasticNetCV": (X_multi_out, y_multi_out),
+    "MultiTaskLasso": (X_multi_out, y_multi_out),
+    "MultiTaskLassoCV": (X_multi_out, y_multi_out),
+    "PLSCanonical": (linnerud.data, linnerud.target),
+    "PLSSVD": (linnerud.data, linnerud.target),
+    "RegressorChain": (X_multi_out, y_multi_out),
+    "SparseRandomProjection": (X_rnd_projection, None),
+    "TfidfVectorizer": (X_text, None),
+    "TSNE": (X_tsne, None),
+}
+
+
+@pytest.mark.parametrize(
+    "estimator_org", _tested_estimators(), ids=_get_check_estimator_ids
+)
+def test_estimator_deserialization_from_readonly_buffer(estimator_org, tmpdir):
+    pickle_path = str(tmpdir.join("clf.joblib"))
+    estimator = clone(estimator_org)
+    estimator_name = estimator.__class__.__name__
+    default_data = (X_default, y_default)
+    X, y = ESTIMATOR_DATA.get(estimator_name, default_data)
+    if estimator_name in ("LabelBinarizer", "LabelEncoder", "MultiLabelBinarizer"):
+        estimator.fit(y)
+    else:
+        estimator.fit(X, y)
+    joblib.dump(estimator, pickle_path)
+    joblib.load(pickle_path, mmap_mode="r")
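
For reference, a standalone sketch of what this new common test exercises; the estimator, dataset, and file name below are illustrative and not part of the patch:

    import joblib
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(random_state=0)
    est = LogisticRegression().fit(X, y)
    joblib.dump(est, "clf.joblib")
    # mmap_mode="r" memory-maps the stored numpy arrays as read-only buffers;
    # the common test simply checks that deserializing this way does not raise
    reloaded = joblib.load("clf.joblib", mmap_mode="r")
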
From e87384644b85c38e40119858c45f5a9a32f59aab Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Thu, 16 Feb 2023 17:58:04 +0500
Subject: [PATCH 2/4] Adjust the current pickle estimator check with an
 argument of readonly_memmap

---
 sklearn/tests/test_common.py      | 67 +------------------------------
 sklearn/utils/estimator_checks.py | 26 +++++++-----
 2 files changed, 16 insertions(+), 77 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 81f2944ce5110..6ef0eaa433d20 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -15,11 +15,9 @@
 from itertools import product, chain
 from functools import partial
 
-import joblib
 import pytest
 import numpy as np
 
-from sklearn import clone
 from sklearn.cluster import (
     AffinityPropagation,
     Birch,
@@ -27,12 +25,7 @@
     OPTICS,
     SpectralClustering,
 )
-from sklearn.datasets import (
-    make_blobs,
-    make_regression,
-    load_linnerud,
-    make_multilabel_classification,
-)
+from sklearn.datasets import make_blobs
 from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding
 from sklearn.neighbors import (
     LocalOutlierFactor,
@@ -608,61 +601,3 @@ def test_global_output_transform_pandas(estimator):
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator)
-
-
-rng = np.random.RandomState(0)
-linnerud = load_linnerud()
-X_default, y_default = rng.rand(23, 14), rng.randint(1, 3, size=23)
-X_multi_out, y_multi_out = make_multilabel_classification(n_classes=3, random_state=0)
-X_isotonic, y_isotonic = make_regression(n_samples=10, n_features=1, random_state=0)
-X_square = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
-X_text = [
-    "This is the 1st document in my corpus.",
-    "This document is the 2nd sample.",
-    "And this is the 3rd one.",
-    "Is this the 4th document?",
-]
-X_dict = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
-X_rnd_projection = rng.rand(25, 3000)
-X_tsne = rng.randn(40, 2)
-
-ESTIMATOR_DATA = {
-    "CCA": (linnerud.data, linnerud.target),
-    "ClassifierChain": (X_multi_out, y_multi_out),
-    "CountVectorizer": (X_text, None),
-    "DictVectorizer": (X_dict, None),
-    "GaussianRandomProjection": (X_rnd_projection, None),
-    "HashingVectorizer": (X_text, None),
-    "IsotonicRegression": (X_isotonic, y_isotonic),
-    "KernelCenterer": (X_square, None),
-    "MultiLabelBinarizer": (X_multi_out, y_multi_out),
-    "MultiOutputClassifier": (X_multi_out, y_multi_out),
-    "MultiOutputRegressor": (X_multi_out, y_multi_out),
-    "MultiTaskElasticNet": (X_multi_out, y_multi_out),
-    "MultiTaskElasticNetCV": (X_multi_out, y_multi_out),
-    "MultiTaskLasso": (X_multi_out, y_multi_out),
-    "MultiTaskLassoCV": (X_multi_out, y_multi_out),
-    "PLSCanonical": (linnerud.data, linnerud.target),
-    "PLSSVD": (linnerud.data, linnerud.target),
-    "RegressorChain": (X_multi_out, y_multi_out),
-    "SparseRandomProjection": (X_rnd_projection, None),
-    "TfidfVectorizer": (X_text, None),
-    "TSNE": (X_tsne, None),
-}
-
-
-@pytest.mark.parametrize(
-    "estimator_org", _tested_estimators(), ids=_get_check_estimator_ids
-)
-def test_estimator_deserialization_from_readonly_buffer(estimator_org, tmpdir):
-    pickle_path = str(tmpdir.join("clf.joblib"))
-    estimator = clone(estimator_org)
-    estimator_name = estimator.__class__.__name__
-    default_data = (X_default, y_default)
-    X, y = ESTIMATOR_DATA.get(estimator_name, default_data)
-    if estimator_name in ("LabelBinarizer", "LabelEncoder", "MultiLabelBinarizer"):
-        estimator.fit(y)
-    else:
-        estimator.fit(X, y)
-    joblib.dump(estimator, pickle_path)
-    joblib.load(pickle_path, mmap_mode="r")
"MultiLabelBinarizer"): - estimator.fit(y) - else: - estimator.fit(X, y) - joblib.dump(estimator, pickle_path) - joblib.load(pickle_path, mmap_mode="r") diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bbfd12ad39b9c..9eb666c68984d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -129,6 +129,7 @@ def _yield_checks(estimator): # Test that estimators can be pickled, and once pickled # give the same answer as before. yield check_estimators_pickle + yield partial(check_estimators_pickle, readonly_memmap=True) yield check_estimator_get_tags_default_keys @@ -1870,7 +1871,7 @@ def check_nonsquare_error(name, estimator_orig): @ignore_warnings -def check_estimators_pickle(name, estimator_orig): +def check_estimators_pickle(name, estimator_orig, readonly_memmap=False): """Test that we can pickle all estimators.""" check_methods = ["predict", "transform", "decision_function", "predict_proba"] @@ -1899,16 +1900,19 @@ def check_estimators_pickle(name, estimator_orig): set_random_state(estimator) estimator.fit(X, y) - # pickle and unpickle! - pickled_estimator = pickle.dumps(estimator) - module_name = estimator.__module__ - if module_name.startswith("sklearn.") and not ( - "test_" in module_name or module_name.endswith("_testing") - ): - # strict check for sklearn estimators that are not implemented in test - # modules. - assert b"version" in pickled_estimator - unpickled_estimator = pickle.loads(pickled_estimator) + if readonly_memmap: + unpickled_estimator = create_memmap_backed_data(estimator) + else: + # pickle and unpickle! + pickled_estimator = pickle.dumps(estimator) + module_name = estimator.__module__ + if module_name.startswith("sklearn.") and not ( + "test_" in module_name or module_name.endswith("_testing") + ): + # strict check for sklearn estimators that are not implemented in test + # modules. 
From 9751d1dee9171413ba20c0ad0872fec051c312b1 Mon Sep 17 00:00:00 2001
From: OmarManzoor
Date: Fri, 17 Feb 2023 12:36:15 +0500
Subject: [PATCH 3/4] * Fix error in hist_gradient_boosting by using const
 memory views

* Mark check_estimators_pickle as xfail for SplineTransformer
---
 .../ensemble/_hist_gradient_boosting/_predictor.pyx |  4 ++--
 sklearn/preprocessing/_polynomial.py                | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
index dab18bdd1d49c..85f6b20490db0 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -14,7 +14,7 @@ from ._bitset cimport in_bitset_2d_memoryview
 
 
 def _predict_from_raw_data( # raw data = non-binned data
-        node_struct [:] nodes,
+        const node_struct [:] nodes,
         const X_DTYPE_C [:, :] numeric_data,
         const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
         const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
@@ -34,7 +34,7 @@ def _predict_from_raw_data( # raw data = non-binned data
 
 
 cdef inline Y_DTYPE_C _predict_one_from_raw_data(
-        node_struct [:] nodes,
+        const node_struct [:] nodes,
         const X_DTYPE_C [:, :] numeric_data,
         const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
         const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
index 46be5d2c13bbc..9e09715a50718 100644
--- a/sklearn/preprocessing/_polynomial.py
+++ b/sklearn/preprocessing/_polynomial.py
@@ -936,3 +936,13 @@ def transform(self, X):
         # We chose the last one.
         indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
         return XBS[:, indices]
+
+    def _more_tags(self):
+        return {
+            "_xfail_checks": {
+                "check_estimators_pickle": (
+                    "Current Scipy implementation of _bsplines does not"
+                    " support const memory views."
+                ),
+            }
+        }
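
The `_xfail_checks` entry added here is the standard estimator-tag mechanism the common tests use to mark one specific check as expected to fail for an estimator, together with a human-readable reason. A quick way to inspect it (this relies on the private `_get_tags` API, so treat it as an illustration only):

    from sklearn.preprocessing import SplineTransformer

    # maps the name of the xfailed check to the reason string defined above
    tags = SplineTransformer()._get_tags()
    print(tags.get("_xfail_checks", {}))
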
+ ), + } + } From 732760b1e52bc27aedac2bc0ee06e71bb6854770 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 17 Feb 2023 17:22:42 +0500 Subject: [PATCH 4/4] Revert the changes to predictor.pyx and instead add _xfail_checks --- .../ensemble/_hist_gradient_boosting/_predictor.pyx | 4 ++-- .../_hist_gradient_boosting/gradient_boosting.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 85f6b20490db0..dab18bdd1d49c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -14,7 +14,7 @@ from ._bitset cimport in_bitset_2d_memoryview def _predict_from_raw_data( # raw data = non-binned data - const node_struct [:] nodes, + node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, @@ -34,7 +34,7 @@ def _predict_from_raw_data( # raw data = non-binned data cdef inline Y_DTYPE_C _predict_one_from_raw_data( - const node_struct [:] nodes, + node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 31069fe14ee41..23e76b4307b6f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1165,7 +1165,15 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions def _more_tags(self): - return {"allow_nan": True} + return { + "allow_nan": True, + "_xfail_checks": { + "check_estimators_pickle": ( + "The memory views of the nodes parameter need to be defined" + "as read only in the Cython implementation." + ), + }, + } @abstractmethod def _get_loss(self, sample_weight):