From 7d9fafb574a289bb42ce266b86ad348f94747da8 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:10:27 -0500 Subject: [PATCH 1/9] Quick fix. --- sklearn/preprocessing/_target_encoder_fast.pyx | 4 ++-- sklearn/preprocessing/tests/test_target_encoder.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_target_encoder_fast.pyx b/sklearn/preprocessing/_target_encoder_fast.pyx index 39f3ebcf49995..12e15397ffeca 100644 --- a/sklearn/preprocessing/_target_encoder_fast.pyx +++ b/sklearn/preprocessing/_target_encoder_fast.pyx @@ -19,7 +19,7 @@ ctypedef fused Y_DTYPE: def _fit_encoding_fast( INT_DTYPE[:, ::1] X_int, - Y_DTYPE[:] y, + const Y_DTYPE[:] y, cnp.int64_t[::1] n_categories, double smooth, double y_mean, @@ -79,7 +79,7 @@ def _fit_encoding_fast( def _fit_encoding_fast_auto_smooth( INT_DTYPE[:, ::1] X_int, - Y_DTYPE[:] y, + const Y_DTYPE[:] y, cnp.int64_t[::1] n_categories, double y_mean, double y_variance, diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 248a13f88512d..a7a2d3ad66307 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -701,3 +701,14 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): # cardinality yet non-informative feature instead of the lower # cardinality yet informative feature: assert abs(coef[0]) < abs(coef[2]) + + +def test_27879(): + import pandas as pd + + pd.options.mode.copy_on_write = True + + df = pd.DataFrame({"x": ["a", "b", "c", "c"], "y": [4.0, 5.0, 6.0, 7.0]}) + t = TargetEncoder(target_type="continuous") + output = t.fit_transform(df[["x"]], df["y"]) + assert output.tolist() == df["y"].tolist() From 14f4255845b1e1802f4b3d880a5b4671549daae0 Mon Sep 17 00:00:00 2001 From: s-banach Date: Tue, 23 Jan 2024 13:36:21 -0500 Subject: [PATCH 2/9] fix 
test --- sklearn/preprocessing/tests/test_target_encoder.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index a7a2d3ad66307..4059396c8007d 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -704,11 +704,8 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): def test_27879(): - import pandas as pd - + pd = pytest.importorskip("pandas") pd.options.mode.copy_on_write = True df = pd.DataFrame({"x": ["a", "b", "c", "c"], "y": [4.0, 5.0, 6.0, 7.0]}) - t = TargetEncoder(target_type="continuous") - output = t.fit_transform(df[["x"]], df["y"]) - assert output.tolist() == df["y"].tolist() + TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) From 31bfdc8965dabe4fbf268714a744f9a41909b61c Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:27:02 -0500 Subject: [PATCH 3/9] Change pandas config in context manager --- sklearn/preprocessing/tests/test_target_encoder.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 4059396c8007d..72193e572911a 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -705,7 +705,6 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): def test_27879(): pd = pytest.importorskip("pandas") - pd.options.mode.copy_on_write = True - - df = pd.DataFrame({"x": ["a", "b", "c", "c"], "y": [4.0, 5.0, 6.0, 7.0]}) - TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) + with pd.option_context('mode.copy_on_write', True): + df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) + 
TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) From 056b767b25cbc786efc6cd08d376749d9741cd99 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:00:04 -0500 Subject: [PATCH 4/9] Black format test (Whoops!) --- sklearn/preprocessing/tests/test_target_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 72193e572911a..5e0f659e1eb2b 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -705,6 +705,6 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): def test_27879(): pd = pytest.importorskip("pandas") - with pd.option_context('mode.copy_on_write', True): + with pd.option_context("mode.copy_on_write", True): df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) From ef73bd95957544b6df9712ada80965ed514d389d Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 1 Feb 2024 22:11:41 -0500 Subject: [PATCH 5/9] Rename test, add pandas minversion to importorskip --- sklearn/preprocessing/tests/test_target_encoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 5e0f659e1eb2b..1f3de7b442fd1 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -703,8 +703,9 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): assert abs(coef[0]) < abs(coef[2]) -def test_27879(): - pd = pytest.importorskip("pandas") +def test_read_only_input(): + """Copy-on-write makes "y" read-only. 
See issue #27879.""" + pd = pytest.importorskip("pandas", minversion="2.0") with pd.option_context("mode.copy_on_write", True): df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) From 3dbc6af3d240bd619501c05ffd1e3e4700bcbdc0 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 1 Feb 2024 22:16:05 -0500 Subject: [PATCH 6/9] Update v1.4.rst --- doc/whats_new/v1.4.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index f255e3abbcab8..09b1f9c5398f5 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -105,6 +105,14 @@ Changelog and `axis=1`, as documented in the docstring. :pr:`28222` by :user:`Guillaume Lemaitre <glemaitre>`. + +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.TargetEncoder` no longer fails when `target_type="continuous"` and the input is read-only. + In particular, it now works with pandas copy-on-write mode enabled. + :pr:`28233` by :user:`John Hopfensperger <s-banach>`. + .. _changes_1_4: Version 1.4.0 From 82fa733606a297247954c197c88fa213ce998f23 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 1 Feb 2024 22:19:16 -0500 Subject: [PATCH 7/9] fix: use suggested test name --- sklearn/preprocessing/tests/test_target_encoder.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 1f3de7b442fd1..8c863c575ddad 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -703,8 +703,13 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): assert abs(coef[0]) < abs(coef[2]) -def test_read_only_input(): - """Copy-on-write makes "y" read-only. 
See issue #27879.""" +def test_pandas_copy_on_write(): + """ + Test cython code suceeds when y is read-only. + + The numpy array underlying df["y"] is read-only when copy-on-write is enabled. + Non-regression test for gh-27879. + """ pd = pytest.importorskip("pandas", minversion="2.0") with pd.option_context("mode.copy_on_write", True): df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) From 79ccd8fbfeee60d17ee1aeba4da6e5523c47c308 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:35:54 -0500 Subject: [PATCH 8/9] Waste more compute money for a spelling error Sorry, I use all my brain power at work --- sklearn/preprocessing/tests/test_target_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_target_encoder.py b/sklearn/preprocessing/tests/test_target_encoder.py index 8c863c575ddad..81b0f32d04d68 100644 --- a/sklearn/preprocessing/tests/test_target_encoder.py +++ b/sklearn/preprocessing/tests/test_target_encoder.py @@ -705,7 +705,7 @@ def test_target_encoding_for_linear_regression(smooth, global_random_seed): def test_pandas_copy_on_write(): """ - Test cython code suceeds when y is read-only. + Test target-encoder cython code when y is read-only. The numpy array underlying df["y"] is read-only when copy-on-write is enabled. Non-regression test for gh-27879. 
From abca02b9c27a4c67056d8bc963e5894bdfa7ddd2 Mon Sep 17 00:00:00 2001 From: John Hopfensperger <45150804+s-banach@users.noreply.github.com> Date: Fri, 2 Feb 2024 09:35:54 -0500 Subject: [PATCH 9/9] Update doc/whats_new/v1.4.rst Co-authored-by: Adrin Jalali --- doc/whats_new/v1.4.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 09b1f9c5398f5..cbea5d932e243 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -109,8 +109,9 @@ Changelog :mod:`sklearn.preprocessing` ............................ -- |Fix| :class:`preprocessing.TargetEncoder` no longer fails when `target_type="continuous"` and the input is read-only. - In particular, it now works with pandas copy-on-write mode enabled. +- |Fix| :class:`preprocessing.TargetEncoder` no longer fails when + `target_type="continuous"` and the input is read-only. In particular, it now + works with pandas copy-on-write mode enabled. :pr:`28233` by :user:`John Hopfensperger <s-banach>`. .. _changes_1_4: