From 8545aece705d213a6e4cb4779cead21b6a09d9c0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 2 Aug 2022 17:38:56 -0400 Subject: [PATCH 1/4] FIX Fixes OrdinalEncoder.inverse_transform nan encoded values --- sklearn/preprocessing/_encoders.py | 14 +++-- sklearn/preprocessing/tests/test_encoders.py | 62 +++++++++++++++++++- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 60a73373235f0..4e5fedd00ea81 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1371,19 +1371,23 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - labels = X[:, i].astype("int64", copy=False) + labels = X[:, i] # replace values of X[:, i] that were nan with actual indices if i in self._missing_indices: - X_i_mask = _get_mask(X[:, i], self.encoded_missing_value) + X_i_mask = _get_mask(labels, self.encoded_missing_value) labels[X_i_mask] = self._missing_indices[i] if self.handle_unknown == "use_encoded_value": - unknown_labels = labels == self.unknown_value - X_tr[:, i] = self.categories_[i][np.where(unknown_labels, 0, labels)] + unknown_labels = _get_mask(labels, self.unknown_value) + + known_labels = ~unknown_labels + X_tr[known_labels, i] = self.categories_[i][ + labels[known_labels].astype("int64", copy=False) + ] found_unknown[i] = unknown_labels else: - X_tr[:, i] = self.categories_[i][labels] + X_tr[:, i] = self.categories_[i][labels.astype("int64", copy=False)] # insert None values for unknown values if found_unknown: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 54e350c8d7acd..5b397d95d1579 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -735,7 +735,6 @@ def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): def test_ordinal_encoder_raise_categories_shape(): - X = np.array([["Low", 
"Medium", "High", "Medium", "Low"]], dtype=object).T cats = ["Low", "Medium", "High"] enc = OrdinalEncoder(categories=cats) @@ -1837,6 +1836,15 @@ def test_ordinal_encoder_unknown_missing_interaction(): X_test_trans = oe.transform(X_test) assert_allclose(X_test_trans, [[np.nan], [-3]]) + # Non-regression test for #24082 + X_roundtrip = oe.inverse_transform(X_test_trans) + + # np.nan is unknown so it maps to None + assert X_roundtrip[0][0] is None + + # -3 is the encoded missing value so it maps back to nan + assert np.isnan(X_roundtrip[1][0]) + @pytest.mark.parametrize("with_pandas", [True, False]) def test_ordinal_encoder_encoded_missing_value_error(with_pandas): @@ -1862,3 +1870,55 @@ def test_ordinal_encoder_encoded_missing_value_error(with_pandas): with pytest.raises(ValueError, match=error_msg): oe.fit(X) + + +@pytest.mark.parametrize( + "X_train, X_test_trans_expected, X_roundtrip_expected", + [ + ( + # missing value is not in training set + # inverse transform will considering encoded nan as unknown + np.array([["a"], ["1"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [None], [None]], dtype=object), + ), + ( + # missing value in training set, + # inverse transform will considering encoded nan as missing + np.array([[np.nan], ["1"], ["a"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [np.nan], [np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_unknown_missing_interaction_both_nan( + X_train, X_test_trans_expected, X_roundtrip_expected +): + """Check transform when unknown_value and encoded_missing_value is nan. + + Non-regression test for #24082. 
+ """ + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + ).fit(X_train) + + X_test = np.array([["1"], [np.nan], ["b"]]) + X_test_trans = oe.transform(X_test) + + # both nan and unknown are encoded as nan + assert_allclose(X_test_trans, X_test_trans_expected) + X_roundtrip = oe.inverse_transform(X_test_trans) + + n_samples = X_roundtrip_expected.shape[0] + for i in range(n_samples): + expected_val = X_roundtrip_expected[i, 0] + val = X_roundtrip[i, 0] + + if expected_val is None: + assert val is None + elif is_scalar_nan(expected_val): + assert np.isnan(val) + else: + assert val == expected_val From 26a65afedda57adc0392ac7fceac462c2deb47f3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 2 Aug 2022 17:42:22 -0400 Subject: [PATCH 2/4] DOC Adds whats new number --- doc/whats_new/v1.2.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index ba51e28229462..a7411ddaaa583 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -254,6 +254,13 @@ Changelog instead of failing with a low-level error message at predict-time. :pr:`23874` by :user:`Juan Gomez <2357juan>`. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :meth:`preprocessing.OrdinalEncoder.inverse_transform` correctly handles + use cases where `unknown_value` or `encoded_missing_value` is `nan`. :pr:`24087` + by `Thomas Fan`_. + :mod:`sklearn.svm` .................. From 0616c3c3118cd5971af0fd57eda5b71d60e99c2b Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 4 Aug 2022 10:02:08 -0400 Subject: [PATCH 3/4] CLN Reduce diff --- sklearn/preprocessing/tests/test_encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5b397d95d1579..c439faf50ee7d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -735,6 +735,7 @@ def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): def test_ordinal_encoder_raise_categories_shape(): + X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T cats = ["Low", "Medium", "High"] enc = OrdinalEncoder(categories=cats) From f14192f65b46be4e5eb2e878ef5e067fd2c6a824 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 4 Aug 2022 10:02:42 -0400 Subject: [PATCH 4/4] DOC Move whats new to 1.1.2 --- doc/whats_new/v1.1.rst | 7 +++++++ doc/whats_new/v1.2.rst | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 20801abbcdcbc..1702a8b7710cc 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -33,6 +33,13 @@ Changelog a node if there are duplicates in the dataset. :pr:`23395` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :meth:`preprocessing.OrdinalEncoder.inverse_transform` correctly handles + use cases where `unknown_value` or `encoded_missing_value` is `nan`. :pr:`24087` + by `Thomas Fan`_. + .. _changes_1_1_1: Version 1.1.1 diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index a7411ddaaa583..ba51e28229462 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -254,13 +254,6 @@ Changelog instead of failing with a low-level error message at predict-time. :pr:`23874` by :user:`Juan Gomez <2357juan>`. -:mod:`sklearn.preprocessing` -............................ 
- -- |Fix| :meth:`preprocessing.OrdinalEncoder.inverse_transform` correctly handles - use cases where `unknown_value` or `encoded_missing_value` is `nan`. :pr:`24087` - by `Thomas Fan`_. - :mod:`sklearn.svm` ..................