From b13fcf55ea0886c15496d442f59ad8b152617d36 Mon Sep 17 00:00:00 2001 From: tirthasheshpatel Date: Sun, 1 Dec 2019 23:21:26 +0530 Subject: [PATCH 01/11] FIX: order of values of self.quantiles_ in QuantileTransformer --- sklearn/preprocessing/_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index ef8b9c6db9e3b..e7a80a94c8498 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,6 +2262,8 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) + self.quantiles_ = np.minimum.accumulate( + self.quantiles_[::-1])[::-1] def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. @@ -2305,6 +2307,8 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) + self.quantiles_ = np.minimum.accumulate( + self.quantiles_[::-1])[::-1] def fit(self, X, y=None): """Compute the quantiles used for transforming. 
From 7bd784653935881356e99b85e1ccfea591f2f12d Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Mon, 2 Dec 2019 15:55:16 +0530 Subject: [PATCH 02/11] FIX: order of values in self.quantiles_ in QuantileTransformer --- sklearn/preprocessing/_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index e7a80a94c8498..b4796b9fd019b 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,8 +2262,8 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) - self.quantiles_ = np.minimum.accumulate( - self.quantiles_[::-1])[::-1] + self.quantiles_ = np.maximum.accumulate( + self.quantiles_) def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. @@ -2307,8 +2307,8 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) - self.quantiles_ = np.minimum.accumulate( - self.quantiles_[::-1])[::-1] + self.quantiles_ = np.maximum.accumulate( + self.quantiles_) def fit(self, X, y=None): """Compute the quantiles used for transforming. 
From 1196430bac1c802d2bb0e3532794a74741725a22 Mon Sep 17 00:00:00 2001 From: tirthasheshpatel Date: Mon, 2 Dec 2019 19:02:25 +0530 Subject: [PATCH 03/11] add comment explaining changes --- sklearn/preprocessing/_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index b4796b9fd019b..25ad9a13f4401 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2307,8 +2307,8 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) - self.quantiles_ = np.maximum.accumulate( - self.quantiles_) + # make sure the quantiles are monotonically increasing + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def fit(self, X, y=None): """Compute the quantiles used for transforming. From 3816224620255a982a8a279e4c4633b5267c81f0 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Mon, 2 Dec 2019 21:23:26 +0530 Subject: [PATCH 04/11] Update _data.py --- sklearn/preprocessing/_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 25ad9a13f4401..a1fbbe8d38b92 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,8 +2262,7 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) - self.quantiles_ = np.maximum.accumulate( - self.quantiles_) + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. 
From b47a78699f638c8a8c19b44e3f6646aa2c9d6052 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Mon, 2 Dec 2019 21:24:17 +0530 Subject: [PATCH 05/11] Update _data.py --- sklearn/preprocessing/_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index a1fbbe8d38b92..16c7b3214661f 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,6 +2262,7 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) + # make sure the quantiles are monotonically increasing self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): From 3eb3aa9f7c34232674806ae3414413f7d9d1d9d5 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Fri, 6 Dec 2019 22:33:52 +0530 Subject: [PATCH 06/11] Update sklearn/preprocessing/_data.py Add suggested docstring... 
Co-Authored-By: Guillaume Lemaitre --- sklearn/preprocessing/_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 16c7b3214661f..125049bd09b84 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,7 +2262,8 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) - # make sure the quantiles are monotonically increasing + # due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): From ebe75907df9bdf02735fdc038ea7910522960149 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Fri, 6 Dec 2019 22:34:19 +0530 Subject: [PATCH 07/11] Update sklearn/preprocessing/_data.py Add suggested docstring Co-Authored-By: Guillaume Lemaitre --- sklearn/preprocessing/_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 125049bd09b84..64691d83b0651 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2308,6 +2308,7 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, # make sure the quantiles are monotonically increasing self.quantiles_ = np.maximum.accumulate(self.quantiles_) From ec222ea30c0de571cc5e1d0397dd6a1ba5924b61 Mon Sep 17 00:00:00 2001 From: tirthasheshpatel Date: Fri, 6 Dec 2019 22:47:01 +0530 Subject: [PATCH 08/11] add fix in whats_new --- doc/whats_new/v0.22.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 
e14601c1b52a7..ff3d922ddfbad 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -768,6 +768,10 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. +- |Fix| :class:`perprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.model_selection` .............................. From c9cf96e7393b6a02943284bb9aeaf290f15f3f6d Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Sat, 7 Dec 2019 16:42:17 +0530 Subject: [PATCH 09/11] Update doc/whats_new/v0.22.rst Co-Authored-By: fcharras --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ff3d922ddfbad..484600701a2a4 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -768,7 +768,7 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. -- |Fix| :class:`perprocessing.QuantileTransformer` now guarantees the +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the `quantiles_` attribute to be completely sorted in non-decreasing manner. :pr:`15751` by :user:`Tirth Patel `. 
From 911859db678735a6a5b9b0a9e4dd785654b20fda Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 21 Dec 2019 15:36:15 +0100 Subject: [PATCH 10/11] Add non-regression test + reference to upstream numpy issue --- sklearn/preprocessing/_data.py | 8 ++++++-- sklearn/preprocessing/tests/test_data.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 64691d83b0651..27a752813dd2a 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,8 +2262,10 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) - # due to floating-point precision error in `np.nanpercentile`, - # make sure that quantiles are monotonically increasing + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. 
+ # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): @@ -2310,6 +2312,8 @@ def _sparse_fit(self, X, random_state): self.quantiles_ = np.transpose(self.quantiles_) # due to floating-point precision error in `np.nanpercentile`, # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) def fit(self, X, y=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 060719200fa99..5926bff21acd4 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1533,6 +1533,27 @@ def test_quantile_transform_nan(): assert not np.isnan(transformer.quantiles_[:, 1:]).any() +@pytest.mark.parametrize("sparse_data", [False, True]) +def test_quantile_transformer_sorted_quantiles(sparse_data): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15733 + # Taken from upstream bug report: + # https://github.com/numpy/numpy/issues/14685 + X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) + X = 0.1 * X.reshape(-1, 1) + if sparse_data: + X = sparse.csc_matrix(X) + + n_quantiles = 100 + qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) + + # Check that the estimated quantile thresholds are monotonically + # increasing: + quantiles = qt.quantiles_[:, 0] + assert len(quantiles) == 100 + assert all(np.diff(quantiles) >= 0) + + def test_deprecated_quantile_transform_copy(): future_message = ("The default value of `copy` will change from False to " "True in 0.23 in order to make it more consistent with " From f81b17c9cc598fc4413ce145de8e947464fb3de2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 21 Dec 2019 15:48:22 +0100 Subject: [PATCH 11/11] Use _convert_container --- 
sklearn/preprocessing/tests/test_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a7d49d856a7d1..cdff446cb336c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -25,6 +25,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import _convert_container from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -1532,16 +1533,15 @@ def test_quantile_transform_nan(): assert not np.isnan(transformer.quantiles_[:, 1:]).any() -@pytest.mark.parametrize("sparse_data", [False, True]) -def test_quantile_transformer_sorted_quantiles(sparse_data): +@pytest.mark.parametrize("array_type", ['array', 'sparse']) +def test_quantile_transformer_sorted_quantiles(array_type): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15733 # Taken from upstream bug report: # https://github.com/numpy/numpy/issues/14685 X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) X = 0.1 * X.reshape(-1, 1) - if sparse_data: - X = sparse.csc_matrix(X) + X = _convert_container(X, array_type) n_quantiles = 100 qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)