diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 00c957448550f..d1cfe0d42239c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -817,6 +817,10 @@ Changelog :class:`preprocessing.KernelCenterer` :pr:`14336` by :user:`Gregory Dexter `. +- |Fix| :class:`preprocessing.QuantileTransformer` now guarantees the + `quantiles_` attribute to be completely sorted in non-decreasing manner. + :pr:`15751` by :user:`Tirth Patel `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 9227efa958b43..b047908842b38 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2262,6 +2262,11 @@ def _dense_fit(self, X, random_state): col = col.take(subsample_idx, mode='clip') self.quantiles_.append(np.nanpercentile(col, references)) self.quantiles_ = np.transpose(self.quantiles_) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def _sparse_fit(self, X, random_state): """Compute percentiles for sparse matrices. @@ -2305,6 +2310,11 @@ def _sparse_fit(self, X, random_state): self.quantiles_.append( np.nanpercentile(column_data, references)) self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) def fit(self, X, y=None): """Compute the quantiles used for transforming. diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 9a8e31d468f1c..cdff446cb336c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -25,6 +25,7 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import _convert_container from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -1532,6 +1533,26 @@ def test_quantile_transform_nan(): assert not np.isnan(transformer.quantiles_[:, 1:]).any() +@pytest.mark.parametrize("array_type", ['array', 'sparse']) +def test_quantile_transformer_sorted_quantiles(array_type): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15733 + # Taken from upstream bug report: + # https://github.com/numpy/numpy/issues/14685 + X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) + X = 0.1 * X.reshape(-1, 1) + X = _convert_container(X, array_type) + + n_quantiles = 100 + qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) + + # Check that the estimated quantile threasholds are monotically + # increasing: + quantiles = qt.quantiles_[:, 0] + assert len(quantiles) == 100 + assert all(np.diff(quantiles) >= 0) + + def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90),