Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit f8adfa2

Browse files
glemaitre and ogrisel
authored and committed
[MRG] Ignore and pass-through NaNs in RobustScaler and robust_scale (#11308)
1 parent 0fc7ce6 commit f8adfa2

File tree

7 files changed

+171
-58
lines changed

7 files changed

+171
-58
lines changed

doc/whats_new/v0.20.rst

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -294,27 +294,28 @@ Preprocessing
294294
classes found which are ignored.
295295
:issue:`10913` by :user:`Rodrigo Agundez <rragundez>`.
296296

297-
- :class:`preprocessing.QuantileTransformer` handles and ignores NaN values.
298-
:issue:`10404` by :user:`Guillaume Lemaitre <glemaitre>`.
299-
300-
- Updated :class:`preprocessing.MinMaxScaler` and
301-
:func:`preprocessing.minmax_scale` to pass through NaN values.
302-
:issue:`10404` and :issue:`11243` by :user:`Lucija Gregov <LucijaGregov>` and
297+
- NaN values are ignored and handled in the following preprocessing methods:
298+
:class:`preprocessing.MaxAbsScaler`,
299+
:class:`preprocessing.MinMaxScaler`,
300+
:class:`preprocessing.RobustScaler`,
301+
:class:`preprocessing.StandardScaler`,
302+
:class:`preprocessing.PowerTransformer`,
303+
:class:`preprocessing.QuantileTransformer` classes and
304+
:func:`preprocessing.maxabs_scale`,
305+
:func:`preprocessing.minmax_scale`,
306+
:func:`preprocessing.robust_scale`,
307+
:func:`preprocessing.scale`,
308+
:func:`preprocessing.power_transform`,
309+
:func:`preprocessing.quantile_transform` functions respectively addressed in
310+
issues :issue:`11011`, :issue:`11005`, :issue:`11308`, :issue:`11206`,
311+
:issue:`11306`, and :issue:`10437`.
312+
By :user:`Lucija Gregov <LucijaGregov>` and
303313
:user:`Guillaume Lemaitre <glemaitre>`.
304314

305-
- :class:`preprocessing.StandardScaler` and :func:`preprocessing.scale`
306-
ignore and pass-through NaN values.
307-
:issue:`11206` by :user:`Guillaume Lemaitre <glemaitre>`.
308-
309-
- :class:`preprocessing.MaxAbsScaler` and :func:`preprocessing.maxabs_scale`
310-
handles and ignores NaN values.
311-
:issue:`11011` by `Lucija Gregov <LucihaGregov>` and
312-
:user:`Guillaume Lemaitre <glemaitre>`
313-
314-
- :class:`preprocessing.PowerTransformer` and
315-
:func:`preprocessing.power_transform` ignore and pass-through NaN values.
316-
:issue:`11306` by :user:`Guillaume Lemaitre <glemaitre>`.
317-
315+
- :class:`preprocessing.RobustScaler` and :func:`preprocessing.robust_scale`
316+
can be fitted using sparse matrices.
317+
:issue:`11308` by :user:`Guillaume Lemaitre <glemaitre>`.
318+
318319
Model evaluation and meta-estimators
319320

320321
- A scorer based on :func:`metrics.brier_score_loss` is also available.

sklearn/preprocessing/data.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from ..utils import check_array
2525
from ..utils.extmath import row_norms
2626
from ..utils.extmath import _incremental_mean_and_var
27-
from ..utils.fixes import boxcox, nanpercentile
27+
from ..utils.fixes import boxcox, nanpercentile, nanmedian
2828
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
2929
inplace_csr_row_normalize_l2)
3030
from ..utils.sparsefuncs import (inplace_column_scale,
@@ -1092,18 +1092,6 @@ def __init__(self, with_centering=True, with_scaling=True,
10921092
self.quantile_range = quantile_range
10931093
self.copy = copy
10941094

1095-
def _check_array(self, X, copy):
1096-
"""Makes sure centering is not enabled for sparse matrices."""
1097-
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
1098-
estimator=self, dtype=FLOAT_DTYPES)
1099-
1100-
if sparse.issparse(X):
1101-
if self.with_centering:
1102-
raise ValueError(
1103-
"Cannot center sparse matrices: use `with_centering=False`"
1104-
" instead. See docstring for motivation and alternatives.")
1105-
return X
1106-
11071095
def fit(self, X, y=None):
11081096
"""Compute the median and quantiles to be used for scaling.
11091097
@@ -1113,39 +1101,60 @@ def fit(self, X, y=None):
11131101
The data used to compute the median and quantiles
11141102
used for later scaling along the features axis.
11151103
"""
1116-
if sparse.issparse(X):
1117-
raise TypeError("RobustScaler cannot be fitted on sparse inputs")
1118-
X = self._check_array(X, self.copy)
1104+
# at fit, convert sparse matrices to csc for optimized computation of
1105+
# the quantiles
1106+
X = check_array(X, accept_sparse='csc', copy=self.copy, estimator=self,
1107+
dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
1108+
1109+
q_min, q_max = self.quantile_range
1110+
if not 0 <= q_min <= q_max <= 100:
1111+
raise ValueError("Invalid quantile range: %s" %
1112+
str(self.quantile_range))
1113+
11191114
if self.with_centering:
1120-
self.center_ = np.median(X, axis=0)
1115+
if sparse.issparse(X):
1116+
raise ValueError(
1117+
"Cannot center sparse matrices: use `with_centering=False`"
1118+
" instead. See docstring for motivation and alternatives.")
1119+
self.center_ = nanmedian(X, axis=0)
1120+
else:
1121+
self.center_ = None
11211122

11221123
if self.with_scaling:
1123-
q_min, q_max = self.quantile_range
1124-
if not 0 <= q_min <= q_max <= 100:
1125-
raise ValueError("Invalid quantile range: %s" %
1126-
str(self.quantile_range))
1124+
quantiles = []
1125+
for feature_idx in range(X.shape[1]):
1126+
if sparse.issparse(X):
1127+
column_nnz_data = X.data[X.indptr[feature_idx]:
1128+
X.indptr[feature_idx + 1]]
1129+
column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
1130+
column_data[:len(column_nnz_data)] = column_nnz_data
1131+
else:
1132+
column_data = X[:, feature_idx]
11271133

1128-
q = np.percentile(X, self.quantile_range, axis=0)
1129-
self.scale_ = (q[1] - q[0])
1134+
quantiles.append(nanpercentile(column_data,
1135+
self.quantile_range))
1136+
1137+
quantiles = np.transpose(quantiles)
1138+
1139+
self.scale_ = quantiles[1] - quantiles[0]
11301140
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
1141+
else:
1142+
self.scale_ = None
1143+
11311144
return self
11321145

11331146
def transform(self, X):
11341147
"""Center and scale the data.
11351148
1136-
Can be called on sparse input, provided that ``RobustScaler`` has been
1137-
fitted to dense input and ``with_centering=False``.
1138-
11391149
Parameters
11401150
----------
11411151
X : {array-like, sparse matrix}
11421152
The data used to scale along the specified axis.
11431153
"""
1144-
if self.with_centering:
1145-
check_is_fitted(self, 'center_')
1146-
if self.with_scaling:
1147-
check_is_fitted(self, 'scale_')
1148-
X = self._check_array(X, self.copy)
1154+
check_is_fitted(self, 'center_', 'scale_')
1155+
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
1156+
estimator=self, dtype=FLOAT_DTYPES,
1157+
force_all_finite='allow-nan')
11491158

11501159
if sparse.issparse(X):
11511160
if self.with_scaling:
@@ -1165,11 +1174,10 @@ def inverse_transform(self, X):
11651174
X : array-like
11661175
The data used to scale along the specified axis.
11671176
"""
1168-
if self.with_centering:
1169-
check_is_fitted(self, 'center_')
1170-
if self.with_scaling:
1171-
check_is_fitted(self, 'scale_')
1172-
X = self._check_array(X, self.copy)
1177+
check_is_fitted(self, 'center_', 'scale_')
1178+
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
1179+
estimator=self, dtype=FLOAT_DTYPES,
1180+
force_all_finite='allow-nan')
11731181

11741182
if sparse.issparse(X):
11751183
if self.with_scaling:
@@ -1242,7 +1250,8 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True,
12421250
(e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).
12431251
"""
12441252
X = check_array(X, accept_sparse=('csr', 'csc'), copy=False,
1245-
ensure_2d=False, dtype=FLOAT_DTYPES)
1253+
ensure_2d=False, dtype=FLOAT_DTYPES,
1254+
force_all_finite='allow-nan')
12461255
original_ndim = X.ndim
12471256

12481257
if original_ndim == 1:

sklearn/preprocessing/tests/test_common.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
from sklearn.preprocessing import scale
1616
from sklearn.preprocessing import power_transform
1717
from sklearn.preprocessing import quantile_transform
18+
from sklearn.preprocessing import robust_scale
1819

1920
from sklearn.preprocessing import MaxAbsScaler
2021
from sklearn.preprocessing import MinMaxScaler
2122
from sklearn.preprocessing import StandardScaler
2223
from sklearn.preprocessing import PowerTransformer
2324
from sklearn.preprocessing import QuantileTransformer
25+
from sklearn.preprocessing import RobustScaler
2426

2527
from sklearn.utils.testing import assert_array_equal
2628
from sklearn.utils.testing import assert_allclose
@@ -40,7 +42,9 @@ def _get_valid_samples_by_column(X, col):
4042
(StandardScaler(), scale, False, False),
4143
(StandardScaler(with_mean=False), scale, True, False),
4244
(PowerTransformer(), power_transform, False, True),
43-
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False)]
45+
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
46+
(RobustScaler(), robust_scale, False, False),
47+
(RobustScaler(with_centering=False), robust_scale, True, False)]
4448
)
4549
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
4650
# check that the preprocessing method let pass nan

sklearn/preprocessing/tests/test_data.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -906,6 +906,52 @@ def test_scale_input_finiteness_validation():
906906
scale, X)
907907

908908

909+
def test_robust_scaler_error_sparse():
910+
X_sparse = sparse.rand(1000, 10)
911+
scaler = RobustScaler(with_centering=True)
912+
err_msg = "Cannot center sparse matrices"
913+
with pytest.raises(ValueError, match=err_msg):
914+
scaler.fit(X_sparse)
915+
916+
917+
@pytest.mark.parametrize("with_centering", [True, False])
918+
@pytest.mark.parametrize("with_scaling", [True, False])
919+
@pytest.mark.parametrize("X", [np.random.randn(10, 3),
920+
sparse.rand(10, 3, density=0.5)])
921+
def test_robust_scaler_attributes(X, with_centering, with_scaling):
922+
# check consistent type of attributes
923+
if with_centering and sparse.issparse(X):
924+
pytest.skip("RobustScaler cannot center sparse matrix")
925+
926+
scaler = RobustScaler(with_centering=with_centering,
927+
with_scaling=with_scaling)
928+
scaler.fit(X)
929+
930+
if with_centering:
931+
assert isinstance(scaler.center_, np.ndarray)
932+
else:
933+
assert scaler.center_ is None
934+
if with_scaling:
935+
assert isinstance(scaler.scale_, np.ndarray)
936+
else:
937+
assert scaler.scale_ is None
938+
939+
940+
def test_robust_scaler_col_zero_sparse():
941+
# check that the scaler is working when there is not data materialized in a
942+
# column of a sparse matrix
943+
X = np.random.randn(10, 5)
944+
X[:, 0] = 0
945+
X = sparse.csr_matrix(X)
946+
947+
scaler = RobustScaler(with_centering=False)
948+
scaler.fit(X)
949+
assert scaler.scale_[0] == pytest.approx(1)
950+
951+
X_trans = scaler.transform(X)
952+
assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray())
953+
954+
909955
def test_robust_scaler_2d_arrays():
910956
# Test robust scaling of 2d array along first axis
911957
rng = np.random.RandomState(0)
@@ -919,6 +965,29 @@ def test_robust_scaler_2d_arrays():
919965
assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
920966

921967

968+
@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
969+
@pytest.mark.parametrize("strictly_signed",
970+
['positive', 'negative', 'zeros', None])
971+
def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
972+
# Check the equivalence of the fitting with dense and sparse matrices
973+
X_sparse = sparse.rand(1000, 5, density=density).tocsc()
974+
if strictly_signed == 'positive':
975+
X_sparse.data = np.abs(X_sparse.data)
976+
elif strictly_signed == 'negative':
977+
X_sparse.data = - np.abs(X_sparse.data)
978+
elif strictly_signed == 'zeros':
979+
X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
980+
X_dense = X_sparse.toarray()
981+
982+
scaler_sparse = RobustScaler(with_centering=False)
983+
scaler_dense = RobustScaler(with_centering=False)
984+
985+
scaler_sparse.fit(X_sparse)
986+
scaler_dense.fit(X_dense)
987+
988+
assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
989+
990+
922991
def test_robust_scaler_transform_one_row_csr():
923992
# Check RobustScaler on transforming csr matrix with one row
924993
rng = np.random.RandomState(0)

sklearn/utils/estimator_checks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
'RandomForestRegressor', 'Ridge', 'RidgeCV']
8080

8181
ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer',
82-
'MaxAbsScaler', 'MinMaxScaler', 'StandardScaler',
82+
'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler',
8383
'PowerTransformer', 'QuantileTransformer']
8484

8585

sklearn/utils/fixes.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,19 @@ def nanpercentile(a, q):
282282
from numpy import nanpercentile # noqa
283283

284284

285+
if np_version < (1, 9):
286+
def nanmedian(a, axis=None):
287+
if axis is None:
288+
data = a.reshape(-1)
289+
return np.median(np.compress(~np.isnan(data), data))
290+
else:
291+
data = a.T if not axis else a
292+
return np.array([np.median(np.compress(~np.isnan(row), row))
293+
for row in data])
294+
else:
295+
from numpy import nanmedian # noqa
296+
297+
285298
# Fix for behavior inconsistency on numpy.equal for object dtypes.
286299
# For numpy versions < 1.13, numpy.equal tests element-wise identity of objects
287300
# instead of equality. This fix returns the mask of NaNs in an array of

sklearn/utils/tests/test_fixes.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from sklearn.utils.fixes import divide
1616
from sklearn.utils.fixes import MaskedArray
17+
from sklearn.utils.fixes import nanmedian
1718
from sklearn.utils.fixes import nanpercentile
1819

1920

@@ -31,6 +32,22 @@ def test_masked_array_obj_dtype_pickleable():
3132
assert_array_equal(marr.mask, marr_pickled.mask)
3233

3334

35+
@pytest.mark.parametrize(
36+
"axis, expected_median",
37+
[(None, 4.0),
38+
(0, np.array([1., 3.5, 3.5, 4., 7., np.nan])),
39+
(1, np.array([1., 6.]))]
40+
)
41+
def test_nanmedian(axis, expected_median):
42+
X = np.array([[1, 1, 1, 2, np.nan, np.nan],
43+
[np.nan, 6, 6, 6, 7, np.nan]])
44+
median = nanmedian(X, axis=axis)
45+
if axis is None:
46+
assert median == pytest.approx(expected_median)
47+
else:
48+
assert_allclose(median, expected_median)
49+
50+
3451
@pytest.mark.parametrize(
3552
"a, q, expected_percentile",
3653
[(np.array([1, 2, 3, np.nan]), [0, 50, 100], np.array([1., 2., 3.])),

0 commit comments

Comments
 (0)