2424from ..utils import check_array
2525from ..utils .extmath import row_norms
2626from ..utils .extmath import _incremental_mean_and_var
27- from ..utils .fixes import boxcox , nanpercentile
27+ from ..utils .fixes import boxcox , nanpercentile , nanmedian
2828from ..utils .sparsefuncs_fast import (inplace_csr_row_normalize_l1 ,
2929 inplace_csr_row_normalize_l2 )
3030from ..utils .sparsefuncs import (inplace_column_scale ,
@@ -1092,18 +1092,6 @@ def __init__(self, with_centering=True, with_scaling=True,
10921092 self .quantile_range = quantile_range
10931093 self .copy = copy
10941094
1095- def _check_array (self , X , copy ):
1096- """Makes sure centering is not enabled for sparse matrices."""
1097- X = check_array (X , accept_sparse = ('csr' , 'csc' ), copy = self .copy ,
1098- estimator = self , dtype = FLOAT_DTYPES )
1099-
1100- if sparse .issparse (X ):
1101- if self .with_centering :
1102- raise ValueError (
1103- "Cannot center sparse matrices: use `with_centering=False`"
1104- " instead. See docstring for motivation and alternatives." )
1105- return X
1106-
def fit(self, X, y=None):
    """Compute the median and quantiles to be used for scaling.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The data used to compute the median and quantiles
        used for later scaling along the features axis.
        NaN entries are let through by the input validation and skipped
        by the NaN-aware median / percentile helpers.

    y : ignored
        Not used by this method.

    Returns
    -------
    self
        The estimator, with ``center_`` and ``scale_`` set (each is
        ``None`` when the corresponding option is disabled).

    Raises
    ------
    ValueError
        If ``quantile_range`` is not an ordered pair within [0, 100], or
        if centering is requested on a sparse matrix.
    """
    # Sparse input is requested in CSC layout so that the stored values of
    # each feature are one contiguous slice of ``X.data``.
    X = check_array(X, accept_sparse='csc', copy=self.copy, estimator=self,
                    dtype=FLOAT_DTYPES, force_all_finite='allow-nan')

    # The range is validated up front, whether or not scaling is enabled.
    lower, upper = self.quantile_range
    if not 0 <= lower <= upper <= 100:
        raise ValueError("Invalid quantile range: %s" %
                         str(self.quantile_range))

    is_sparse = sparse.issparse(X)

    if self.with_centering:
        if is_sparse:
            raise ValueError(
                "Cannot center sparse matrices: use `with_centering=False`"
                " instead. See docstring for motivation and alternatives.")
        self.center_ = nanmedian(X, axis=0)
    else:
        self.center_ = None

    if not self.with_scaling:
        self.scale_ = None
        return self

    n_samples, n_features = X.shape
    per_feature = []
    for j in range(n_features):
        if is_sparse:
            # Rebuild the dense column: explicit entries first, implicit
            # zeros after. Ordering is irrelevant for percentiles.
            stored = X.data[X.indptr[j]:X.indptr[j + 1]]
            column = np.zeros(shape=n_samples, dtype=X.dtype)
            column[:len(stored)] = stored
        else:
            column = X[:, j]
        per_feature.append(nanpercentile(column, self.quantile_range))

    # (n_features, 2) -> (2, n_features): row 0 is the lower quantile,
    # row 1 the upper quantile.
    bounds = np.transpose(per_feature)
    self.scale_ = _handle_zeros_in_scale(bounds[1] - bounds[0], copy=False)
    return self
11321145
11331146 def transform (self , X ):
11341147 """Center and scale the data.
11351148
1136- Can be called on sparse input, provided that ``RobustScaler`` has been
1137- fitted to dense input and ``with_centering=False``.
1138-
11391149 Parameters
11401150 ----------
11411151 X : {array-like, sparse matrix}
11421152 The data used to scale along the specified axis.
11431153 """
1144- if self .with_centering :
1145- check_is_fitted (self , 'center_' )
1146- if self .with_scaling :
1147- check_is_fitted (self , 'scale_' )
1148- X = self ._check_array (X , self .copy )
1154+ check_is_fitted (self , 'center_' , 'scale_' )
1155+ X = check_array (X , accept_sparse = ('csr' , 'csc' ), copy = self .copy ,
1156+ estimator = self , dtype = FLOAT_DTYPES ,
1157+ force_all_finite = 'allow-nan' )
11491158
11501159 if sparse .issparse (X ):
11511160 if self .with_scaling :
@@ -1165,11 +1174,10 @@ def inverse_transform(self, X):
11651174 X : array-like
11661175 The data used to scale along the specified axis.
11671176 """
1168- if self .with_centering :
1169- check_is_fitted (self , 'center_' )
1170- if self .with_scaling :
1171- check_is_fitted (self , 'scale_' )
1172- X = self ._check_array (X , self .copy )
1177+ check_is_fitted (self , 'center_' , 'scale_' )
1178+ X = check_array (X , accept_sparse = ('csr' , 'csc' ), copy = self .copy ,
1179+ estimator = self , dtype = FLOAT_DTYPES ,
1180+ force_all_finite = 'allow-nan' )
11731181
11741182 if sparse .issparse (X ):
11751183 if self .with_scaling :
@@ -1242,7 +1250,8 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True,
12421250 (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).
12431251 """
12441252 X = check_array (X , accept_sparse = ('csr' , 'csc' ), copy = False ,
1245- ensure_2d = False , dtype = FLOAT_DTYPES )
1253+ ensure_2d = False , dtype = FLOAT_DTYPES ,
1254+ force_all_finite = 'allow-nan' )
12461255 original_ndim = X .ndim
12471256
12481257 if original_ndim == 1 :
0 commit comments