Commit 495fdd9

ENH Add "per_feature" option to Scaler classes
Defaults to True, which preserves the previous behavior. Also allows the scaler functions to take `axis=None` in addition to 0 or 1. Includes some tweaks to the sparsefuncs helper functions to handle `axis=None` inputs, and a new function `sparse_mean_variance` in "sparsefuncs_fast.pyx" to find the variance of a sparse array, which did not seem to exist yet. The latter is one line in pure Python, but writing it in Cython is faster and avoids extra memory use (as long as the input array has dtype np.float32 or np.float64).
1 parent 8b6a4e3 commit 495fdd9
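
The "one line in pure Python" mentioned above is not shown in the diff; presumably it is the usual shifted-moments identity. A minimal sketch of what `sparse_mean_variance(X)` computes over the whole array, assuming a SciPy sparse input (the pure-Python body here is a reconstruction, not code from the patch):

    import numpy as np
    import scipy.sparse as sp

    X = sp.rand(100, 20, density=0.1, format='csr', random_state=0)
    # variance over all entries, zeros included: E[X**2] - E[X]**2
    var = X.multiply(X).mean() - X.mean() ** 2
    # cross-check against the dense equivalent
    assert np.isclose(var, X.toarray().var())

The Cython version avoids materializing X.multiply(X) and the dense copy, which is where the speed and memory win described in the message comes from.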

File tree: 9 files changed, +2674 / -1739 lines


doc/modules/preprocessing.rst

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ This class is hence suitable for use in the early steps of a
 
   >>> scaler = preprocessing.StandardScaler().fit(X)
   >>> scaler
-  StandardScaler(copy=True, with_mean=True, with_std=True)
+  StandardScaler(copy=True, per_feature=True, with_mean=True, with_std=True)
 
   >>> scaler.mean_  # doctest: +ELLIPSIS
   array([ 1. ...,  0. ...,  0.33...])

sklearn/pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -375,7 +375,7 @@ def make_pipeline(*steps):
     >>> from sklearn.preprocessing import StandardScaler
     >>> make_pipeline(StandardScaler(), GaussianNB())  # doctest: +NORMALIZE_WHITESPACE
     Pipeline(steps=[('standardscaler',
-                     StandardScaler(copy=True, with_mean=True, with_std=True)),
+                     StandardScaler(copy=True, per_feature=True, with_mean=True, with_std=True)),
                     ('gaussiannb', GaussianNB())])
 
     Returns

sklearn/preprocessing/data.py

Lines changed: 86 additions & 53 deletions
@@ -52,15 +52,14 @@ def _mean_and_std(X, axis=0, with_mean=True, with_std=True):
     Zero valued std components are reset to 1.0 to avoid NaNs when scaling.
     """
     X = np.asarray(X)
-    Xr = np.rollaxis(X, axis)
 
     if with_mean:
-        mean_ = Xr.mean(axis=0)
+        mean_ = X.mean(axis=axis)
     else:
         mean_ = None
 
     if with_std:
-        std_ = Xr.std(axis=0)
+        std_ = X.std(axis=axis)
         std_ = _handle_zeros_in_scale(std_)
     else:
         std_ = None
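
With `np.rollaxis` gone, `_mean_and_std` leans directly on NumPy's reduction semantics, where `axis=None` collapses everything to a scalar. For reference:

    import numpy as np

    X = np.array([[1., 2.], [3., 4.]])
    X.mean(axis=0)     # per-feature means -> array([2., 3.])
    X.mean(axis=1)     # per-sample means  -> array([1.5, 3.5])
    X.mean(axis=None)  # grand mean over all four entries -> 2.5, a scalar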
@@ -95,10 +94,10 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     X : array-like or CSR matrix.
         The data to center and scale.
 
-    axis : int (0 by default)
+    axis : int or None (0 by default)
         axis used to compute the means and standard deviations along. If 0,
-        independently standardize each feature, otherwise (if 1) standardize
-        each sample.
+        independently standardize each feature, if 1 standardize each
+        sample, and if None standardize over all of the data taken together.
 
     with_mean : boolean, True by default
         If True, center the data before scaling.
@@ -158,10 +157,11 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
             X = X.copy()
         # Xr is a view on the original array that enables easy use of
         # broadcasting on the axis in which we are interested in
-        Xr = np.rollaxis(X, axis)
+        Xr = np.rollaxis(X, (0 if axis is None else axis))
+        _check_axis = None if axis is None else 0
         if with_mean:
             Xr -= mean_
-            mean_1 = Xr.mean(axis=0)
+            mean_1 = Xr.mean(axis=_check_axis)
             # Verify that mean_1 is 'close to zero'. If X contains very
             # large values, mean_1 can also be very large, due to a lack of
             # precision of mean_. In this case, a pre-scaling of the
@@ -177,7 +177,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
         if with_std:
             Xr /= std_
             if with_mean:
-                mean_2 = Xr.mean(axis=0)
+                mean_2 = Xr.mean(axis=_check_axis)
                 # If mean_2 is not 'close to zero', it comes from the fact that
                 # std_ is very small so that mean_2 = mean_1/std_ > 0, even if
                 # mean_1 was close to zero. The problem is thus essentially due
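
The new `_check_axis` keeps this verification consistent across modes: for axis=0 or 1 the check averages over the rolled first axis, while for axis=None both the statistics and the re-check reduce over the whole array. A standalone sketch of the axis=None path (variable names mirror the diff, but this is an illustration rather than the patched function):

    import numpy as np

    X = np.random.RandomState(0).rand(4, 3)
    mean_ = X.mean(axis=None)      # scalar statistic
    Xr = np.rollaxis(X, 0)         # no-op view; axis=None is mapped to 0
    Xr -= mean_                    # subtracting a scalar broadcasts everywhere
    mean_1 = Xr.mean(axis=None)    # the 'close to zero' re-check, also a scalar
    assert abs(mean_1) < 1e-12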
@@ -196,9 +196,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 class MinMaxScaler(BaseEstimator, TransformerMixin):
     """Standardizes features by scaling each feature to a given range.
 
-    This estimator scales and translates each feature individually such
-    that it is in the given range on the training set, i.e. between
-    zero and one.
+    This estimator scales and translates each feature individually (unless
+    initialized with "per_feature=False") such that it is in the given
+    range on the training set, i.e. between zero and one.
 
     The standardization is given by::
@@ -221,6 +221,11 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
         Set to False to perform inplace row normalization and avoid a
         copy (if the input is already a numpy array).
 
+    per_feature : boolean, optional, default True
+        Set to False to scale features based on the maximum and minimum
+        values over the entire input array, rather than the maximum
+        and minimum values by feature.
+
     Attributes
     ----------
     min_ : ndarray, shape (n_features,)
@@ -230,9 +235,10 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
         Per feature relative scaling of the data.
     """
 
-    def __init__(self, feature_range=(0, 1), copy=True):
+    def __init__(self, feature_range=(0, 1), copy=True, per_feature=True):
         self.feature_range = feature_range
         self.copy = copy
+        self.per_feature = per_feature
 
     def fit(self, X, y=None):
         """Compute the minimum and maximum to be used for later scaling.
@@ -249,8 +255,9 @@ def fit(self, X, y=None):
         if feature_range[0] >= feature_range[1]:
             raise ValueError("Minimum of desired feature range must be smaller"
                              " than maximum. Got %s." % str(feature_range))
-        data_min = np.min(X, axis=0)
-        data_range = np.max(X, axis=0) - data_min
+        axis = 0 if self.per_feature else None
+        data_min = np.min(X, axis=axis)
+        data_range = np.max(X, axis=axis) - data_min
         data_range = _handle_zeros_in_scale(data_range)
         self.scale_ = (feature_range[1] - feature_range[0]) / data_range
         self.min_ = feature_range[0] - data_min * self.scale_
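
With per_feature=False the same arithmetic runs on scalars. A sketch of what fit would then compute (this option exists only in this patch, not in released scikit-learn):

    import numpy as np

    X = np.array([[1., 10.], [2., 20.], [3., 30.]])
    data_min = np.min(X, axis=None)               # 1.0, a scalar
    data_range = np.max(X, axis=None) - data_min  # 29.0
    scale_ = (1 - 0) / data_range                 # feature_range == (0, 1)
    min_ = 0 - data_min * scale_
    X_scaled = X * scale_ + min_                  # every entry mapped into [0, 1]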
@@ -292,10 +299,10 @@ def inverse_transform(self, X):
 class StandardScaler(BaseEstimator, TransformerMixin):
     """Standardize features by removing the mean and scaling to unit variance
 
-    Centering and scaling happen independently on each feature by computing
-    the relevant statistics on the samples in the training set. Mean and
-    standard deviation are then stored to be used on later data using the
-    `transform` method.
+    Centering and scaling happen independently on each feature (unless
+    initialized with "per_feature=False") by computing the relevant statistics
+    on the samples in the training set. Mean and standard deviation are
+    then stored to be used on later data using the `transform` method.
 
     Standardization of a dataset is a common requirement for many
     machine learning estimators: they might behave badly if the
@@ -331,13 +338,17 @@ class StandardScaler(BaseEstimator, TransformerMixin):
         not a NumPy array or scipy.sparse CSR matrix, a copy may still be
         returned.
 
+    per_feature : boolean, optional, default True
+        Set to False to scale features based on the mean and variance taken
+        over the entire input array, rather than per feature.
+
     Attributes
     ----------
-    mean_ : array of floats with shape [n_features]
+    mean_ : array of floats with shape [n_features] or scalar float
         The mean value for each feature in the training set.
 
-    std_ : array of floats with shape [n_features]
-        The standard deviation for each feature in the training set.
+    std_ : array of floats with shape [n_features] or scalar float
+        The standard deviation for each feature in the training set.
         Set to one if the standard deviation is zero for a given feature.
 
     See also
@@ -349,10 +360,12 @@ class StandardScaler(BaseEstimator, TransformerMixin):
         to further remove the linear correlation across features.
     """
 
-    def __init__(self, copy=True, with_mean=True, with_std=True):
+    def __init__(self, copy=True, with_mean=True, with_std=True,
+                 per_feature=True):
         self.with_mean = with_mean
         self.with_std = with_std
         self.copy = copy
+        self.per_feature = per_feature
 
     def fit(self, X, y=None):
         """Compute the mean and std to be used for later scaling.
@@ -366,6 +379,7 @@ def fit(self, X, y=None):
         X = check_array(X, accept_sparse='csr', copy=self.copy,
                         ensure_2d=False, warn_on_dtype=True,
                         estimator=self, dtype=FLOAT_DTYPES)
+        axis = 0 if self.per_feature else None
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -374,15 +388,15 @@ def fit(self, X, y=None):
             self.mean_ = None
 
             if self.with_std:
-                var = mean_variance_axis(X, axis=0)[1]
+                var = mean_variance_axis(X, axis=axis)[1]
                 self.std_ = np.sqrt(var)
                 self.std_ = _handle_zeros_in_scale(self.std_)
             else:
                 self.std_ = None
             return self
         else:
             self.mean_, self.std_ = _mean_and_std(
-                X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
+                X, axis=axis, with_mean=self.with_mean, with_std=self.with_std)
             return self
 
     def transform(self, X, y=None, copy=None):
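
Putting the pieces together, a hedged usage sketch of the new option (per_feature is introduced by this commit and is not in any scikit-learn release):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.array([[1., 100.], [2., 200.], [3., 300.]])
    scaler = StandardScaler(per_feature=False).fit(X)
    # mean_ and std_ are now scalars: X.mean() and X.std()
    X_scaled = scaler.transform(X)  # (X - X.mean()) / X.std(), elementwise

Unlike the default, the two columns keep their relative scales; only the global location and spread change.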
@@ -450,10 +464,11 @@ def inverse_transform(self, X, copy=None):
 class MaxAbsScaler(BaseEstimator, TransformerMixin):
     """Scale each feature by its maximum absolute value.
 
-    This estimator scales and translates each feature individually such
-    that the maximal absolute value of each feature in the
-    training set will be 1.0. It does not shift/center the data, and
-    thus does not destroy any sparsity.
+    This estimator scales and translates each feature individually
+    (unless "per_feature=False", in which case all features will be
+    considered together) such that the maximal absolute value of each
+    feature in the training set will be 1.0. It does not shift/center
+    the data, and thus does not destroy any sparsity.
 
     This scaler can also be applied to sparse CSR or CSC matrices.
@@ -463,14 +478,19 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin):
         Set to False to perform inplace scaling and avoid a copy (if the input
         is already a numpy array).
 
+    per_feature : boolean, optional, default True
+        Set to False to scale features based on the maximum absolute value
+        in the entire input array, rather than per feature.
+
     Attributes
     ----------
-    scale_ : ndarray, shape (n_features,)
+    scale_ : ndarray, shape (n_features,) or scalar float
         Per feature relative scaling of the data.
     """
 
-    def __init__(self, copy=True):
+    def __init__(self, copy=True, per_feature=True):
         self.copy = copy
+        self.per_feature = per_feature
 
     def fit(self, X, y=None):
         """Compute the minimum and maximum to be used for later scaling.
@@ -483,13 +503,15 @@ def fit(self, X, y=None):
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                         ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
+        axis = 0 if self.per_feature else None
         if sparse.issparse(X):
-            mins, maxs = min_max_axis(X, axis=0)
+            mins, maxs = min_max_axis(X, axis=axis)
             scales = np.maximum(np.abs(mins), np.abs(maxs))
         else:
-            scales = np.abs(X).max(axis=0)
-            scales = np.array(scales)
-            scales = scales.reshape(-1)
+            scales = np.abs(X).max(axis=axis)
+            if self.per_feature:
+                scales = np.array(scales)
+                scales = scales.reshape(-1)
         self.scale_ = _handle_zeros_in_scale(scales)
         return self
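
The new guard matters because `np.abs(X).max(axis=None)` returns a scalar, and the old unconditional `reshape(-1)` would promote it to a one-element array. A quick illustration:

    import numpy as np

    X = np.array([[-3., 1.], [2., -5.]])
    np.abs(X).max(axis=0)     # per-feature: array([3., 5.])
    np.abs(X).max(axis=None)  # global: 5.0 -- a scalar, left unreshaped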

@@ -545,19 +567,21 @@ def maxabs_scale(X, axis=0, copy=True):
 
     Parameters
     ----------
-    axis : int (0 by default)
+    axis : int or None (0 by default)
         axis used to scale along. If 0, independently scale each feature,
-        otherwise (if 1) scale each sample.
+        if 1 scale each sample, and if None scale all of the data together.
 
     copy : boolean, optional, default is True
         Set to False to perform inplace scaling and avoid a copy (if the input
         is already a numpy array).
     """
-    s = MaxAbsScaler(copy=copy)
-    if axis == 0:
+    s = MaxAbsScaler(copy=copy, per_feature=(axis is not None))
+    if axis == 0 or axis is None:
         return s.fit_transform(X)
-    else:
+    elif axis == 1:
         return s.fit_transform(X.T).T
+    else:
+        raise ValueError('"axis" must be 0, 1, or None.')
 
 
 class RobustScaler(BaseEstimator, TransformerMixin):
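
A usage sketch of the widened axis argument for the function form (again, patch-only behavior):

    import numpy as np
    from sklearn.preprocessing import maxabs_scale

    X = np.array([[-2., 4.], [1., -8.]])
    maxabs_scale(X, axis=None)  # divides everything by 8.0, the global max-abs
    maxabs_scale(X, axis=2)     # now raises ValueError; previously any non-zero
                                # axis silently took the per-sample branch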
@@ -567,8 +591,8 @@ class RobustScaler(BaseEstimator, TransformerMixin):
     the Interquartile Range (IQR). The IQR is the range between the 1st
     quartile (25th quantile) and the 3rd quartile (75th quantile).
 
-    Centering and scaling happen independently on each feature (or each
-    sample, depending on the `axis` argument) by computing the relevant
+    Centering and scaling happen independently on each feature (unless
+    initialized with "per_feature=False") by computing the relevant
     statistics on the samples in the training set. Median and interquartile
     range are then stored to be used on later data using the `transform`
     method.
@@ -599,12 +623,16 @@ class RobustScaler(BaseEstimator, TransformerMixin):
         not a NumPy array or scipy.sparse CSR matrix, a copy may still be
         returned.
 
+    per_feature : boolean, optional, default True
+        Set to False to scale features based on values over the entire input
+        array, rather than by feature.
+
     Attributes
     ----------
-    center_ : array of floats
+    center_ : array of floats or scalar float
         The median value for each feature in the training set.
 
-    scale_ : array of floats
+    scale_ : array of floats or scalar float
         The (scaled) interquartile range for each feature in the training set.
 
     See also
@@ -623,10 +651,12 @@ class RobustScaler(BaseEstimator, TransformerMixin):
     http://en.wikipedia.org/wiki/Interquartile_range
     """
 
-    def __init__(self, with_centering=True, with_scaling=True, copy=True):
+    def __init__(self, with_centering=True, with_scaling=True,
+                 copy=True, per_feature=True):
         self.with_centering = with_centering
         self.with_scaling = with_scaling
         self.copy = copy
+        self.per_feature = per_feature
 
     def _check_array(self, X, copy):
         """Makes sure centering is not enabled for sparse matrices."""
@@ -652,11 +682,12 @@ def fit(self, X, y=None):
             raise TypeError("RobustScaler cannot be fitted on sparse inputs")
 
         X = self._check_array(X, self.copy)
+        axis = 0 if self.per_feature else None
         if self.with_centering:
-            self.center_ = np.median(X, axis=0)
+            self.center_ = np.median(X, axis=axis)
 
         if self.with_scaling:
-            q = np.percentile(X, (25, 75), axis=0)
+            q = np.percentile(X, (25, 75), axis=axis)
             self.scale_ = (q[1] - q[0])
             self.scale_ = _handle_zeros_in_scale(self.scale_)
         return self
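
In the per_feature=False mode the fit reduces to two scalars; a sketch in plain NumPy (illustrative only):

    import numpy as np

    X = np.array([[1., 10.], [2., 20.], [3., 30.], [4., 40.]])
    center_ = np.median(X, axis=None)                 # one median for the array
    q25, q75 = np.percentile(X, (25, 75), axis=None)  # global quartiles
    scale_ = q75 - q25                                # one IQR
    X_scaled = (X - center_) / scale_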
@@ -727,10 +758,10 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True, copy=True):
     X : array-like.
         The data to center and scale.
 
-    axis : int (0 by default)
+    axis : int or None (0 by default)
         axis used to compute the medians and IQR along. If 0,
-        independently scale each feature, otherwise (if 1) scale
-        each sample.
+        independently scale each feature, if 1 scale each sample,
+        and if None scale all of the data together.
 
     with_centering : boolean, True by default
         If True, center the data before scaling.
@@ -764,11 +795,13 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True, copy=True):
         :class:`sklearn.pipeline.Pipeline`)
     """
     s = RobustScaler(with_centering=with_centering, with_scaling=with_scaling,
-                     copy=copy)
-    if axis == 0:
+                     copy=copy, per_feature=(axis is not None))
+    if axis == 0 or axis is None:
         return s.fit_transform(X)
-    else:
+    elif axis == 1:
         return s.fit_transform(X.T).T
+    else:
+        raise ValueError('"axis" must be 0, 1, or None.')
 
 
 class PolynomialFeatures(BaseEstimator, TransformerMixin):
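
And the corresponding one-shot helper, under this patch's semantics:

    import numpy as np
    from sklearn.preprocessing import robust_scale

    X = np.array([[1., 100.], [2., 200.], [3., 300.]])
    robust_scale(X, axis=None)  # one median/IQR shared by the whole array
    # any axis other than 0, 1, or None now raises ValueError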
