[MRG+1-1] Refactoring and expanding sklearn.preprocessing scaling #2514

Closed · wants to merge 7 commits
118 changes: 85 additions & 33 deletions doc/modules/preprocessing.rst
@@ -72,13 +72,14 @@ This class is hence suitable for use in the early steps of a
:class:`sklearn.pipeline.Pipeline`::

>>> scaler = preprocessing.StandardScaler().fit(X)
>>> scaler
StandardScaler(copy=True, with_mean=True, with_std=True)
>>> scaler # doctest: +NORMALIZE_WHITESPACE
StandardScaler(axis=0, copy=True, with_centering=True, with_mean=None,
with_scaling=True, with_std=None)

>>> scaler.mean_ # doctest: +ELLIPSIS
>>> scaler.center_ # doctest: +ELLIPSIS
array([ 1. ..., 0. ..., 0.33...])

>>> scaler.std_ # doctest: +ELLIPSIS
>>> scaler.scale_ # doctest: +ELLIPSIS
array([ 0.81..., 0.81..., 1.24...])

>>> scaler.transform(X) # doctest: +ELLIPSIS
@@ -94,17 +95,19 @@ same way it did on the training set::
array([[-2.44..., 1.22..., -0.26...]])

It is possible to disable either centering or scaling by either
passing ``with_mean=False`` or ``with_std=False`` to the constructor
passing ``with_centering=False`` or ``with_scaling=False`` to the constructor
of :class:`StandardScaler`.
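
For example, a minimal sketch (hedged: it uses the ``with_centering`` parameter
name proposed in this branch; the previous API spelled it ``with_mean``) that
rescales to unit variance without centering the data::

    >>> scaler = preprocessing.StandardScaler(with_centering=False)
    >>> X_scaled = scaler.fit_transform(X)
    >>> # the columns of X_scaled now have unit variance, but they are not
    >>> # centered, so their means are in general non-zero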


Scaling features to a range
---------------------------
An alternative standardization is scaling features to
lie between a given minimum and maximum value, often between zero and one.
This can be achieved using :class:`MinMaxScaler`.
lie between a given minimum and maximum value, such as between zero and one, or
so that the maximum value of each feature is scaled to unit size.
This can be achieved using :class:`MinMaxScaler` or :class:`MaxAbsScaler`,
respectively.

The motivation to use this scaling include robustness to very small
The motivation to use such a scaling includes robustness to very small
standard deviations of features and preserving zero entries in sparse data.

Here is an example to scale a toy data matrix to the ``[0, 1]`` range::
@@ -132,11 +135,8 @@ applied to be consistent with the transformation performed on the train data::
It is possible to introspect the scaler attributes to find out about the exact
nature of the transformation learned on the training data::

>>> min_max_scaler.scale_ # doctest: +ELLIPSIS
array([ 0.5 , 0.5 , 0.33...])

>>> min_max_scaler.min_ # doctest: +ELLIPSIS
array([ 0. , 0.5 , 0.33...])
>>> min_max_scaler.scale_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
array([ 2., 2., 3.])

If :class:`MinMaxScaler` is given an explicit ``feature_range=(min, max)`` the
full formula is::
@@ -145,6 +145,76 @@ full formula is::

X_scaled = X_std * (max - min) + min
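
For illustration, a minimal sketch (reusing the toy matrix ``X_train`` from this
section) that maps every feature onto ``[-1, 1]`` instead of the default
``[0, 1]``::

    >>> min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    >>> X_train_minmax = min_max_scaler.fit_transform(X_train)
    >>> # after the transformation, each column has minimum -1 and maximum 1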

As with :func:`scale`, the ``preprocessing`` module further provides a
convenience function :func:`minmax_scale` if you don't want to use the
`Transformer` API.
Review comment (Member):

This makes me uncomfortable: it seems to suggest there is no benefit in using the Transformer API. Perhaps something more circumspect like "Where appropriate, the convenience function :func:minmax_scale can be used instead of a transformer."

In any case, you say "function function"

You also say

the `Transformer` API.
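
A minimal usage sketch of the convenience function (assuming it is exposed as
``sklearn.preprocessing.minmax_scale``, as this branch proposes)::

    >>> from sklearn.preprocessing import minmax_scale
    >>> X_train_minmax = minmax_scale(X_train)
    >>> # a single call with no scaler object to keep around; equivalent to
    >>> # MinMaxScaler().fit_transform(X_train) when there is no separate test set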

:class:`MaxAbsScaler` works in a very similar fashion, but divides each feature
by its maximum absolute value, so that the training data lies within the range
``[-1, 1]``. It is meant for data that is already centered at zero and, since it
involves no shifting of the data, it is particularly well suited for sparse data.

Here is how to use the toy data from the previous example with this scaler::

>>> X_train = np.array([[ 1., -1., 2.],
... [ 2., 0., 0.],
... [ 0., 1., -1.]])
...
>>> max_abs_scaler = preprocessing.MaxAbsScaler()
>>> X_train_maxabs = max_abs_scaler.fit_transform(X_train)
>>> X_train_maxabs # doctest: +NORMALIZE_WHITESPACE
array([[ 0.5, -1. , 1. ],
[ 1. , 0. , 0. ],
[ 0. , 1. , -0.5]])
>>> X_test = np.array([[ -3., -1., 4.]])
>>> X_test_maxabs = max_abs_scaler.transform(X_test)
>>> X_test_maxabs # doctest: +NORMALIZE_WHITESPACE
array([[-1.5, -1. , 2. ]])
>>> max_abs_scaler.scale_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
array([ 2., 1., 2.])


As with :func:`scale`, the ``preprocessing`` module further provides a
convenience function :func:`maxabs_scale` if you don't want to use the
`Transformer` API.
Review comment (Member):

Same here

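Likewise, a short sketch of the one-shot function (again assuming the
``sklearn.preprocessing.maxabs_scale`` name proposed in this branch)::

    >>> from sklearn.preprocessing import maxabs_scale
    >>> X_train_maxabs = maxabs_scale(X_train)
    >>> # same values as MaxAbsScaler().fit_transform(X_train) shown above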


Scaling sparse data
-------------------
Centering sparse data would destroy the sparseness structure in the data, and
is thus rarely a sensible thing to do. However, it can make sense to scale
sparse inputs, especially if features are on different scales.

:class:`MaxAbsScaler` and :func:`maxabs_scale` were specifically designed
for scaling sparse data, and are the recommended way to go about this.
However, :func:`scale` and :class:`StandardScaler` can accept ``scipy.sparse``
matrices as input, as long as ``with_centering=False`` is explicitly passed
to the constructor. Otherwise a ``ValueError`` will be raised as
silently centering would break the sparsity and would often crash the
execution by allocating excessive amounts of memory unintentionally.
:class:`RobustScaler` cannot be fitted to sparse inputs, but you can use its
``transform`` method on sparse inputs.
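
For example, a minimal sketch of scaling a CSR matrix with :class:`MaxAbsScaler`
(no centering takes place, so the result stays sparse)::

    >>> import scipy.sparse as sp
    >>> X_sparse = sp.csr_matrix(X_train)   # reuse the toy matrix from above
    >>> X_sparse_scaled = preprocessing.MaxAbsScaler().fit_transform(X_sparse)
    >>> sp.issparse(X_sparse_scaled)
    True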

Note that the scalers accept both Compressed Sparse Rows and Compressed
Sparse Columns format (see ``scipy.sparse.csr_matrix`` and
``scipy.sparse.csc_matrix``). Any other sparse input will be **converted to
the Compressed Sparse Rows representation**. To avoid unnecessary memory
copies, it is recommended to choose the CSR or CSC representation upstream.

Finally, if the centered data is expected to be small enough, explicitly
converting the input to an array using the ``toarray`` method of sparse matrices
is another option.


Scaling data with outliers
--------------------------
If your data contains many outliers, scaling using the mean and variance
of the data sometimes does not work very well. In these cases, you can use
:func:`robust_scale` and :class:`RobustScaler` as drop-in replacements
instead, which use more robust estimates for the center and range of your
data.
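
A hedged sketch of the robust variant (it relies on order statistics, typically
the median and the interquartile range, so a few extreme values have little
influence on the result)::

    >>> X_outliers = np.array([[  1.,  -1.,   2.],
    ...                        [  2.,   0.,   0.],
    ...                        [  0.,   1.,  -1.],
    ...                        [100., 100., 100.]])   # one obvious outlier row
    >>> robust_scaler = preprocessing.RobustScaler()
    >>> X_robust = robust_scaler.fit_transform(X_outliers)
    >>> # the first three rows end up in a narrow range around zero, while the
    >>> # outlier row is mapped far away from the rest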


.. topic:: References:

Further discussion on the importance of centering and scaling data is
@@ -161,26 +231,9 @@ full formula is::
or :class:`sklearn.decomposition.RandomizedPCA` with ``whiten=True``
to further remove the linear correlation across features.

.. topic:: Sparse input

:func:`scale` and :class:`StandardScaler` accept ``scipy.sparse`` matrices
as input **only when with_mean=False is explicitly passed to the
constructor**. Otherwise a ``ValueError`` will be raised as
silently centering would break the sparsity and would often crash the
execution by allocating excessive amounts of memory unintentionally.

If the centered data is expected to be small enough, explicitly convert
the input to an array using the ``toarray`` method of sparse matrices
instead.

For sparse input the data is **converted to the Compressed Sparse Rows
representation** (see ``scipy.sparse.csr_matrix``).
To avoid unnecessary memory copies, it is recommended to choose the CSR
representation upstream.

.. topic:: Scaling target variables in regression

:func:`scale` and :class:`StandardScaler` work out-of-the-box with 1d arrays.
All scaling functions and classes work out-of-the-box with 1d arrays.
This is very useful for scaling the target / response variables used
for regression.
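
For instance, a minimal sketch (hedged: assuming :func:`scale` with its default
arguments) of standardizing a 1d target vector::

    >>> y = np.array([1., 10., 100., 1000.])
    >>> y_scaled = preprocessing.scale(y)
    >>> # y_scaled has (approximately) zero mean and unit variance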

@@ -526,6 +579,5 @@ similarly.

Note that if features have very different scaling or statistical
properties, :class:`cluster.FeatureAgglomeration` may not be able to
capture the links between related features. Using a
:class:`preprocessing.StandardScaler` can be useful in these settings.

4 changes: 2 additions & 2 deletions sklearn/cluster/k_means_.py
@@ -20,7 +20,7 @@
from ..metrics.pairwise import euclidean_distances
from ..utils.extmath import row_norms
from ..utils.sparsefuncs_fast import assign_rows_csr
from ..utils.sparsefuncs import mean_variance_axis0
from ..utils.sparsefuncs import mean_variance_axis
from ..utils.fixes import astype
from ..utils import check_array
from ..utils import check_random_state
@@ -141,7 +141,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
def _tolerance(X, tol):
"""Return a tolerance which is independent of the dataset"""
if sp.issparse(X):
variances = mean_variance_axis0(X)[1]
variances = mean_variance_axis(X, axis=0)[1]
else:
variances = np.var(X, axis=0)
return np.mean(variances) * tol
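
For context, a small hedged sketch of the renamed helper used above; the
signature this diff relies on is ``mean_variance_axis(X, axis)``, returning the
per-feature means and variances of a sparse matrix::

    >>> import numpy as np
    >>> import scipy.sparse as sp
    >>> from sklearn.utils.sparsefuncs import mean_variance_axis
    >>> X = sp.csr_matrix(np.array([[1., 0., 2.], [0., 3., 0.]]))
    >>> means, variances = mean_variance_axis(X, axis=0)
    >>> # means -> [0.5, 1.5, 1.0], variances -> [0.25, 2.25, 1.0]
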
4 changes: 2 additions & 2 deletions sklearn/decomposition/truncated_svd.py
@@ -17,7 +17,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array, as_float_array, check_random_state
from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
from ..utils.sparsefuncs import mean_variance_axis0
from ..utils.sparsefuncs import mean_variance_axis

__all__ = ["TruncatedSVD"]

@@ -175,7 +175,7 @@ def fit_transform(self, X, y=None):
X_transformed = np.dot(U, np.diag(Sigma))
self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
if sp.issparse(X):
_, full_var = mean_variance_axis0(X)
_, full_var = mean_variance_axis(X, axis=0)
full_var = full_var.sum()
else:
full_var = np.var(X, axis=0).sum()
4 changes: 2 additions & 2 deletions sklearn/linear_model/base.py
@@ -27,7 +27,7 @@
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
from ..utils import as_float_array, check_array
from ..utils.extmath import safe_sparse_dot
from ..utils.sparsefuncs import mean_variance_axis0, inplace_column_scale
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale


###
@@ -55,7 +55,7 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
else:
X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)

X_mean, X_var = mean_variance_axis0(X)
X_mean, X_var = mean_variance_axis(X, axis=0)
if normalize:
# transform variance to std in-place
# XXX: currently scaled to variance=n_samples to match center_data
7 changes: 4 additions & 3 deletions sklearn/pipeline.py
@@ -237,10 +237,11 @@ def make_pipeline(*steps):
--------
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.pipeline import make_pipeline
>>> make_pipeline(StandardScaler(), GaussianNB()) # doctest: +NORMALIZE_WHITESPACE
Pipeline(steps=[('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('gaussiannb', GaussianNB())])
Pipeline(steps=[('standardscaler', StandardScaler(axis=0, copy=True,
with_centering=True, with_mean=None, with_scaling=True,
with_std=None)), ('gaussiannb', GaussianNB())])

Returns
-------
10 changes: 10 additions & 0 deletions sklearn/preprocessing/__init__.py
@@ -6,12 +6,17 @@
from .data import Binarizer
from .data import KernelCenterer
from .data import MinMaxScaler
from .data import MaxAbsScaler
from .data import Normalizer
from .data import StandardScaler
from .data import RobustScaler
from .data import add_dummy_feature
from .data import binarize
from .data import normalize
from .data import scale
from .data import robust_scale
from .data import minmax_scale
from .data import maxabs_scale
from .data import OneHotEncoder

from .data import PolynomialFeatures
@@ -31,13 +36,18 @@
'LabelEncoder',
'MultiLabelBinarizer',
'MinMaxScaler',
'MaxAbsScaler',
'Normalizer',
'OneHotEncoder',
'StandardScaler',
'RobustScaler',
'add_dummy_feature',
'PolynomialFeatures',
'binarize',
'normalize',
'scale',
'robust_scale',
'minmax_scale',
'maxabs_scale',
'label_binarize',
]
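
A quick hedged check that the names newly exported by this ``__init__.py`` resolve::

    >>> from sklearn.preprocessing import MaxAbsScaler, RobustScaler
    >>> from sklearn.preprocessing import maxabs_scale, minmax_scale, robust_scale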