From 78cfb870b5cea929d635da8708dd8b5c8523762a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 11:36:42 +0200 Subject: [PATCH 1/6] Imp splitting of preprocessing.py --- sklearn/preprocessing.py | 1776 --------------------------- sklearn/preprocessing/__init__.py | 36 + sklearn/preprocessing/data.py | 751 +++++++++++ sklearn/preprocessing/imputation.py | 414 +++++++ sklearn/preprocessing/label.py | 687 +++++++++++ 5 files changed, 1888 insertions(+), 1776 deletions(-) delete mode 100644 sklearn/preprocessing.py create mode 100644 sklearn/preprocessing/__init__.py create mode 100644 sklearn/preprocessing/data.py create mode 100644 sklearn/preprocessing/imputation.py create mode 100644 sklearn/preprocessing/label.py diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py deleted file mode 100644 index 4dafdd75c85b9..0000000000000 --- a/sklearn/preprocessing.py +++ /dev/null @@ -1,1776 +0,0 @@ -# Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel -# Andreas Mueller -# License: BSD 3 clause - -import warnings -import numbers -import math - -import numpy as np -import numpy.ma as ma -from scipy import sparse -from scipy import stats - -from .base import BaseEstimator, TransformerMixin -from .utils import check_arrays -from .utils import array2d -from .utils import as_float_array -from .utils import atleast2d_or_csr -from .utils import atleast2d_or_csc -from .utils import safe_asarray -from .utils import warn_if_not_float -from .utils.fixes import unique -from .utils import deprecated - -from .utils.multiclass import unique_labels -from .utils.multiclass import type_of_target - -from .utils.sparsefuncs import inplace_csr_row_normalize_l1 -from .utils.sparsefuncs import inplace_csr_row_normalize_l2 -from .utils.sparsefuncs import inplace_csr_column_scale -from .utils.sparsefuncs import mean_variance_axis0 -from .externals import six - -zip = six.moves.zip -map = six.moves.map - -__all__ = ['Binarizer', - 'Imputer', - 'KernelCenterer', - 'LabelBinarizer', - 'LabelEncoder', - 'MinMaxScaler', - 'Normalizer', - 'OneHotEncoder', - 'StandardScaler', - 'binarize', - 'normalize', - 'scale'] - - -def _mean_and_std(X, axis=0, with_mean=True, with_std=True): - """Compute mean and std deviation for centering, scaling. - - Zero valued std components are reset to 1.0 to avoid NaNs when scaling. - """ - X = np.asarray(X) - Xr = np.rollaxis(X, axis) - - if with_mean: - mean_ = Xr.mean(axis=0) - else: - mean_ = None - - if with_std: - std_ = Xr.std(axis=0) - if isinstance(std_, np.ndarray): - std_[std_ == 0.0] = 1.0 - elif std_ == 0.: - std_ = 1. - else: - std_ = None - - return mean_, std_ - - -def scale(X, axis=0, with_mean=True, with_std=True, copy=True): - """Standardize a dataset along any axis - - Center to the mean and component wise scale to unit variance. - - Parameters - ---------- - X : array-like or CSR matrix. - The data to center and scale. - - axis : int (0 by default) - axis used to compute the means and standard deviations along. If 0, - independently standardize each feature, otherwise (if 1) standardize - each sample. - - with_mean : boolean, True by default - If True, center the data before scaling. - - with_std : boolean, True by default - If True, scale the data to unit variance (or equivalently, - unit standard deviation). - - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). 
- - Notes - ----- - This implementation will refuse to center scipy.sparse matrices - since it would make them non-sparse and would potentially crash the - program with memory exhaustion problems. - - Instead the caller is expected to either set explicitly - `with_mean=False` (in that case, only variance scaling will be - performed on the features of the CSR matrix) or to call `X.toarray()` - if he/she expects the materialized dense array to fit in memory. - - To avoid memory copy the caller should pass a CSR matrix. - - See also - -------- - :class:`sklearn.preprocessing.StandardScaler` to perform centering and - scaling using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - if sparse.issparse(X): - if with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` instead" - " See docstring for motivation and alternatives.") - if axis != 0: - raise ValueError("Can only scale sparse matrix on axis=0, " - " got axis=%d" % axis) - warn_if_not_float(X, estimator='The scale function') - if not sparse.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() - _, var = mean_variance_axis0(X) - var[var == 0.0] = 1.0 - inplace_csr_column_scale(X, 1 / np.sqrt(var)) - else: - X = np.asarray(X) - warn_if_not_float(X, estimator='The scale function') - mean_, std_ = _mean_and_std( - X, axis, with_mean=with_mean, with_std=with_std) - if copy: - X = X.copy() - # Xr is a view on the original array that enables easy use of - # broadcasting on the axis in which we are interested in - Xr = np.rollaxis(X, axis) - if with_mean: - Xr -= mean_ - if with_std: - Xr /= std_ - return X - - -class MinMaxScaler(BaseEstimator, TransformerMixin): - """Standardizes features by scaling each feature to a given range. - - This estimator scales and translates each feature individually such - that it is in the given range on the training set, i.e. between - zero and one. - - The standardization is given by:: - X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) - X_scaled = X_std * (max - min) + min - - where min, max = feature_range. - - This standardization is often used as an alternative to zero mean, - unit variance scaling. - - Parameters - ---------- - feature_range: tuple (min, max), default=(0, 1) - Desired range of transformed data. - - copy : boolean, optional, default is True - Set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array). - - Attributes - ---------- - `min_` : ndarray, shape (n_features,) - Per feature adjustment for minimum. - - `scale_` : ndarray, shape (n_features,) - Per feature relative scaling of the data. - """ - - def __init__(self, feature_range=(0, 1), copy=True): - self.feature_range = feature_range - self.copy = copy - - def fit(self, X, y=None): - """Compute the minimum and maximum to be used for later scaling. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data used to compute the per-feature minimum and maximum - used for later scaling along the features axis. - """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - warn_if_not_float(X, estimator=self) - feature_range = self.feature_range - if feature_range[0] >= feature_range[1]: - raise ValueError("Minimum of desired feature range must be smaller" - " than maximum. Got %s." 
% str(feature_range)) - data_min = np.min(X, axis=0) - data_range = np.max(X, axis=0) - data_min - # Do not scale constant features - data_range[data_range == 0.0] = 1.0 - self.scale_ = (feature_range[1] - feature_range[0]) / data_range - self.min_ = feature_range[0] - data_min * self.scale_ - self.data_range = data_range - self.data_min = data_min - return self - - def transform(self, X): - """Scaling features of X according to feature_range. - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - Input data that will be transformed. - """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - X *= self.scale_ - X += self.min_ - return X - - def inverse_transform(self, X): - """Undo the scaling of X according to feature_range. - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - Input data that will be transformed. - """ - X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] - X -= self.min_ - X /= self.scale_ - return X - - -class StandardScaler(BaseEstimator, TransformerMixin): - """Standardize features by removing the mean and scaling to unit variance - - Centering and scaling happen independently on each feature by computing - the relevant statistics on the samples in the training set. Mean and - standard deviation are then stored to be used on later data using the - `transform` method. - - Standardization of a dataset is a common requirement for many - machine learning estimators: they might behave badly if the - individual feature do not more or less look like standard normally - distributed data (e.g. Gaussian with 0 mean and unit variance). - - For instance many elements used in the objective function of - a learning algorithm (such as the RBF kernel of Support Vector - Machines or the L1 and L2 regularizers of linear models) assume that - all features are centered around 0 and have variance in the same - order. If a feature has a variance that is orders of magnitude larger - that others, it might dominate the objective function and make the - estimator unable to learn from other features correctly as expected. - - Parameters - ---------- - with_mean : boolean, True by default - If True, center the data before scaling. - This does not work (and will raise an exception) when attempted on - sparse matrices, because centering them entails building a dense - matrix which in common use cases is likely to be too large to fit in - memory. - - with_std : boolean, True by default - If True, scale the data to unit variance (or equivalently, - unit standard deviation). - - copy : boolean, optional, default is True - If False, try to avoid a copy and do inplace scaling instead. - This is not guaranteed to always work inplace; e.g. if the data is - not a NumPy array or scipy.sparse CSR matrix, a copy may still be - returned. - - Attributes - ---------- - `mean_` : array of floats with shape [n_features] - The mean value for each feature in the training set. - - `std_` : array of floats with shape [n_features] - The standard deviation for each feature in the training set. - - See also - -------- - :func:`sklearn.preprocessing.scale` to perform centering and - scaling without using the ``Transformer`` object oriented API - - :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True` - to further remove the linear correlation across features. 
- """ - - def __init__(self, copy=True, with_mean=True, with_std=True): - self.with_mean = with_mean - self.with_std = with_std - self.copy = copy - - def fit(self, X, y=None): - """Compute the mean and std to be used for later scaling. - - Parameters - ---------- - X : array-like or CSR matrix with shape [n_samples, n_features] - The data used to compute the mean and standard deviation - used for later scaling along the features axis. - """ - X = check_arrays(X, copy=self.copy, sparse_format="csr")[0] - if sparse.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` " - "instead. See docstring for motivation and alternatives.") - warn_if_not_float(X, estimator=self) - self.mean_ = None - - if self.with_std: - var = mean_variance_axis0(X)[1] - self.std_ = np.sqrt(var) - self.std_[var == 0.0] = 1.0 - else: - self.std_ = None - return self - else: - warn_if_not_float(X, estimator=self) - self.mean_, self.std_ = _mean_and_std( - X, axis=0, with_mean=self.with_mean, with_std=self.with_std) - return self - - def transform(self, X, y=None, copy=None): - """Perform standardization by centering and scaling - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - The data used to scale along the features axis. - """ - copy = copy if copy is not None else self.copy - X = check_arrays(X, copy=copy, sparse_format="csr")[0] - if sparse.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot center sparse matrices: pass `with_mean=False` " - "instead See docstring for motivation and alternatives.") - if self.std_ is not None: - warn_if_not_float(X, estimator=self) - inplace_csr_column_scale(X, 1 / self.std_) - else: - warn_if_not_float(X, estimator=self) - if self.with_mean: - X -= self.mean_ - if self.with_std: - X /= self.std_ - return X - - def inverse_transform(self, X, copy=None): - """Scale back the data to the original representation - - Parameters - ---------- - X : array-like with shape [n_samples, n_features] - The data used to scale along the features axis. - """ - copy = copy if copy is not None else self.copy - if sparse.issparse(X): - if self.with_mean: - raise ValueError( - "Cannot uncenter sparse matrices: pass `with_mean=False` " - "instead See docstring for motivation and alternatives.") - if not sparse.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() - if self.std_ is not None: - inplace_csr_column_scale(X, self.std_) - else: - X = np.asarray(X) - if copy: - X = X.copy() - if self.with_std: - X *= self.std_ - if self.with_mean: - X += self.mean_ - return X - - -class Scaler(StandardScaler): - def __init__(self, copy=True, with_mean=True, with_std=True): - warnings.warn("Scaler was renamed to StandardScaler. The old name " - " will be removed in 0.15.", DeprecationWarning) - super(Scaler, self).__init__(copy, with_mean, with_std) - - -def normalize(X, norm='l2', axis=1, copy=True): - """Normalize a dataset along any axis - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to normalize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. - - norm : 'l1' or 'l2', optional ('l2' by default) - The norm to use to normalize each non zero sample (or each non-zero - feature if axis is 0). - - axis : 0 or 1, optional (1 by default) - axis used to normalize the data along. If 1, independently normalize - each sample, otherwise (if 0) normalize each feature. 
- - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix and if axis is 1). - - See also - -------- - :class:`sklearn.preprocessing.Normalizer` to perform normalization - using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - if norm not in ('l1', 'l2'): - raise ValueError("'%s' is not a supported norm" % norm) - - if axis == 0: - sparse_format = 'csc' - elif axis == 1: - sparse_format = 'csr' - else: - raise ValueError("'%d' is not a supported axis" % axis) - - X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] - warn_if_not_float(X, 'The normalize function') - if axis == 0: - X = X.T - - if sparse.issparse(X): - if norm == 'l1': - inplace_csr_row_normalize_l1(X) - elif norm == 'l2': - inplace_csr_row_normalize_l2(X) - else: - if norm == 'l1': - norms = np.abs(X).sum(axis=1)[:, np.newaxis] - norms[norms == 0.0] = 1.0 - elif norm == 'l2': - norms = np.sqrt(np.sum(X ** 2, axis=1))[:, np.newaxis] - norms[norms == 0.0] = 1.0 - X /= norms - - if axis == 0: - X = X.T - - return X - - -class Normalizer(BaseEstimator, TransformerMixin): - """Normalize samples individually to unit norm - - Each sample (i.e. each row of the data matrix) with at least one - non zero component is rescaled independently of other samples so - that its norm (l1 or l2) equals one. - - This transformer is able to work both with dense numpy arrays and - scipy.sparse matrix (use CSR format if you want to avoid the burden of - a copy / conversion). - - Scaling inputs to unit norms is a common operation for text - classification or clustering for instance. For instance the dot - product of two l2-normalized TF-IDF vectors is the cosine similarity - of the vectors and is the base similarity metric for the Vector - Space Model commonly used by the Information Retrieval community. - - Parameters - ---------- - norm : 'l1' or 'l2', optional ('l2' by default) - The norm to use to normalize each non zero sample. - - copy : boolean, optional, default is True - set to False to perform inplace row normalization and avoid a - copy (if the input is already a numpy array or a scipy.sparse - CSR matrix). - - Notes - ----- - This estimator is stateless (besides constructor parameters), the - fit method does nothing but is useful when used in a pipeline. - - See also - -------- - :func:`sklearn.preprocessing.normalize` equivalent function - without the object oriented API - """ - - def __init__(self, norm='l2', copy=True): - self.norm = norm - self.copy = copy - - def fit(self, X, y=None): - """Do nothing and return the estimator unchanged - - This method is just there to implement the usual API and hence - work in pipelines. - """ - atleast2d_or_csr(X) - return self - - def transform(self, X, y=None, copy=None): - """Scale each non zero row of X to unit norm - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to normalize, row by row. scipy.sparse matrices should be - in CSR format to avoid an un-necessary copy. - """ - copy = copy if copy is not None else self.copy - atleast2d_or_csr(X) - return normalize(X, norm=self.norm, axis=1, copy=copy) - - -def binarize(X, threshold=0.0, copy=True): - """Boolean thresholding of array-like or scipy.sparse matrix - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to binarize, element by element. 
- scipy.sparse matrices should be in CSR or CSC format to avoid an - un-necessary copy. - - threshold : float, optional (0.0 by default) - Feature values below or equal to this are replaced by 0, above it by 1. - Threshold may not be less than 0 for operations on sparse matrices. - - copy : boolean, optional, default is True - set to False to perform inplace binarization and avoid a copy - (if the input is already a numpy array or a scipy.sparse CSR / CSC - matrix and if axis is 1). - - See also - -------- - :class:`sklearn.preprocessing.Binarizer` to perform binarization - using the ``Transformer`` API (e.g. as part of a preprocessing - :class:`sklearn.pipeline.Pipeline`) - """ - sparse_format = "csr" # We force sparse format to be either csr or csc. - if hasattr(X, "format"): - if X.format in ["csr", "csc"]: - sparse_format = X.format - - X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] - if sparse.issparse(X): - if threshold < 0: - raise ValueError('Cannot binarize a sparse matrix with threshold ' - '< 0') - cond = X.data > threshold - not_cond = np.logical_not(cond) - X.data[cond] = 1 - X.data[not_cond] = 0 - X.eliminate_zeros() - else: - cond = X > threshold - not_cond = np.logical_not(cond) - X[cond] = 1 - X[not_cond] = 0 - return X - - -class Binarizer(BaseEstimator, TransformerMixin): - """Binarize data (set feature values to 0 or 1) according to a threshold - - Values greater than the threshold map to 1, while values less than - or equal to the threshold map to 0. With the default threshold of 0, - only positive values map to 1. - - Binarization is a common operation on text count data where the - analyst can decide to only consider the presence or absence of a - feature rather than a quantified number of occurrences for instance. - - It can also be used as a pre-processing step for estimators that - consider boolean random variables (e.g. modelled using the Bernoulli - distribution in a Bayesian setting). - - Parameters - ---------- - threshold : float, optional (0.0 by default) - Feature values below or equal to this are replaced by 0, above it by 1. - Threshold may not be less than 0 for operations on sparse matrices. - - copy : boolean, optional, default is True - set to False to perform inplace binarization and avoid a copy (if - the input is already a numpy array or a scipy.sparse CSR matrix). - - Notes - ----- - If the input is a sparse matrix, only the non-zero values are subject - to update by the Binarizer class. - - This estimator is stateless (besides constructor parameters), the - fit method does nothing but is useful when used in a pipeline. - """ - - def __init__(self, threshold=0.0, copy=True): - self.threshold = threshold - self.copy = copy - - def fit(self, X, y=None): - """Do nothing and return the estimator unchanged - - This method is just there to implement the usual API and hence - work in pipelines. - """ - atleast2d_or_csr(X) - return self - - def transform(self, X, y=None, copy=None): - """Binarize each element of X - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - The data to binarize, element by element. - scipy.sparse matrices should be in CSR format to avoid an - un-necessary copy. 
- """ - copy = copy if copy is not None else self.copy - return binarize(X, threshold=self.threshold, copy=copy) - - -def _transform_selected(X, transform, selected="all", copy=True): - """Apply a transform function to portion of selected features - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Dense array or sparse matrix. - - transform : callable - A callable transform(X) -> X_transformed - - copy : boolean, optional - Copy X even if it could be avoided. - - selected: "all" or array of indices or mask - Specify which features to apply the transform to. - - Returns - ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) - """ - if selected == "all": - return transform(X) - - X = atleast2d_or_csc(X, copy=copy) - - if len(selected) == 0: - return X - - n_features = X.shape[1] - ind = np.arange(n_features) - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(selected)] = True - not_sel = np.logical_not(sel) - n_selected = np.sum(sel) - - if n_selected == 0: - # No features selected. - return X - elif n_selected == n_features: - # All features selected. - return transform(X) - else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]] - - if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): - return sparse.hstack((X_sel, X_not_sel)) - else: - return np.hstack((X_sel, X_not_sel)) - - -class OneHotEncoder(BaseEstimator, TransformerMixin): - """Encode categorical integer features using a one-hot aka one-of-K scheme. - - The input to this transformer should be a matrix of integers, denoting - the values taken on by categorical (discrete) features. The output will be - a sparse matrix were each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range - [0, n_values). - - This encoding is needed for feeding categorical data to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. - - Parameters - ---------- - n_values : 'auto', int or array of ints - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : maximum value for all features. - - array : maximum value per feature. - - categorical_features: "all" or array of indices or mask - Specify what features are treated as categorical. - - - 'all' (default): All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - dtype : number type, default=np.float - Desired dtype of output. - - Attributes - ---------- - `active_features_` : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - `feature_indices_` : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) - - `n_values_` : array of shape (n_features,) - Maximum number of values per feature. - - Examples - -------- - Given a dataset with three features and two samples, we let the encoder - find the maximum value per feature and transform the data to a binary - one-hot encoding. 
- - >>> from sklearn.preprocessing import OneHotEncoder - >>> enc = OneHotEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'float'>, - n_values='auto') - >>> enc.n_values_ - array([2, 3, 4]) - >>> enc.feature_indices_ - array([0, 2, 5, 9]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) - - See also - -------- - sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot - encoding of dictionary items or strings. - """ - def __init__(self, n_values="auto", categorical_features="all", - dtype=np.float): - self.n_values = n_values - self.categorical_features = categorical_features - self.dtype = dtype - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_feature) - Input array of type int. - - Returns - ------- - self - """ - self.fit_transform(X) - return self - - def _fit_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - if self.n_values == 'auto': - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - self.n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self.feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if self.n_values == 'auto': - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self.active_features_ = active_features - - return out - - def fit_transform(self, X, y=None): - """Fit OneHotEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. - """ - return _transform_selected(X, self._fit_transform, - self.categorical_features, copy=True) - - def _transform(self, X): - """Asssumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - - indices = self.feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." 
- % (indices.shape[0] - 1, n_features)) - - n_values_check = np.max(X, axis=0) + 1 - if (n_values_check > self.n_values_).any(): - raise ValueError("Feature out of bounds. Try setting n_values.") - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if self.n_values == 'auto': - out = out[:, self.active_features_] - return out - - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - Input array of type int. - - Returns - ------- - X_out : sparse matrix, dtype=int - Transformed input. - """ - return _transform_selected(X, self._transform, - self.categorical_features, copy=True) - - -class LabelEncoder(BaseEstimator, TransformerMixin): - """Encode labels with value between 0 and n_classes-1. - - Attributes - ---------- - `classes_`: array of shape [n_class] - Holds the label for each class. - - Examples - -------- - `LabelEncoder` can be used to normalize labels. - - >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() - >>> le.fit([1, 2, 2, 6]) - LabelEncoder() - >>> le.classes_ - array([1, 2, 6]) - >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS - array([0, 0, 1, 2]...) - >>> le.inverse_transform([0, 0, 1, 2]) - array([1, 1, 2, 6]) - - It can also be used to transform non-numerical labels (as long as they are - hashable and comparable) to numerical labels. - - >>> le = preprocessing.LabelEncoder() - >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() - >>> list(le.classes_) - ['amsterdam', 'paris', 'tokyo'] - >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS - array([2, 2, 1]...) - >>> list(le.inverse_transform([2, 2, 1])) - ['tokyo', 'tokyo', 'paris'] - - """ - - def _check_fitted(self): - if not hasattr(self, "classes_"): - raise ValueError("LabelNormalizer was not fitted yet.") - - def fit(self, y): - """Fit label encoder - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. - - Returns - ------- - self : returns an instance of self. - """ - self.classes_ = np.unique(y) - return self - - def fit_transform(self, y): - """Fit label encoder and return encoded labels - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. - - Returns - ------- - y : array-like of shape [n_samples] - """ - self.classes_, y = unique(y, return_inverse=True) - return y - - def transform(self, y): - """Transform labels to normalized encoding. - - Parameters - ---------- - y : array-like of shape [n_samples] - Target values. - - Returns - ------- - y : array-like of shape [n_samples] - """ - self._check_fitted() - - classes = np.unique(y) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) - - return np.searchsorted(self.classes_, y) - - def inverse_transform(self, y): - """Transform labels back to original encoding. - - Parameters - ---------- - y : numpy array of shape [n_samples] - Target values. 
- - Returns - ------- - y : numpy array of shape [n_samples] - """ - self._check_fitted() - - y = np.asarray(y) - return self.classes_[y] - - -class LabelBinarizer(BaseEstimator, TransformerMixin): - """Binarize labels in a one-vs-all fashion - - Several regression and binary classification algorithms are - available in the scikit. A simple way to extend these algorithms - to the multi-class classification case is to use the so-called - one-vs-all scheme. - - At learning time, this simply consists in learning one regressor - or binary classifier per class. In doing so, one needs to convert - multi-class labels to binary labels (belong or does not belong - to the class). LabelBinarizer makes this process easy with the - transform method. - - At prediction time, one assigns the class for which the corresponding - model gave the greatest confidence. LabelBinarizer makes this easy - with the inverse_transform method. - - Parameters - ---------- - - neg_label: int (default: 0) - Value with which negative labels must be encoded. - - pos_label: int (default: 1) - Value with which positive labels must be encoded. - - Attributes - ---------- - `classes_`: array of shape [n_class] - Holds the label for each class. - - `multilabel_`: boolean - True if the transformer was fitted on a multilabel rather than a - multiclass set of labels. - - Examples - -------- - >>> from sklearn import preprocessing - >>> lb = preprocessing.LabelBinarizer() - >>> lb.fit([1, 2, 6, 4, 2]) - LabelBinarizer(neg_label=0, pos_label=1) - >>> lb.classes_ - array([1, 2, 4, 6]) - >>> lb.multilabel_ - False - >>> lb.transform([1, 6]) - array([[1, 0, 0, 0], - [0, 0, 0, 1]]) - - >>> lb.fit_transform([(1, 2), (3,)]) - array([[1, 1, 0], - [0, 0, 1]]) - >>> lb.classes_ - array([1, 2, 3]) - >>> lb.multilabel_ - True - - See also - -------- - label_binarize : function to perform the transform operation of - LabelBinarizer with fixed classes. - """ - - def __init__(self, neg_label=0, pos_label=1): - if neg_label >= pos_label: - raise ValueError("neg_label must be strictly less than pos_label.") - - self.neg_label = neg_label - self.pos_label = pos_label - - @property - @deprecated("Attribute 'multilabel' was renamed to 'multilabel_' in " - "0.14 and will be removed in 0.16") - def multilabel(self): - return self.multilabel_ - - def _check_fitted(self): - if not hasattr(self, "classes_"): - raise ValueError("LabelBinarizer was not fitted yet.") - - def fit(self, y): - """Fit label binarizer - - Parameters - ---------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. - - Returns - ------- - self : returns an instance of self. - """ - y_type = type_of_target(y) - self.multilabel_ = y_type.startswith('multilabel') - if self.multilabel_: - self.indicator_matrix_ = y_type == 'multilabel-indicator' - - self.classes_ = unique_labels(y) - - return self - - def transform(self, y): - """Transform multi-class labels to binary labels - - The output of transform is sometimes referred to by some authors as the - 1-of-K coding scheme. - - Parameters - ---------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. 
- - Returns - ------- - Y : numpy array of shape [n_samples, n_classes] - """ - self._check_fitted() - - y_is_multilabel = type_of_target(y).startswith('multilabel') - - if y_is_multilabel and not self.multilabel_: - raise ValueError("The object was not fitted with multilabel" - " input.") - - return label_binarize(y, self.classes_, - multilabel=self.multilabel_, - pos_label=self.pos_label, - neg_label=self.neg_label) - - def inverse_transform(self, Y, threshold=None): - """Transform binary labels back to multi-class labels - - Parameters - ---------- - Y : numpy array of shape [n_samples, n_classes] - Target values. - - threshold : float or None - Threshold used in the binary and multi-label cases. - - Use 0 when: - - Y contains the output of decision_function (classifier) - Use 0.5 when: - - Y contains the output of predict_proba - - If None, the threshold is assumed to be half way between - neg_label and pos_label. - - Returns - ------- - y : numpy array of shape [n_samples] or sequence of sequences - Target values. In the multilabel case the nested sequences can - have variable lengths. - - Notes - ----- - In the case when the binary labels are fractional - (probabilistic), inverse_transform chooses the class with the - greatest value. Typically, this allows to use the output of a - linear model's decision_function method directly as the input - of inverse_transform. - """ - self._check_fitted() - - if threshold is None: - half = (self.pos_label - self.neg_label) / 2.0 - threshold = self.neg_label + half - - if self.multilabel_: - Y = np.array(Y > threshold, dtype=int) - # Return the predictions in the same format as in fit - if self.indicator_matrix_: - # Label indicator matrix format - return Y - else: - # Lists of tuples format - return [tuple(self.classes_[np.flatnonzero(Y[i])]) - for i in range(Y.shape[0])] - - if len(Y.shape) == 1 or Y.shape[1] == 1: - y = np.array(Y.ravel() > threshold, dtype=int) - - else: - y = Y.argmax(axis=1) - - return self.classes_[y] - - -def label_binarize(y, classes, multilabel=False, neg_label=0, pos_label=1): - """Binarize labels in a one-vs-all fashion - - Several regression and binary classification algorithms are - available in the scikit. A simple way to extend these algorithms - to the multi-class classification case is to use the so-called - one-vs-all scheme. - - This function makes it possible to compute this transformation for a - fixed set of class labels known ahead of time. - - Parameters - ---------- - y : array-like - Sequence of integer labels to encode. - - classes : array of shape [n_classes] - Uniquely holds the label for each class. - - multilabel : boolean - Set to true if y is encoding a multilabel tasks (with a variable - number of label assignements per sample) rather than a multiclass task - where one sample has one and only one label assigned. - - neg_label: int (default: 0) - Value with which negative labels must be encoded. - - pos_label: int (default: 1) - Value with which positive labels must be encoded. - - Examples - -------- - >>> from sklearn.preprocessing import label_binarize - >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) - array([[1, 0, 0, 0], - [0, 0, 0, 1]]) - - The class ordering is preserved: - - >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) - array([[1, 0, 0, 0], - [0, 1, 0, 0]]) - - >>> label_binarize([(1, 2), (6,), ()], multilabel=True, - ... 
classes=[1, 6, 4, 2]) - array([[1, 0, 0, 1], - [0, 1, 0, 0], - [0, 0, 0, 0]]) - - See also - -------- - label_binarize : function to perform the transform operation of - LabelBinarizer with fixed classes. - """ - y_type = type_of_target(y) - - if multilabel or len(classes) > 2: - if y_type == 'multilabel-indicator': - # nothing to do as y is already a label indicator matrix - return y - - Y = np.zeros((len(y), len(classes)), dtype=np.int) - else: - Y = np.zeros((len(y), 1), dtype=np.int) - - Y += neg_label - - y_is_multilabel = y_type.startswith('multilabel') - - if multilabel: - if not y_is_multilabel: - raise ValueError("y should be a list of label lists/tuples," - "got %r" % (y,)) - - # inverse map: label => column index - imap = dict((v, k) for k, v in enumerate(classes)) - - for i, label_tuple in enumerate(y): - for label in label_tuple: - Y[i, imap[label]] = pos_label - - return Y - - else: - y = np.asarray(y) - - if len(classes) == 2: - Y[y == classes[1], 0] = pos_label - return Y - - elif len(classes) >= 2: - for i, k in enumerate(classes): - Y[y == k, i] = pos_label - return Y - - else: - # Only one class, returns a matrix with all negative labels. - return Y - - -class KernelCenterer(BaseEstimator, TransformerMixin): - """Center a kernel matrix - - Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a - function mapping x to a Hilbert space. KernelCenterer centers (i.e., - normalize to have zero mean) the data without explicitly computing phi(x). - It is equivalent to centering phi(x) with - sklearn.preprocessing.StandardScaler(with_std=False). - """ - - def fit(self, K, y=None): - """Fit KernelCenterer - - Parameters - ---------- - K : numpy array of shape [n_samples, n_samples] - Kernel matrix. - - Returns - ------- - self : returns an instance of self. - """ - K = array2d(K) - n_samples = K.shape[0] - self.K_fit_rows_ = np.sum(K, axis=0) / n_samples - self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples - return self - - def transform(self, K, y=None, copy=True): - """Center kernel matrix. - - Parameters - ---------- - K : numpy array of shape [n_samples1, n_samples2] - Kernel matrix. - - Returns - ------- - K_new : numpy array of shape [n_samples1, n_samples2] - """ - K = array2d(K) - if copy: - K = K.copy() - - K_pred_cols = (np.sum(K, axis=1) / - self.K_fit_rows_.shape[0])[:, np.newaxis] - - K -= self.K_fit_rows_ - K -= K_pred_cols - K += self.K_fit_all_ - - return K - - -def add_dummy_feature(X, value=1.0): - """Augment dataset with an additional dummy feature. - - This is useful for fitting an intercept term with implementations which - cannot otherwise fit it directly. - - Parameters - ---------- - X : array or scipy.sparse matrix with shape [n_samples, n_features] - Data. - - value : float - Value to use for the dummy feature. - - Returns - ------- - - X : array or scipy.sparse matrix with shape [n_samples, n_features + 1] - Same data with dummy feature added as first column. - - Examples - -------- - - >>> from sklearn.preprocessing import add_dummy_feature - >>> add_dummy_feature([[0, 1], [1, 0]]) - array([[ 1., 0., 1.], - [ 1., 1., 0.]]) - """ - X = safe_asarray(X) - n_samples, n_features = X.shape - shape = (n_samples, n_features + 1) - if sparse.issparse(X): - if sparse.isspmatrix_coo(X): - # Shift columns to the right. - col = X.col + 1 - # Column indices of dummy feature are 0 everywhere. - col = np.concatenate((np.zeros(n_samples), col)) - # Row indices of dummy feature are 0, ..., n_samples-1. 
- row = np.concatenate((np.arange(n_samples), X.row)) - # Prepend the dummy feature n_samples times. - data = np.concatenate((np.ones(n_samples) * value, X.data)) - return sparse.coo_matrix((data, (row, col)), shape) - elif sparse.isspmatrix_csc(X): - # Shift index pointers since we need to add n_samples elements. - indptr = X.indptr + n_samples - # indptr[0] must be 0. - indptr = np.concatenate((np.array([0]), indptr)) - # Row indices of dummy feature are 0, ..., n_samples-1. - indices = np.concatenate((np.arange(n_samples), X.indices)) - # Prepend the dummy feature n_samples times. - data = np.concatenate((np.ones(n_samples) * value, X.data)) - return sparse.csc_matrix((data, indices, indptr), shape) - else: - klass = X.__class__ - return klass(add_dummy_feature(X.tocoo(), value)) - else: - return np.hstack((np.ones((n_samples, 1)) * value, X)) - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or np.isnan(value_to_mask): - return np.isnan(X) - else: - return X == value_to_mask - - -def _get_median(negative_elements, n_zeros, positive_elements): - """Compute the median of the array formed by negative_elements, - n_zeros zeros and positive_elements. This function is used - to support sparse matrices.""" - negative_elements = np.sort(negative_elements, kind='heapsort') - positive_elements = np.sort(positive_elements, kind='heapsort') - - n_elems = len(negative_elements) + n_zeros + len(positive_elements) - if not n_elems: - return np.nan - - median_position = (n_elems - 1) / 2.0 - - if round(median_position) == median_position: - median = _get_elem_at_rank(negative_elements, n_zeros, - positive_elements, median_position) - else: - a = _get_elem_at_rank(negative_elements, n_zeros, - positive_elements, math.floor(median_position)) - b = _get_elem_at_rank(negative_elements, n_zeros, - positive_elements, math.ceil(median_position)) - median = (a + b) / 2.0 - - return median - - -def _get_elem_at_rank(negative_elements, n_zeros, positive_elements, k): - """Compute the kth largest element of the array formed by - negative_elements, n_zeros zeros and positive_elements.""" - len_neg = len(negative_elements) - len_pos = len(positive_elements) - - if k < len_neg: - return negative_elements[k] - elif k >= len_neg + n_zeros: - return positive_elements[k - len_neg - n_zeros] - else: - return 0 - - -def _most_frequent(array, extra_value, n_repeat): - """Compute the most frequent value in a 1d array extended with - [extra_value] * n_repeat, where extra_value is assumed to be not part - of the array.""" - # Compute the most frequent value in array only - if array.size > 0: - mode = stats.mode(array) - most_frequent_value = mode[0][0] - most_frequent_count = mode[1][0] - else: - most_frequent_value = 0 - most_frequent_count = 0 - - # Compare to array + [extra_value] * n_repeat - if most_frequent_count == 0 and n_repeat == 0: - return np.nan - elif most_frequent_count < n_repeat: - return extra_value - elif most_frequent_count > n_repeat: - return most_frequent_value - elif most_frequent_count == n_repeat: - # Ties the breaks. Copy the behaviour of scipy.stats.mode - if most_frequent_value < extra_value: - return most_frequent_value - else: - return extra_value - - -class Imputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Parameters - ---------- - missing_values : integer or string, optional (default="NaN") - The placeholder for the missing values. 
All occurences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". - - strategy : string, optional (default="mean") - The imputation strategy. - - If "mean", then replace missing values using the mean along - the axis. - - If "median", then replace missing values using the median along - the axis. - - If "most_frequent", then replace missing using the most frequent - value along the axis. - - axis : integer, optional (default=0) - The axis along which to impute. - - If `axis=0`, then impute along columns. - - If `axis=1`, then impute along rows. - - verbose : integer, optional (default=0) - Controls the verbosity of the imputer. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place. - - Attributes - ---------- - `statistics_` : array of shape (n_features,) or (n_samples,) - The statistics along the imputation axis. - - Notes - ----- - - When ``axis=0``, columns which only contained missing values at `fit` - are discarded upon `transform`. - - When ``axis=1``, an exception is raised if there are rows for which it is - not possible to fill in the missing values (e.g., because they only - contain missing values). - """ - def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True): - self.missing_values = missing_values - self.strategy = strategy - self.axis = axis - self.verbose = verbose - self.copy = copy - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - if self.axis not in [0, 1]: - raise ValueError("Can only impute missing values on axis 0 and 1, " - " got axis={0}".format(self.axis)) - - # Since two different arrays can be provided in fit(X) and - # transform(X), the imputation data will be computed in transform() - # when the imputation is done per sample (i.e., when axis=1). - if self.axis == 0: - X = atleast2d_or_csc(X, dtype=np.float64, force_all_finite=False) - - if sparse.issparse(X): - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - self.axis) - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - self.axis) - - return self - - def _sparse_fit(self, X, strategy, missing_values, axis): - """Fit the transformer on sparse data.""" - # Imputation is done "by column", so if we want to do it - # by row we only need to convert the matrix to csr format. 
- if axis == 1: - X = X.tocsr() - else: - X = X.tocsc() - - # Count the zeros - if missing_values == 0: - n_zeros_axis = np.zeros(X.shape[not axis]) - else: - n_zeros_axis = X.shape[axis] - np.diff(X.indptr) - - # Mean - if strategy == "mean": - if missing_values != 0: - n_non_missing = n_zeros_axis - - # Mask the missing elements - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.logical_not(mask_missing_values) - - # Sum only the valid elements - new_data = X.data.copy() - new_data[mask_missing_values] = 0 - X = sparse.csc_matrix((new_data, X.indices, X.indptr), - copy=False) - sums = X.sum(axis=0) - - # Count the elements != 0 - mask_non_zeros = sparse.csc_matrix( - (mask_valids.astype(np.float64), - X.indices, - X.indptr), copy=False) - s = mask_non_zeros.sum(axis=0) - n_non_missing = np.add(n_non_missing, s) - - else: - sums = X.sum(axis=axis) - n_non_missing = np.diff(X.indptr) - - # Ignore the error, columns with a np.nan statistics_ - # are not an error at this point. These columns will - # be removed in transform - with np.errstate(all="ignore"): - return np.ravel(sums) / np.ravel(n_non_missing) - - # Median + Most frequent - else: - # Remove the missing values, for each column - columns_all = np.hsplit(X.data, X.indptr[1:-1]) - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.hsplit(np.logical_not(mask_missing_values), - X.indptr[1:-1]) - - columns = [col[mask.astype(np.bool)] - for col, mask in zip(columns_all, mask_valids)] - - # Median - if strategy == "median": - median = np.empty(len(columns)) - for i, column in enumerate(columns): - - negatives = column[column < 0] - positives = column[column > 0] - median[i] = _get_median(negatives, - n_zeros_axis[i], - positives) - - return median - - # Most frequent - elif strategy == "most_frequent": - most_frequent = np.empty(len(columns)) - - for i, column in enumerate(columns): - most_frequent[i] = _most_frequent(column, - 0, - n_zeros_axis[i]) - - return most_frequent - - def _dense_fit(self, X, strategy, missing_values, axis): - """Fit the transformer on dense data.""" - X = array2d(X, force_all_finite=False) - mask = _get_mask(X, missing_values) - masked_X = ma.masked_array(X, mask=mask) - - # Mean - if strategy == "mean": - mean_masked = np.ma.mean(masked_X, axis=axis) - # Avoid the warning "Warning: converting a masked element to nan." - mean = np.ma.getdata(mean_masked) - mean[np.ma.getmask(mean_masked)] = np.nan - - return mean - - # Median - elif strategy == "median": - median_masked = np.ma.median(masked_X, axis=axis) - # Avoid the warning "Warning: converting a masked element to nan." - median = np.ma.getdata(median_masked) - median[np.ma.getmask(median_masked)] = np.nan - - return median - - # Most frequent - elif strategy == "most_frequent": - # scipy.stats.mstats.mode cannot be used because it will no work - # properly if the first element is masked and if it's frequency - # is equal to the frequency of the most frequent valid element - # See https://github.com/scipy/scipy/issues/2636 - - # To be able access the elements by columns - if axis == 0: - X = X.transpose() - mask = mask.transpose() - - most_frequent = np.empty(X.shape[0]) - - for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): - row_mask = np.logical_not(row_mask).astype(np.bool) - row = row[row_mask] - most_frequent[i] = _most_frequent(row, np.nan, 0) - - return most_frequent - - def transform(self, X): - """Impute all missing values in X. 
- - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - The input data to complete. - """ - if self.copy and not isinstance(X, list): - X = X.copy() - - # Since two different arrays can be provided in fit(X) and - # transform(X), the imputation data need to be recomputed - # when the imputation is done per sample - if self.axis == 1: - X = atleast2d_or_csr(X, force_all_finite=False).astype(np.float) - - if sparse.issparse(X): - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - self.axis) - - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - self.axis) - else: - X = atleast2d_or_csc(X, force_all_finite=False).astype(np.float) - - # Delete the invalid rows/columns - invalid_mask = np.isnan(self.statistics_) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = self.statistics_[valid_mask] - valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] - - if self.axis == 0 and invalid_mask.any(): - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - elif self.axis == 1 and invalid_mask.any(): - raise ValueError("Some rows only contain " - "missing values: %s" % missing) - - # Do actual imputation - if sparse.issparse(X) and self.missing_values != 0: - if self.axis == 0: - X = X.tocsr() - else: - X = X.tocsc() - - mask = _get_mask(X.data, self.missing_values) - indexes = X.indices[mask] - - X.data[mask] = valid_statistics[indexes].astype(X.dtype) - else: - if sparse.issparse(X): - X = X.toarray() - - mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=self.axis) - values = np.repeat(valid_statistics, n_missing) - - if self.axis == 0: - coordinates = np.where(mask.transpose())[::-1] - else: - coordinates = mask - - X[coordinates] = values - - return X diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py new file mode 100644 index 0000000000000..5dc8d5dcd4b13 --- /dev/null +++ b/sklearn/preprocessing/__init__.py @@ -0,0 +1,36 @@ +""" +The :mod:`sklearn.preprocessing` module includes scaling, centering, +normalization, binarization and imputation methods. 
+""" + +from .data import Binarizer +from .data import KernelCenterer +from .data import MinMaxScaler +from .data import Normalizer +from .data import StandardScaler +from .data import add_dummy_feature +from .data import binarize +from .data import normalize +from .data import scale + +from .label import LabelBinarizer +from .label import LabelEncoder +from .label import OneHotEncoder + +from .imputation import Imputer + +__all__ = [ + 'Binarizer', + 'Imputer', + 'KernelCenterer', + 'LabelBinarizer', + 'LabelEncoder', + 'MinMaxScaler', + 'Normalizer', + 'OneHotEncoder', + 'StandardScaler', + 'add_dummy_feature', + 'binarize', + 'normalize', + 'scale', +] \ No newline at end of file diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py new file mode 100644 index 0000000000000..cd5a3aab5786c --- /dev/null +++ b/sklearn/preprocessing/data.py @@ -0,0 +1,751 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Andreas Mueller +# License: BSD 3 clause + +import warnings +import numbers +import math + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils import check_arrays +from ..utils import array2d +from ..utils import as_float_array +from ..utils import atleast2d_or_csr +from ..utils import atleast2d_or_csc +from ..utils import safe_asarray +from ..utils import warn_if_not_float +from ..utils.fixes import unique +from ..utils import deprecated + +from ..utils.multiclass import unique_labels +from ..utils.multiclass import type_of_target + +from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 +from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 +from ..utils.sparsefuncs import inplace_csr_column_scale +from ..utils.sparsefuncs import mean_variance_axis0 +from ..externals import six + +zip = six.moves.zip +map = six.moves.map + +__all__ = [ + 'Binarizer', + 'KernelCenterer', + 'MinMaxScaler', + 'Normalizer', + 'StandardScaler', + 'add_dummy_feature', + 'binarize', + 'normalize', + 'scale', +] + +def _mean_and_std(X, axis=0, with_mean=True, with_std=True): + """Compute mean and std deviation for centering, scaling. + + Zero valued std components are reset to 1.0 to avoid NaNs when scaling. + """ + X = np.asarray(X) + Xr = np.rollaxis(X, axis) + + if with_mean: + mean_ = Xr.mean(axis=0) + else: + mean_ = None + + if with_std: + std_ = Xr.std(axis=0) + if isinstance(std_, np.ndarray): + std_[std_ == 0.0] = 1.0 + elif std_ == 0.: + std_ = 1. + else: + std_ = None + + return mean_, std_ + + +def scale(X, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis + + Center to the mean and component wise scale to unit variance. + + Parameters + ---------- + X : array-like or CSR matrix. + The data to center and scale. + + axis : int (0 by default) + axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : boolean, True by default + If True, center the data before scaling. + + with_std : boolean, True by default + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix and if axis is 1). 
+ + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + See also + -------- + :class:`sklearn.preprocessing.StandardScaler` to perform centering and + scaling using the ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`) + """ + if sparse.issparse(X): + if with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` instead" + " See docstring for motivation and alternatives.") + if axis != 0: + raise ValueError("Can only scale sparse matrix on axis=0, " + " got axis=%d" % axis) + warn_if_not_float(X, estimator='The scale function') + if not sparse.isspmatrix_csr(X): + X = X.tocsr() + copy = False + if copy: + X = X.copy() + _, var = mean_variance_axis0(X) + var[var == 0.0] = 1.0 + inplace_csr_column_scale(X, 1 / np.sqrt(var)) + else: + X = np.asarray(X) + warn_if_not_float(X, estimator='The scale function') + mean_, std_ = _mean_and_std( + X, axis, with_mean=with_mean, with_std=with_std) + if copy: + X = X.copy() + # Xr is a view on the original array that enables easy use of + # broadcasting on the axis in which we are interested in + Xr = np.rollaxis(X, axis) + if with_mean: + Xr -= mean_ + if with_std: + Xr /= std_ + return X + + +class MinMaxScaler(BaseEstimator, TransformerMixin): + """Standardizes features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The standardization is given by:: + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This standardization is often used as an alternative to zero mean, + unit variance scaling. + + Parameters + ---------- + feature_range: tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : boolean, optional, default is True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + Attributes + ---------- + `min_` : ndarray, shape (n_features,) + Per feature adjustment for minimum. + + `scale_` : ndarray, shape (n_features,) + Per feature relative scaling of the data. + """ + + def __init__(self, feature_range=(0, 1), copy=True): + self.feature_range = feature_range + self.copy = copy + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + warn_if_not_float(X, estimator=self) + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError("Minimum of desired feature range must be smaller" + " than maximum. Got %s." 
% str(feature_range)) + data_min = np.min(X, axis=0) + data_range = np.max(X, axis=0) - data_min + # Do not scale constant features + data_range[data_range == 0.0] = 1.0 + self.scale_ = (feature_range[1] - feature_range[0]) / data_range + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_range = data_range + self.data_min = data_min + return self + + def transform(self, X): + """Scaling features of X according to feature_range. + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + Input data that will be transformed. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + X *= self.scale_ + X += self.min_ + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + Input data that will be transformed. + """ + X = check_arrays(X, sparse_format="dense", copy=self.copy)[0] + X -= self.min_ + X /= self.scale_ + return X + + +class StandardScaler(BaseEstimator, TransformerMixin): + """Standardize features by removing the mean and scaling to unit variance + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using the + `transform` method. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual feature do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + that others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + Parameters + ---------- + with_mean : boolean, True by default + If True, center the data before scaling. + This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : boolean, True by default + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : boolean, optional, default is True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + Attributes + ---------- + `mean_` : array of floats with shape [n_features] + The mean value for each feature in the training set. + + `std_` : array of floats with shape [n_features] + The standard deviation for each feature in the training set. + + See also + -------- + :func:`sklearn.preprocessing.scale` to perform centering and + scaling without using the ``Transformer`` object oriented API + + :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True` + to further remove the linear correlation across features. 
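+
+    A small usage sketch (the training data below is arbitrary):
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> scaler = StandardScaler().fit(np.array([[0., 0.], [2., 4.]]))
+    >>> X_test_scaled = scaler.transform(np.array([[1., 2.]]))
+    >>> # the test sample is shifted and scaled with the mean and std learned during fit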
+ """ + + def __init__(self, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def fit(self, X, y=None): + """Compute the mean and std to be used for later scaling. + + Parameters + ---------- + X : array-like or CSR matrix with shape [n_samples, n_features] + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + """ + X = check_arrays(X, copy=self.copy, sparse_format="csr")[0] + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives.") + warn_if_not_float(X, estimator=self) + self.mean_ = None + + if self.with_std: + var = mean_variance_axis0(X)[1] + self.std_ = np.sqrt(var) + self.std_[var == 0.0] = 1.0 + else: + self.std_ = None + return self + else: + warn_if_not_float(X, estimator=self) + self.mean_, self.std_ = _mean_and_std( + X, axis=0, with_mean=self.with_mean, with_std=self.with_std) + return self + + def transform(self, X, y=None, copy=None): + """Perform standardization by centering and scaling + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + The data used to scale along the features axis. + """ + copy = copy if copy is not None else self.copy + X = check_arrays(X, copy=copy, sparse_format="csr")[0] + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives.") + if self.std_ is not None: + warn_if_not_float(X, estimator=self) + inplace_csr_column_scale(X, 1 / self.std_) + else: + warn_if_not_float(X, estimator=self) + if self.with_mean: + X -= self.mean_ + if self.with_std: + X /= self.std_ + return X + + def inverse_transform(self, X, copy=None): + """Scale back the data to the original representation + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + The data used to scale along the features axis. + """ + copy = copy if copy is not None else self.copy + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot uncenter sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives.") + if not sparse.isspmatrix_csr(X): + X = X.tocsr() + copy = False + if copy: + X = X.copy() + if self.std_ is not None: + inplace_csr_column_scale(X, self.std_) + else: + X = np.asarray(X) + if copy: + X = X.copy() + if self.with_std: + X *= self.std_ + if self.with_mean: + X += self.mean_ + return X + + +class Scaler(StandardScaler): + def __init__(self, copy=True, with_mean=True, with_std=True): + warnings.warn("Scaler was renamed to StandardScaler. The old name " + " will be removed in 0.15.", DeprecationWarning) + super(Scaler, self).__init__(copy, with_mean, with_std) + + +def normalize(X, norm='l2', axis=1, copy=True): + """Normalize a dataset along any axis + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : 'l1' or 'l2', optional ('l2' by default) + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : 0 or 1, optional (1 by default) + axis used to normalize the data along. If 1, independently normalize + each sample, otherwise (if 0) normalize each feature. 
+ + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix and if axis is 1). + + See also + -------- + :class:`sklearn.preprocessing.Normalizer` to perform normalization + using the ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`) + """ + if norm not in ('l1', 'l2'): + raise ValueError("'%s' is not a supported norm" % norm) + + if axis == 0: + sparse_format = 'csc' + elif axis == 1: + sparse_format = 'csr' + else: + raise ValueError("'%d' is not a supported axis" % axis) + + X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] + warn_if_not_float(X, 'The normalize function') + if axis == 0: + X = X.T + + if sparse.issparse(X): + if norm == 'l1': + inplace_csr_row_normalize_l1(X) + elif norm == 'l2': + inplace_csr_row_normalize_l2(X) + else: + if norm == 'l1': + norms = np.abs(X).sum(axis=1)[:, np.newaxis] + norms[norms == 0.0] = 1.0 + elif norm == 'l2': + norms = np.sqrt(np.sum(X ** 2, axis=1))[:, np.newaxis] + norms[norms == 0.0] = 1.0 + X /= norms + + if axis == 0: + X = X.T + + return X + + +class Normalizer(BaseEstimator, TransformerMixin): + """Normalize samples individually to unit norm + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1 or l2) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + Parameters + ---------- + norm : 'l1' or 'l2', optional ('l2' by default) + The norm to use to normalize each non zero sample. + + copy : boolean, optional, default is True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Notes + ----- + This estimator is stateless (besides constructor parameters), the + fit method does nothing but is useful when used in a pipeline. + + See also + -------- + :func:`sklearn.preprocessing.normalize` equivalent function + without the object oriented API + """ + + def __init__(self, norm='l2', copy=True): + self.norm = norm + self.copy = copy + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged + + This method is just there to implement the usual API and hence + work in pipelines. + """ + atleast2d_or_csr(X) + return self + + def transform(self, X, y=None, copy=None): + """Scale each non zero row of X to unit norm + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. + """ + copy = copy if copy is not None else self.copy + atleast2d_or_csr(X) + return normalize(X, norm=self.norm, axis=1, copy=copy) + + +def binarize(X, threshold=0.0, copy=True): + """Boolean thresholding of array-like or scipy.sparse matrix + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to binarize, element by element. 
+ scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + threshold : float, optional (0.0 by default) + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : boolean, optional, default is True + set to False to perform inplace binarization and avoid a copy + (if the input is already a numpy array or a scipy.sparse CSR / CSC + matrix and if axis is 1). + + See also + -------- + :class:`sklearn.preprocessing.Binarizer` to perform binarization + using the ``Transformer`` API (e.g. as part of a preprocessing + :class:`sklearn.pipeline.Pipeline`) + """ + sparse_format = "csr" # We force sparse format to be either csr or csc. + if hasattr(X, "format"): + if X.format in ["csr", "csc"]: + sparse_format = X.format + + X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] + if sparse.issparse(X): + if threshold < 0: + raise ValueError('Cannot binarize a sparse matrix with threshold ' + '< 0') + cond = X.data > threshold + not_cond = np.logical_not(cond) + X.data[cond] = 1 + X.data[not_cond] = 0 + X.eliminate_zeros() + else: + cond = X > threshold + not_cond = np.logical_not(cond) + X[cond] = 1 + X[not_cond] = 0 + return X + + +class Binarizer(BaseEstimator, TransformerMixin): + """Binarize data (set feature values to 0 or 1) according to a threshold + + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + + Binarization is a common operation on text count data where the + analyst can decide to only consider the presence or absence of a + feature rather than a quantified number of occurrences for instance. + + It can also be used as a pre-processing step for estimators that + consider boolean random variables (e.g. modelled using the Bernoulli + distribution in a Bayesian setting). + + Parameters + ---------- + threshold : float, optional (0.0 by default) + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : boolean, optional, default is True + set to False to perform inplace binarization and avoid a copy (if + the input is already a numpy array or a scipy.sparse CSR matrix). + + Notes + ----- + If the input is a sparse matrix, only the non-zero values are subject + to update by the Binarizer class. + + This estimator is stateless (besides constructor parameters), the + fit method does nothing but is useful when used in a pipeline. + """ + + def __init__(self, threshold=0.0, copy=True): + self.threshold = threshold + self.copy = copy + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged + + This method is just there to implement the usual API and hence + work in pipelines. + """ + atleast2d_or_csr(X) + return self + + def transform(self, X, y=None, copy=None): + """Binarize each element of X + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + The data to binarize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + """ + copy = copy if copy is not None else self.copy + return binarize(X, threshold=self.threshold, copy=copy) + +class KernelCenterer(BaseEstimator, TransformerMixin): + """Center a kernel matrix + + Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a + function mapping x to a Hilbert space. 
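+    Centering phi(x) in that space corresponds, on the training kernel
+    matrix K, to computing K_centered = K - 1_n K - K 1_n + 1_n K 1_n,
+    where 1_n denotes the (n_samples, n_samples) matrix whose entries
+    all equal 1 / n_samples.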
KernelCenterer centers (i.e., + normalize to have zero mean) the data without explicitly computing phi(x). + It is equivalent to centering phi(x) with + sklearn.preprocessing.StandardScaler(with_std=False). + """ + + def fit(self, K, y=None): + """Fit KernelCenterer + + Parameters + ---------- + K : numpy array of shape [n_samples, n_samples] + Kernel matrix. + + Returns + ------- + self : returns an instance of self. + """ + K = array2d(K) + n_samples = K.shape[0] + self.K_fit_rows_ = np.sum(K, axis=0) / n_samples + self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples + return self + + def transform(self, K, y=None, copy=True): + """Center kernel matrix. + + Parameters + ---------- + K : numpy array of shape [n_samples1, n_samples2] + Kernel matrix. + + Returns + ------- + K_new : numpy array of shape [n_samples1, n_samples2] + """ + K = array2d(K) + if copy: + K = K.copy() + + K_pred_cols = (np.sum(K, axis=1) / + self.K_fit_rows_.shape[0])[:, np.newaxis] + + K -= self.K_fit_rows_ + K -= K_pred_cols + K += self.K_fit_all_ + + return K + + +def add_dummy_feature(X, value=1.0): + """Augment dataset with an additional dummy feature. + + This is useful for fitting an intercept term with implementations which + cannot otherwise fit it directly. + + Parameters + ---------- + X : array or scipy.sparse matrix with shape [n_samples, n_features] + Data. + + value : float + Value to use for the dummy feature. + + Returns + ------- + + X : array or scipy.sparse matrix with shape [n_samples, n_features + 1] + Same data with dummy feature added as first column. + + Examples + -------- + + >>> from sklearn.preprocessing import add_dummy_feature + >>> add_dummy_feature([[0, 1], [1, 0]]) + array([[ 1., 0., 1.], + [ 1., 1., 0.]]) + """ + X = safe_asarray(X) + n_samples, n_features = X.shape + shape = (n_samples, n_features + 1) + if sparse.issparse(X): + if sparse.isspmatrix_coo(X): + # Shift columns to the right. + col = X.col + 1 + # Column indices of dummy feature are 0 everywhere. + col = np.concatenate((np.zeros(n_samples), col)) + # Row indices of dummy feature are 0, ..., n_samples-1. + row = np.concatenate((np.arange(n_samples), X.row)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.ones(n_samples) * value, X.data)) + return sparse.coo_matrix((data, (row, col)), shape) + elif sparse.isspmatrix_csc(X): + # Shift index pointers since we need to add n_samples elements. + indptr = X.indptr + n_samples + # indptr[0] must be 0. + indptr = np.concatenate((np.array([0]), indptr)) + # Row indices of dummy feature are 0, ..., n_samples-1. + indices = np.concatenate((np.arange(n_samples), X.indices)) + # Prepend the dummy feature n_samples times. 
+ data = np.concatenate((np.ones(n_samples) * value, X.data)) + return sparse.csc_matrix((data, indices, indptr), shape) + else: + klass = X.__class__ + return klass(add_dummy_feature(X.tocoo(), value)) + else: + return np.hstack((np.ones((n_samples, 1)) * value, X)) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py new file mode 100644 index 0000000000000..9bdf528c5a873 --- /dev/null +++ b/sklearn/preprocessing/imputation.py @@ -0,0 +1,414 @@ +# Authors: Nicolas Tresegnie +# License: BSD 3 clause + +import warnings +import numbers +import math + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils import check_arrays +from ..utils import array2d +from ..utils import as_float_array +from ..utils import atleast2d_or_csr +from ..utils import atleast2d_or_csc +from ..utils import safe_asarray +from ..utils import warn_if_not_float +from ..utils.fixes import unique +from ..utils import deprecated + +from ..utils.multiclass import unique_labels +from ..utils.multiclass import type_of_target + +from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 +from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 +from ..utils.sparsefuncs import inplace_csr_column_scale +from ..utils.sparsefuncs import mean_variance_axis0 +from ..externals import six + +zip = six.moves.zip +map = six.moves.map + +__all__ = [ + 'Imputer', +] + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if value_to_mask == "NaN" or np.isnan(value_to_mask): + return np.isnan(X) + else: + return X == value_to_mask + + +def _get_median(negative_elements, n_zeros, positive_elements): + """Compute the median of the array formed by negative_elements, + n_zeros zeros and positive_elements. 
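+    For instance, negative_elements=[-3], n_zeros=2 and
+    positive_elements=[5] stand for the array [-3, 0, 0, 5],
+    whose median is 0.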
This function is used + to support sparse matrices.""" + negative_elements = np.sort(negative_elements, kind='heapsort') + positive_elements = np.sort(positive_elements, kind='heapsort') + + n_elems = len(negative_elements) + n_zeros + len(positive_elements) + if not n_elems: + return np.nan + + median_position = (n_elems - 1) / 2.0 + + if round(median_position) == median_position: + median = _get_elem_at_rank(negative_elements, n_zeros, + positive_elements, median_position) + else: + a = _get_elem_at_rank(negative_elements, n_zeros, + positive_elements, math.floor(median_position)) + b = _get_elem_at_rank(negative_elements, n_zeros, + positive_elements, math.ceil(median_position)) + median = (a + b) / 2.0 + + return median + + +def _get_elem_at_rank(negative_elements, n_zeros, positive_elements, k): + """Compute the kth largest element of the array formed by + negative_elements, n_zeros zeros and positive_elements.""" + len_neg = len(negative_elements) + len_pos = len(positive_elements) + + if k < len_neg: + return negative_elements[k] + elif k >= len_neg + n_zeros: + return positive_elements[k - len_neg - n_zeros] + else: + return 0 + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + mode = stats.mode(array) + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + + +class Imputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Parameters + ---------- + missing_values : integer or string, optional (default="NaN") + The placeholder for the missing values. All occurences of + `missing_values` will be imputed. For missing values encoded as np.nan, + use the string value "NaN". + + strategy : string, optional (default="mean") + The imputation strategy. + - If "mean", then replace missing values using the mean along + the axis. + - If "median", then replace missing values using the median along + the axis. + - If "most_frequent", then replace missing using the most frequent + value along the axis. + + axis : integer, optional (default=0) + The axis along which to impute. + - If `axis=0`, then impute along columns. + - If `axis=1`, then impute along rows. + + verbose : integer, optional (default=0) + Controls the verbosity of the imputer. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place. + + Attributes + ---------- + `statistics_` : array of shape (n_features,) or (n_samples,) + The statistics along the imputation axis. + + Notes + ----- + - When ``axis=0``, columns which only contained missing values at `fit` + are discarded upon `transform`. + - When ``axis=1``, an exception is raised if there are rows for which it is + not possible to fill in the missing values (e.g., because they only + contain missing values). 
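+
+    A small usage sketch (the array values below are arbitrary):
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import Imputer
+    >>> X = [[1., 2.], [np.nan, 4.], [7., 6.]]
+    >>> imp = Imputer(strategy="mean", axis=0).fit(X)
+    >>> X_imputed = imp.transform(X)
+    >>> # the np.nan in the first column is replaced by that column's mean, 4.0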
+ """ + def __init__(self, missing_values="NaN", strategy="mean", + axis=0, verbose=0, copy=True): + self.missing_values = missing_values + self.strategy = strategy + self.axis = axis + self.verbose = verbose + self.copy = copy + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + # Check parameters + allowed_strategies = ["mean", "median", "most_frequent"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.axis not in [0, 1]: + raise ValueError("Can only impute missing values on axis 0 and 1, " + " got axis={0}".format(self.axis)) + + # Since two different arrays can be provided in fit(X) and + # transform(X), the imputation data will be computed in transform() + # when the imputation is done per sample (i.e., when axis=1). + if self.axis == 0: + X = atleast2d_or_csc(X, dtype=np.float64, force_all_finite=False) + + if sparse.issparse(X): + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + self.axis) + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + self.axis) + + return self + + def _sparse_fit(self, X, strategy, missing_values, axis): + """Fit the transformer on sparse data.""" + # Imputation is done "by column", so if we want to do it + # by row we only need to convert the matrix to csr format. + if axis == 1: + X = X.tocsr() + else: + X = X.tocsc() + + # Count the zeros + if missing_values == 0: + n_zeros_axis = np.zeros(X.shape[not axis]) + else: + n_zeros_axis = X.shape[axis] - np.diff(X.indptr) + + # Mean + if strategy == "mean": + if missing_values != 0: + n_non_missing = n_zeros_axis + + # Mask the missing elements + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.logical_not(mask_missing_values) + + # Sum only the valid elements + new_data = X.data.copy() + new_data[mask_missing_values] = 0 + X = sparse.csc_matrix((new_data, X.indices, X.indptr), + copy=False) + sums = X.sum(axis=0) + + # Count the elements != 0 + mask_non_zeros = sparse.csc_matrix( + (mask_valids.astype(np.float64), + X.indices, + X.indptr), copy=False) + s = mask_non_zeros.sum(axis=0) + n_non_missing = np.add(n_non_missing, s) + + else: + sums = X.sum(axis=axis) + n_non_missing = np.diff(X.indptr) + + # Ignore the error, columns with a np.nan statistics_ + # are not an error at this point. 
These columns will + # be removed in transform + with np.errstate(all="ignore"): + return np.ravel(sums) / np.ravel(n_non_missing) + + # Median + Most frequent + else: + # Remove the missing values, for each column + columns_all = np.hsplit(X.data, X.indptr[1:-1]) + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.hsplit(np.logical_not(mask_missing_values), + X.indptr[1:-1]) + + columns = [col[mask.astype(np.bool)] + for col, mask in zip(columns_all, mask_valids)] + + # Median + if strategy == "median": + median = np.empty(len(columns)) + for i, column in enumerate(columns): + + negatives = column[column < 0] + positives = column[column > 0] + median[i] = _get_median(negatives, + n_zeros_axis[i], + positives) + + return median + + # Most frequent + elif strategy == "most_frequent": + most_frequent = np.empty(len(columns)) + + for i, column in enumerate(columns): + most_frequent[i] = _most_frequent(column, + 0, + n_zeros_axis[i]) + + return most_frequent + + def _dense_fit(self, X, strategy, missing_values, axis): + """Fit the transformer on dense data.""" + X = array2d(X, force_all_finite=False) + mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=axis) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=axis) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmask(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # scipy.stats.mstats.mode cannot be used because it will no work + # properly if the first element is masked and if it's frequency + # is equal to the frequency of the most frequent valid element + # See https://github.com/scipy/scipy/issues/2636 + + # To be able access the elements by columns + if axis == 0: + X = X.transpose() + mask = mask.transpose() + + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(np.bool) + row = row[row_mask] + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + def transform(self, X): + """Impute all missing values in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. 
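+
+        Returns
+        -------
+        X : {array-like, sparse matrix}
+            The input data with the missing values imputed. When ``axis=0``,
+            features that contained only missing values during ``fit`` are
+            dropped from the output.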
+ """ + if self.copy and not isinstance(X, list): + X = X.copy() + + # Since two different arrays can be provided in fit(X) and + # transform(X), the imputation data need to be recomputed + # when the imputation is done per sample + if self.axis == 1: + X = atleast2d_or_csr(X, force_all_finite=False).astype(np.float) + + if sparse.issparse(X): + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + self.axis) + + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + self.axis) + else: + X = atleast2d_or_csc(X, force_all_finite=False).astype(np.float) + + # Delete the invalid rows/columns + invalid_mask = np.isnan(self.statistics_) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = self.statistics_[valid_mask] + valid_statistics_indexes = np.where(valid_mask)[0] + missing = np.arange(X.shape[not self.axis])[invalid_mask] + + if self.axis == 0 and invalid_mask.any(): + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] + elif self.axis == 1 and invalid_mask.any(): + raise ValueError("Some rows only contain " + "missing values: %s" % missing) + + # Do actual imputation + if sparse.issparse(X) and self.missing_values != 0: + if self.axis == 0: + X = X.tocsr() + else: + X = X.tocsc() + + mask = _get_mask(X.data, self.missing_values) + indexes = X.indices[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype) + else: + if sparse.issparse(X): + X = X.toarray() + + mask = _get_mask(X, self.missing_values) + n_missing = np.sum(mask, axis=self.axis) + values = np.repeat(valid_statistics, n_missing) + + if self.axis == 0: + coordinates = np.where(mask.transpose())[::-1] + else: + coordinates = mask + + X[coordinates] = values + + return X diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py new file mode 100644 index 0000000000000..61573973910e5 --- /dev/null +++ b/sklearn/preprocessing/label.py @@ -0,0 +1,687 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Andreas Mueller +# License: BSD 3 clause + +import warnings +import numbers +import math + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils import check_arrays +from ..utils import array2d +from ..utils import as_float_array +from ..utils import atleast2d_or_csr +from ..utils import atleast2d_or_csc +from ..utils import safe_asarray +from ..utils import warn_if_not_float +from ..utils.fixes import unique +from ..utils import deprecated + +from ..utils.multiclass import unique_labels +from ..utils.multiclass import type_of_target + +from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 +from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 +from ..utils.sparsefuncs import inplace_csr_column_scale +from ..utils.sparsefuncs import mean_variance_axis0 +from ..externals import six + +zip = six.moves.zip +map = six.moves.map + +__all__ = [ + 'LabelBinarizer', + 'LabelEncoder', + 'OneHotEncoder', +] + +def _transform_selected(X, transform, selected="all", copy=True): + """Apply a transform function to portion of selected features + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + transform : callable + A callable transform(X) -> X_transformed + + copy : boolean, optional + Copy X even if it could be avoided. 
+ + selected: "all" or array of indices or mask + Specify which features to apply the transform to. + + Returns + ------- + X : array or sparse matrix, shape=(n_samples, n_features_new) + """ + if selected == "all": + return transform(X) + + X = atleast2d_or_csc(X, copy=copy) + + if len(selected) == 0: + return X + + n_features = X.shape[1] + ind = np.arange(n_features) + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(selected)] = True + not_sel = np.logical_not(sel) + n_selected = np.sum(sel) + + if n_selected == 0: + # No features selected. + return X + elif n_selected == n_features: + # All features selected. + return transform(X) + else: + X_sel = transform(X[:, ind[sel]]) + X_not_sel = X[:, ind[not_sel]] + + if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): + return sparse.hstack((X_sel, X_not_sel)) + else: + return np.hstack((X_sel, X_not_sel)) + + +class OneHotEncoder(BaseEstimator, TransformerMixin): + """Encode categorical integer features using a one-hot aka one-of-K scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by categorical (discrete) features. The output will be + a sparse matrix were each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : maximum value for all features. + - array : maximum value per feature. + + categorical_features: "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all' (default): All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + dtype : number type, default=np.float + Desired dtype of output. + + Attributes + ---------- + `active_features_` : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + `feature_indices_` : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + `n_values_` : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and two samples, we let the encoder + find the maximum value per feature and transform the data to a binary + one-hot encoding. + + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OneHotEncoder(categorical_features='all', dtype=<... 'float'>, + n_values='auto') + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 2, 5, 9]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). 
+ sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + """ + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float): + self.n_values = n_values + self.categorical_features = categorical_features + self.dtype = dtype + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. + + Returns + ------- + self + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if self.n_values == 'auto': + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. Expected" + " 'auto', int or array of ints, got %r" + % type(X)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + self.n_values_ = n_values + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + if self.n_values == 'auto': + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self.active_features_ = active_features + + return out + + def fit_transform(self, X, y=None): + """Fit OneHotEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + """ + return _transform_selected(X, self._fit_transform, + self.categorical_features, copy=True) + + def _transform(self, X): + """Asssumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + n_values_check = np.max(X, axis=0) + 1 + if (n_values_check > self.n_values_).any(): + raise ValueError("Feature out of bounds. Try setting n_values.") + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + if self.n_values == 'auto': + out = out[:, self.active_features_] + return out + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Input array of type int. 
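+            Feature values must be non-negative and, for feature ``i``,
+            strictly smaller than ``n_values_[i]``; out-of-range values
+            raise a ValueError.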
+ + Returns + ------- + X_out : sparse matrix, dtype=int + Transformed input. + """ + return _transform_selected(X, self._transform, + self.categorical_features, copy=True) + + +class LabelEncoder(BaseEstimator, TransformerMixin): + """Encode labels with value between 0 and n_classes-1. + + Attributes + ---------- + `classes_`: array of shape [n_class] + Holds the label for each class. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + + """ + + def _check_fitted(self): + if not hasattr(self, "classes_"): + raise ValueError("LabelNormalizer was not fitted yet.") + + def fit(self, y): + """Fit label encoder + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + self : returns an instance of self. + """ + self.classes_ = np.unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + y : array-like of shape [n_samples] + """ + self.classes_, y = unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape [n_samples] + Target values. + + Returns + ------- + y : array-like of shape [n_samples] + """ + self._check_fitted() + + classes = np.unique(y) + if len(np.intersect1d(classes, self.classes_)) < len(classes): + diff = np.setdiff1d(classes, self.classes_) + raise ValueError("y contains new labels: %s" % str(diff)) + + return np.searchsorted(self.classes_, y) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : numpy array of shape [n_samples] + Target values. + + Returns + ------- + y : numpy array of shape [n_samples] + """ + self._check_fitted() + + y = np.asarray(y) + return self.classes_[y] + + +class LabelBinarizer(BaseEstimator, TransformerMixin): + """Binarize labels in a one-vs-all fashion + + Several regression and binary classification algorithms are + available in the scikit. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). LabelBinarizer makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. LabelBinarizer makes this easy + with the inverse_transform method. 
+ + Parameters + ---------- + + neg_label: int (default: 0) + Value with which negative labels must be encoded. + + pos_label: int (default: 1) + Value with which positive labels must be encoded. + + Attributes + ---------- + `classes_`: array of shape [n_class] + Holds the label for each class. + + `multilabel_`: boolean + True if the transformer was fitted on a multilabel rather than a + multiclass set of labels. + + Examples + -------- + >>> from sklearn import preprocessing + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer(neg_label=0, pos_label=1) + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.multilabel_ + False + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + >>> lb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> lb.classes_ + array([1, 2, 3]) + >>> lb.multilabel_ + True + + See also + -------- + label_binarize : function to perform the transform operation of + LabelBinarizer with fixed classes. + """ + + def __init__(self, neg_label=0, pos_label=1): + if neg_label >= pos_label: + raise ValueError("neg_label must be strictly less than pos_label.") + + self.neg_label = neg_label + self.pos_label = pos_label + + @property + @deprecated("Attribute 'multilabel' was renamed to 'multilabel_' in " + "0.14 and will be removed in 0.16") + def multilabel(self): + return self.multilabel_ + + def _check_fitted(self): + if not hasattr(self, "classes_"): + raise ValueError("LabelBinarizer was not fitted yet.") + + def fit(self, y): + """Fit label binarizer + + Parameters + ---------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. + + Returns + ------- + self : returns an instance of self. + """ + y_type = type_of_target(y) + self.multilabel_ = y_type.startswith('multilabel') + if self.multilabel_: + self.indicator_matrix_ = y_type == 'multilabel-indicator' + + self.classes_ = unique_labels(y) + + return self + + def transform(self, y): + """Transform multi-class labels to binary labels + + The output of transform is sometimes referred to by some authors as the + 1-of-K coding scheme. + + Parameters + ---------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. + + Returns + ------- + Y : numpy array of shape [n_samples, n_classes] + """ + self._check_fitted() + + y_is_multilabel = type_of_target(y).startswith('multilabel') + + if y_is_multilabel and not self.multilabel_: + raise ValueError("The object was not fitted with multilabel" + " input.") + + return label_binarize(y, self.classes_, + multilabel=self.multilabel_, + pos_label=self.pos_label, + neg_label=self.neg_label) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels + + Parameters + ---------- + Y : numpy array of shape [n_samples, n_classes] + Target values. + + threshold : float or None + Threshold used in the binary and multi-label cases. + + Use 0 when: + - Y contains the output of decision_function (classifier) + Use 0.5 when: + - Y contains the output of predict_proba + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : numpy array of shape [n_samples] or sequence of sequences + Target values. In the multilabel case the nested sequences can + have variable lengths. 
+ + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + self._check_fitted() + + if threshold is None: + half = (self.pos_label - self.neg_label) / 2.0 + threshold = self.neg_label + half + + if self.multilabel_: + Y = np.array(Y > threshold, dtype=int) + # Return the predictions in the same format as in fit + if self.indicator_matrix_: + # Label indicator matrix format + return Y + else: + # Lists of tuples format + return [tuple(self.classes_[np.flatnonzero(Y[i])]) + for i in range(Y.shape[0])] + + if len(Y.shape) == 1 or Y.shape[1] == 1: + y = np.array(Y.ravel() > threshold, dtype=int) + + else: + y = Y.argmax(axis=1) + + return self.classes_[y] + + +def label_binarize(y, classes, multilabel=False, neg_label=0, pos_label=1): + """Binarize labels in a one-vs-all fashion + + Several regression and binary classification algorithms are + available in the scikit. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like + Sequence of integer labels to encode. + + classes : array of shape [n_classes] + Uniquely holds the label for each class. + + multilabel : boolean + Set to true if y is encoding a multilabel tasks (with a variable + number of label assignements per sample) rather than a multiclass task + where one sample has one and only one label assigned. + + neg_label: int (default: 0) + Value with which negative labels must be encoded. + + pos_label: int (default: 1) + Value with which positive labels must be encoded. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + >>> label_binarize([(1, 2), (6,), ()], multilabel=True, + ... classes=[1, 6, 4, 2]) + array([[1, 0, 0, 1], + [0, 1, 0, 0], + [0, 0, 0, 0]]) + + See also + -------- + label_binarize : function to perform the transform operation of + LabelBinarizer with fixed classes. + """ + y_type = type_of_target(y) + + if multilabel or len(classes) > 2: + if y_type == 'multilabel-indicator': + # nothing to do as y is already a label indicator matrix + return y + + Y = np.zeros((len(y), len(classes)), dtype=np.int) + else: + Y = np.zeros((len(y), 1), dtype=np.int) + + Y += neg_label + + y_is_multilabel = y_type.startswith('multilabel') + + if multilabel: + if not y_is_multilabel: + raise ValueError("y should be a list of label lists/tuples," + "got %r" % (y,)) + + # inverse map: label => column index + imap = dict((v, k) for k, v in enumerate(classes)) + + for i, label_tuple in enumerate(y): + for label in label_tuple: + Y[i, imap[label]] = pos_label + + return Y + + else: + y = np.asarray(y) + + if len(classes) == 2: + Y[y == classes[1], 0] = pos_label + return Y + + elif len(classes) >= 2: + for i, k in enumerate(classes): + Y[y == k, i] = pos_label + return Y + + else: + # Only one class, returns a matrix with all negative labels. 
+ return Y From c7dd3e5f561941812e9f84953a835f3ec722e9e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 12:07:06 +0200 Subject: [PATCH 2/6] Imp splitting of test_preprocessing.py --- sklearn/preprocessing/__init__.py | 1 + sklearn/preprocessing/label.py | 1 + sklearn/preprocessing/tests/__init__.py | 0 .../tests/test_data.py} | 532 +----------------- .../preprocessing/tests/test_imputation.py | 282 ++++++++++ sklearn/preprocessing/tests/test_label.py | 320 +++++++++++ 6 files changed, 606 insertions(+), 530 deletions(-) create mode 100644 sklearn/preprocessing/tests/__init__.py rename sklearn/{tests/test_preprocessing.py => preprocessing/tests/test_data.py} (51%) create mode 100644 sklearn/preprocessing/tests/test_imputation.py create mode 100644 sklearn/preprocessing/tests/test_label.py diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 5dc8d5dcd4b13..da548216be75e 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -13,6 +13,7 @@ from .data import normalize from .data import scale +from .label import label_binarize from .label import LabelBinarizer from .label import LabelEncoder from .label import OneHotEncoder diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 61573973910e5..c8e1f90eafd31 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -37,6 +37,7 @@ map = six.moves.map __all__ = [ + 'label_binarize', 'LabelBinarizer', 'LabelEncoder', 'OneHotEncoder', diff --git a/sklearn/preprocessing/tests/__init__.py b/sklearn/preprocessing/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/tests/test_preprocessing.py b/sklearn/preprocessing/tests/test_data.py similarity index 51% rename from sklearn/tests/test_preprocessing.py rename to sklearn/preprocessing/tests/test_data.py index d5e2ee88d0a5e..67d0cbe0f2746 100644 --- a/sklearn/tests/test_preprocessing.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -15,7 +15,7 @@ from sklearn.preprocessing import Binarizer from sklearn.preprocessing import KernelCenterer from sklearn.preprocessing import LabelBinarizer -from sklearn.preprocessing import _transform_selected + from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import Normalizer @@ -36,13 +36,11 @@ iris = datasets.load_iris() - def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a - def test_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) @@ -475,283 +473,6 @@ def test_binarizer(): assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X)) -def test_label_binarizer(): - lb = LabelBinarizer() - - # two-class case - inp = ["neg", "pos", "pos", "neg"] - expected = np.array([[0, 1, 1, 0]]).T - got = lb.fit_transform(inp) - assert_false(lb.multilabel_) - assert_array_equal(lb.classes_, ["neg", "pos"]) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - # multi-class case - inp = ["spam", "ham", "eggs", "ham", "0"] - expected = np.array([[0, 0, 0, 1], - [0, 0, 1, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [1, 0, 0, 0]]) - got = lb.fit_transform(inp) - assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam']) - assert_false(lb.multilabel_) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - -def test_label_binarizer_set_label_encoding(): - lb = 
LabelBinarizer(neg_label=-2, pos_label=2) - - # two-class case - inp = np.array([0, 1, 1, 0]) - expected = np.array([[-2, 2, 2, -2]]).T - got = lb.fit_transform(inp) - assert_false(lb.multilabel_) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - # multi-class case - inp = np.array([3, 2, 1, 2, 0]) - expected = np.array([[-2, -2, -2, +2], - [-2, -2, +2, -2], - [-2, +2, -2, -2], - [-2, -2, +2, -2], - [+2, -2, -2, -2]]) - got = lb.fit_transform(inp) - assert_false(lb.multilabel_) - assert_array_equal(expected, got) - assert_array_equal(lb.inverse_transform(got), inp) - - -def test_label_binarizer_multilabel(): - lb = LabelBinarizer() - - # test input as lists of tuples - inp = [(2, 3), (1,), (1, 2)] - indicator_mat = np.array([[0, 1, 1], - [1, 0, 0], - [1, 1, 0]]) - got = lb.fit_transform(inp) - assert_true(lb.multilabel_) - assert_array_equal(indicator_mat, got) - assert_equal(lb.inverse_transform(got), inp) - - # test input as label indicator matrix - lb.fit(indicator_mat) - assert_array_equal(indicator_mat, - lb.inverse_transform(indicator_mat)) - - # regression test for the two-class multilabel case - lb = LabelBinarizer() - inp = [[1, 0], [0], [1], [0, 1]] - expected = np.array([[1, 1], - [1, 0], - [0, 1], - [1, 1]]) - got = lb.fit_transform(inp) - assert_true(lb.multilabel_) - assert_array_equal(expected, got) - assert_equal([set(x) for x in lb.inverse_transform(got)], - [set(x) for x in inp]) - - -def test_label_binarizer_errors(): - """Check that invalid arguments yield ValueError""" - one_class = np.array([0, 0, 0, 0]) - lb = LabelBinarizer().fit(one_class) - assert_false(lb.multilabel_) - - multi_label = [(2, 3), (0,), (0, 2)] - assert_raises(ValueError, lb.transform, multi_label) - - lb = LabelBinarizer() - assert_raises(ValueError, lb.transform, []) - assert_raises(ValueError, lb.inverse_transform, []) - - assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1) - assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) - - -def test_one_hot_encoder(): - """Test OneHotEncoder's fit and transform.""" - X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder() - # discover max values automatically - X_trans = enc.fit_transform(X).toarray() - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - - # check outcome - assert_array_equal(X_trans, - [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - - # max value given as 3 - enc = OneHotEncoder(n_values=4) - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 4 * 3)) - assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) - - # max value given per feature - enc = OneHotEncoder(n_values=[3, 2, 2]) - X = [[1, 0, 1], [0, 1, 1]] - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 3 + 2 + 2)) - assert_array_equal(enc.n_values_, [3, 2, 2]) - # check that testing with larger feature works: - X = np.array([[2, 0, 1], [0, 1, 1]]) - enc.transform(X) - - # test that an error is raise when out of bounds: - X_too_large = [[0, 2, 1], [0, 1, 1]] - assert_raises(ValueError, enc.transform, X_too_large) - - # test that error is raised when wrong number of features - assert_raises(ValueError, enc.transform, X[:, :-1]) - # test that error is raised when wrong number of features in fit - # with prespecified n_values - assert_raises(ValueError, enc.fit, X[:, :-1]) - # test exception on wrong init param - assert_raises(TypeError, 
OneHotEncoder(n_values=np.int).fit, X) - - enc = OneHotEncoder() - # test negative input to fit - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - - -def _check_transform_selected(X, X_expected, sel): - for M in (X, sparse.csr_matrix(X)): - Xtr = _transform_selected(M, Binarizer().transform, sel) - assert_array_equal(toarray(Xtr), X_expected) - - -def test_transform_selected(): - X = [[3, 2, 1], [0, 1, 1]] - - X_expected = [[1, 2, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0]) - _check_transform_selected(X, X_expected, [True, False, False]) - - X_expected = [[1, 1, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0, 1, 2]) - _check_transform_selected(X, X_expected, [True, True, True]) - _check_transform_selected(X, X_expected, "all") - - _check_transform_selected(X, X, []) - _check_transform_selected(X, X, [False, False, False]) - - -def _run_one_hot(X, X2, cat): - enc = OneHotEncoder(categorical_features=cat) - Xtr = enc.fit_transform(X) - X2tr = enc.transform(X2) - return Xtr, X2tr - - -def _check_one_hot(X, X2, cat, n_features): - ind = np.where(cat)[0] - # With mask - A, B = _run_one_hot(X, X2, cat) - # With indices - C, D = _run_one_hot(X, X2, ind) - # Check shape - assert_equal(A.shape, (2, n_features)) - assert_equal(B.shape, (1, n_features)) - assert_equal(C.shape, (2, n_features)) - assert_equal(D.shape, (1, n_features)) - # Check that mask and indices give the same results - assert_array_equal(toarray(A), toarray(C)) - assert_array_equal(toarray(B), toarray(D)) - - -def test_one_hot_encoder_categorical_features(): - X = np.array([[3, 2, 1], [0, 1, 1]]) - X2 = np.array([[1, 1, 1]]) - - cat = [True, False, False] - _check_one_hot(X, X2, cat, 4) - - # Edge case: all non-categorical - cat = [False, False, False] - _check_one_hot(X, X2, cat, 3) - - # Edge case: all categorical - cat = [True, True, True] - _check_one_hot(X, X2, cat, 5) - - -def test_label_encoder(): - """Test LabelEncoder's transform and inverse_transform methods""" - le = LabelEncoder() - le.fit([1, 1, 4, 5, -1, 0]) - assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) - assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), - [1, 2, 3, 3, 4, 0, 0]) - assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), - [0, 1, 4, 4, 5, -1, -1]) - assert_raises(ValueError, le.transform, [0, 6]) - - -def test_label_encoder_fit_transform(): - """Test fit_transform""" - le = LabelEncoder() - ret = le.fit_transform([1, 1, 4, 5, -1, 0]) - assert_array_equal(ret, [2, 2, 3, 4, 0, 1]) - - le = LabelEncoder() - ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"]) - assert_array_equal(ret, [1, 1, 2, 0]) - - -def test_label_encoder_string_labels(): - """Test LabelEncoder's transform and inverse_transform methods with - non-numeric labels""" - le = LabelEncoder() - le.fit(["paris", "paris", "tokyo", "amsterdam"]) - assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"]) - assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), - [2, 2, 1]) - assert_array_equal(le.inverse_transform([2, 2, 1]), - ["tokyo", "tokyo", "paris"]) - assert_raises(ValueError, le.transform, ["london"]) - - -def test_label_encoder_errors(): - """Check that invalid arguments yield ValueError""" - le = LabelEncoder() - assert_raises(ValueError, le.transform, []) - assert_raises(ValueError, le.inverse_transform, []) - - -def test_label_binarizer_iris(): - lb = LabelBinarizer() - Y = 
lb.fit_transform(iris.target) - clfs = [SGDClassifier().fit(iris.data, Y[:, k]) - for k in range(len(lb.classes_))] - Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T - y_pred = lb.inverse_transform(Y_pred) - accuracy = np.mean(iris.target == y_pred) - y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data) - accuracy2 = np.mean(iris.target == y_pred2) - assert_almost_equal(accuracy, accuracy2) - - -def test_label_binarizer_multilabel_unlabeled(): - """Check that LabelBinarizer can handle an unlabeled sample""" - lb = LabelBinarizer() - y = [[1, 2], [1], []] - Y = np.array([[1, 1], - [1, 0], - [0, 0]]) - assert_array_equal(lb.fit_transform(y), Y) def test_center_kernel(): @@ -812,253 +533,4 @@ def test_add_dummy_feature_csr(): X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert_true(sparse.isspmatrix_csr(X), X) - assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) - - -def _check_statistics(X, X_true, - strategy, statistics, missing_values): - """Utility function for testing imputation for a given strategy. - - Test: - - along the two axes - - with dense and sparse arrays - - Check that: - - the statistics (mean, median, mode) are correct - - the missing values are imputed correctly""" - - err_msg = "Parameters: strategy = %s, missing_values = %s, " \ - "axis = %%s, sparse = %%s".format(strategy, missing_values) - - # Normal matrix, axis = 0 - imputer = Imputer(missing_values, strategy=strategy, axis=0) - X_trans = imputer.fit(X).transform(X.copy()) - assert_array_equal(imputer.statistics_, statistics, - err_msg.format(0, False)) - assert_array_equal(X_trans, X_true, err_msg.format(0, False)) - - # Normal matrix, axis = 1 - imputer = Imputer(missing_values, strategy=strategy, axis=1) - imputer.fit(X.transpose()) - if np.isnan(statistics).any(): - assert_raises(ValueError, imputer.transform, X.copy().transpose()) - else: - X_trans = imputer.transform(X.copy().transpose()) - assert_array_equal(imputer.statistics_, statistics, - err_msg.format(1, False)) - assert_array_equal(X_trans, X_true.transpose(), - err_msg.format(1, False)) - - # Sparse matrix, axis = 0 - imputer = Imputer(missing_values, strategy=strategy, axis=0) - imputer.fit(sparse.csc_matrix(X)) - X_trans = imputer.transform(sparse.csc_matrix(X.copy())) - - if sparse.issparse(X_trans): - X_trans = X_trans.toarray() - - assert_array_equal(imputer.statistics_, statistics, - err_msg.format(0, True)) - assert_array_equal(X_trans, X_true, err_msg.format(0, True)) - - # Sparse matrix, axis = 1 - imputer = Imputer(missing_values, strategy=strategy, axis=1) - imputer.fit(sparse.csc_matrix(X.transpose())) - if np.isnan(statistics).any(): - assert_raises(ValueError, imputer.transform, - sparse.csc_matrix(X.copy().transpose())) - else: - X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose())) - - if sparse.issparse(X_trans): - X_trans = X_trans.toarray() - - assert_array_equal(imputer.statistics_, statistics, - err_msg.format(1, True)) - assert_array_equal(X_trans, X_true.transpose(), - err_msg.format(1, True)) - - -def test_imputation_mean_median_only_zero(): - """Test imputation using the mean and median strategies, when - missing_values == 0.""" - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - ]) - - X_imputed_mean = np.array([ - [3, 5], - [1, 3], - [2, 7], - [6, 13], - ]) - statistics_mean = [np.nan, 3, np.nan, np.nan, 7] - - X_imputed_median = np.array([ - [2, 5, 5], - 
[1, np.nan, 3], - [2, 5, 5], - [6, 5, 13], - ]) - statistics_median = [np.nan, 2, np.nan, 5, 5] - - _check_statistics(X, X_imputed_mean, "mean", statistics_mean, 0) - _check_statistics(X, X_imputed_median, "median", statistics_median, 0) - - -def test_imputation_mean_median(): - """Test imputation using the mean and median strategies, when - missing_values != 0.""" - rng = np.random.RandomState(0) - - dim = 10 - dec = 10 - shape = (dim * dim, dim + dec) - - zeros = np.zeros(shape[0]) - values = np.arange(1, shape[0]+1) - values[4::2] = - values[4::2] - - tests = [("mean", "NaN", lambda z, v, p: np.mean(np.hstack((z, v)))), - ("mean", 0, lambda z, v, p: np.mean(v)), - ("median", "NaN", lambda z, v, p: np.median(np.hstack((z, v)))), - ("median", 0, lambda z, v, p: np.median(v))] - - for strategy, test_missing_values, true_value_fun in tests: - X = np.empty(shape) - X_true = np.empty(shape) - true_statistics = np.empty(shape[1]) - - # Create a matrix X with columns - # - with only zeros, - # - with only missing values - # - with zeros, missing values and values - # And a matrix X_true containing all true values - for j in range(shape[1]): - nb_zeros = (j - dec + 1 > 0) * (j - dec + 1) * (j - dec + 1) - nb_missing_values = max(shape[0] + dec * dec - - (j + dec) * (j + dec), 0) - nb_values = shape[0] - nb_zeros - nb_missing_values - - z = zeros[:nb_zeros] - p = np.repeat(test_missing_values, nb_missing_values) - v = values[rng.permutation(len(values))[:nb_values]] - - true_statistics[j] = true_value_fun(z, v, p) - - # Create the columns - X[:, j] = np.hstack((v, z, p)) - - if 0 == test_missing_values: - X_true[:, j] = np.hstack((v, - np.repeat( - true_statistics[j], - nb_missing_values + nb_zeros))) - else: - X_true[:, j] = np.hstack((v, - z, - np.repeat(true_statistics[j], - nb_missing_values))) - - # Shuffle them the same way - np.random.RandomState(j).shuffle(X[:, j]) - np.random.RandomState(j).shuffle(X_true[:, j]) - - # Mean doesn't support columns containing NaNs, median does - if strategy == "median": - cols_to_keep = ~np.isnan(X_true).any(axis=0) - else: - cols_to_keep = ~np.isnan(X_true).all(axis=0) - - X_true = X_true[:, cols_to_keep] - - _check_statistics(X, X_true, strategy, - true_statistics, test_missing_values) - - -def test_imputation_most_frequent(): - """Test imputation using the most-frequent strategy.""" - X = np.array([ - [-1, -1, 0, 5], - [-1, 2, -1, 3], - [-1, 1, 3, -1], - [-1, 2, 3, 7], - ]) - - X_true = np.array([ - [2, 0, 5], - [2, 3, 3], - [1, 3, 3], - [2, 3, 7], - ]) - - # scipy.stats.mode, used in Imputer, doesn't return the first most - # frequent as promised in the doc but the lowest most frequent. 
When this - # test will fail after an update of scipy, Imputer will need to be updated - # to be consistent with the new (correct) behaviour - _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) - - -def test_imputation_pipeline_grid_search(): - """Test imputation within a pipeline + gridsearch.""" - pipeline = Pipeline([('imputer', Imputer(missing_values=0)), - ('tree', tree.DecisionTreeRegressor(random_state=0))]) - - parameters = { - 'imputer__strategy': ["mean", "median", "most_frequent"], - 'imputer__axis': [0, 1] - } - - l = 100 - X = sparse_random_matrix(l, l, density=0.10) - Y = sparse_random_matrix(l, 1, density=0.10).todense() - gs = grid_search.GridSearchCV(pipeline, parameters) - gs.fit(X, Y) - - -def test_imputation_pickle(): - """Test for pickling imputers.""" - import pickle - - l = 100 - X = sparse_random_matrix(l, l, density=0.10) - - for strategy in ["mean", "median", "most_frequent"]: - imputer = Imputer(missing_values=0, strategy=strategy) - imputer.fit(X) - - imputer_pickled = pickle.loads(pickle.dumps(imputer)) - - assert_array_equal(imputer.transform(X.copy()), - imputer_pickled.transform(X.copy()), - "Fail to transform the data after pickling " - "(strategy = %s)" % (strategy)) - - -def test_imputation_copy(): - """Test imputation with copy=True.""" - l = 5 - - # Test default behaviour and with copy=True - for params in [{}, {'copy': True}]: - X = sparse_random_matrix(l, l, density=0.75, random_state=0) - - # Dense - imputer = Imputer(missing_values=0, strategy="mean", **params) - Xt = imputer.fit(X).transform(X) - Xt[0, 0] = np.nan - # Check that the objects are different and that they don't use - # the same buffer - assert_false(np.all(X.todense() == Xt)) - - # Sparse - imputer = Imputer(missing_values=0, strategy="mean", **params) - X = X.todense() - Xt = imputer.fit(X).transform(X) - Xt[0, 0] = np.nan - # Check that the objects are different and that they don't use - # the same buffer - assert_false(np.all(X == Xt)) + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) \ No newline at end of file diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py new file mode 100644 index 0000000000000..efeb1d6bc56d6 --- /dev/null +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -0,0 +1,282 @@ +import warnings +import numpy as np +import numpy.linalg as la +from scipy import sparse + +from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false + +from sklearn.utils.sparsefuncs import mean_variance_axis0 +from sklearn.preprocessing import Binarizer +from sklearn.preprocessing import KernelCenterer +from sklearn.preprocessing import LabelBinarizer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import Normalizer +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import scale +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import add_dummy_feature + +from sklearn.preprocessing import Imputer +from sklearn.pipeline import Pipeline +from sklearn import grid_search +from sklearn import tree +from 
sklearn.random_projection import sparse_random_matrix + +from sklearn import datasets +from sklearn.linear_model.stochastic_gradient import SGDClassifier + +def _check_statistics(X, X_true, + strategy, statistics, missing_values): + """Utility function for testing imputation for a given strategy. + + Test: + - along the two axes + - with dense and sparse arrays + + Check that: + - the statistics (mean, median, mode) are correct + - the missing values are imputed correctly""" + + err_msg = "Parameters: strategy = %s, missing_values = %s, " \ + "axis = %%s, sparse = %%s".format(strategy, missing_values) + + # Normal matrix, axis = 0 + imputer = Imputer(missing_values, strategy=strategy, axis=0) + X_trans = imputer.fit(X).transform(X.copy()) + assert_array_equal(imputer.statistics_, statistics, + err_msg.format(0, False)) + assert_array_equal(X_trans, X_true, err_msg.format(0, False)) + + # Normal matrix, axis = 1 + imputer = Imputer(missing_values, strategy=strategy, axis=1) + imputer.fit(X.transpose()) + if np.isnan(statistics).any(): + assert_raises(ValueError, imputer.transform, X.copy().transpose()) + else: + X_trans = imputer.transform(X.copy().transpose()) + assert_array_equal(imputer.statistics_, statistics, + err_msg.format(1, False)) + assert_array_equal(X_trans, X_true.transpose(), + err_msg.format(1, False)) + + # Sparse matrix, axis = 0 + imputer = Imputer(missing_values, strategy=strategy, axis=0) + imputer.fit(sparse.csc_matrix(X)) + X_trans = imputer.transform(sparse.csc_matrix(X.copy())) + + if sparse.issparse(X_trans): + X_trans = X_trans.toarray() + + assert_array_equal(imputer.statistics_, statistics, + err_msg.format(0, True)) + assert_array_equal(X_trans, X_true, err_msg.format(0, True)) + + # Sparse matrix, axis = 1 + imputer = Imputer(missing_values, strategy=strategy, axis=1) + imputer.fit(sparse.csc_matrix(X.transpose())) + if np.isnan(statistics).any(): + assert_raises(ValueError, imputer.transform, + sparse.csc_matrix(X.copy().transpose())) + else: + X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose())) + + if sparse.issparse(X_trans): + X_trans = X_trans.toarray() + + assert_array_equal(imputer.statistics_, statistics, + err_msg.format(1, True)) + assert_array_equal(X_trans, X_true.transpose(), + err_msg.format(1, True)) + + +def test_imputation_mean_median_only_zero(): + """Test imputation using the mean and median strategies, when + missing_values == 0.""" + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ]) + + X_imputed_mean = np.array([ + [3, 5], + [1, 3], + [2, 7], + [6, 13], + ]) + statistics_mean = [np.nan, 3, np.nan, np.nan, 7] + + X_imputed_median = np.array([ + [2, 5, 5], + [1, np.nan, 3], + [2, 5, 5], + [6, 5, 13], + ]) + statistics_median = [np.nan, 2, np.nan, 5, 5] + + _check_statistics(X, X_imputed_mean, "mean", statistics_mean, 0) + _check_statistics(X, X_imputed_median, "median", statistics_median, 0) + + +def test_imputation_mean_median(): + """Test imputation using the mean and median strategies, when + missing_values != 0.""" + rng = np.random.RandomState(0) + + dim = 10 + dec = 10 + shape = (dim * dim, dim + dec) + + zeros = np.zeros(shape[0]) + values = np.arange(1, shape[0]+1) + values[4::2] = - values[4::2] + + tests = [("mean", "NaN", lambda z, v, p: np.mean(np.hstack((z, v)))), + ("mean", 0, lambda z, v, p: np.mean(v)), + ("median", "NaN", lambda z, v, p: np.median(np.hstack((z, v)))), + ("median", 0, lambda z, v, p: np.median(v))] + + for strategy, 
test_missing_values, true_value_fun in tests: + X = np.empty(shape) + X_true = np.empty(shape) + true_statistics = np.empty(shape[1]) + + # Create a matrix X with columns + # - with only zeros, + # - with only missing values + # - with zeros, missing values and values + # And a matrix X_true containing all true values + for j in range(shape[1]): + nb_zeros = (j - dec + 1 > 0) * (j - dec + 1) * (j - dec + 1) + nb_missing_values = max(shape[0] + dec * dec + - (j + dec) * (j + dec), 0) + nb_values = shape[0] - nb_zeros - nb_missing_values + + z = zeros[:nb_zeros] + p = np.repeat(test_missing_values, nb_missing_values) + v = values[rng.permutation(len(values))[:nb_values]] + + true_statistics[j] = true_value_fun(z, v, p) + + # Create the columns + X[:, j] = np.hstack((v, z, p)) + + if 0 == test_missing_values: + X_true[:, j] = np.hstack((v, + np.repeat( + true_statistics[j], + nb_missing_values + nb_zeros))) + else: + X_true[:, j] = np.hstack((v, + z, + np.repeat(true_statistics[j], + nb_missing_values))) + + # Shuffle them the same way + np.random.RandomState(j).shuffle(X[:, j]) + np.random.RandomState(j).shuffle(X_true[:, j]) + + # Mean doesn't support columns containing NaNs, median does + if strategy == "median": + cols_to_keep = ~np.isnan(X_true).any(axis=0) + else: + cols_to_keep = ~np.isnan(X_true).all(axis=0) + + X_true = X_true[:, cols_to_keep] + + _check_statistics(X, X_true, strategy, + true_statistics, test_missing_values) + + +def test_imputation_most_frequent(): + """Test imputation using the most-frequent strategy.""" + X = np.array([ + [-1, -1, 0, 5], + [-1, 2, -1, 3], + [-1, 1, 3, -1], + [-1, 2, 3, 7], + ]) + + X_true = np.array([ + [2, 0, 5], + [2, 3, 3], + [1, 3, 3], + [2, 3, 7], + ]) + + # scipy.stats.mode, used in Imputer, doesn't return the first most + # frequent as promised in the doc but the lowest most frequent. 
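[Editorial sketch, not part of the patch: a minimal illustration of the Imputer these tests exercise, assuming the missing_values / strategy / axis signature used throughout this diff.]

    import numpy as np
    from sklearn.preprocessing import Imputer  # API as introduced by this patch series

    X = np.array([[1., 0., 3.],
                  [7., 0., 6.],
                  [4., 5., 0.]])

    # Treat zeros as missing and replace them column-wise (axis=0) with the
    # mean of the observed values, which is what _check_statistics verifies
    # against imputer.statistics_.
    imputer = Imputer(missing_values=0, strategy="mean", axis=0)
    X_filled = imputer.fit_transform(X)
    # imputer.statistics_ now holds the per-column means learned during fit.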
When this + # test will fail after an update of scipy, Imputer will need to be updated + # to be consistent with the new (correct) behaviour + _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) + + +def test_imputation_pipeline_grid_search(): + """Test imputation within a pipeline + gridsearch.""" + pipeline = Pipeline([('imputer', Imputer(missing_values=0)), + ('tree', tree.DecisionTreeRegressor(random_state=0))]) + + parameters = { + 'imputer__strategy': ["mean", "median", "most_frequent"], + 'imputer__axis': [0, 1] + } + + l = 100 + X = sparse_random_matrix(l, l, density=0.10) + Y = sparse_random_matrix(l, 1, density=0.10).todense() + gs = grid_search.GridSearchCV(pipeline, parameters) + gs.fit(X, Y) + + +def test_imputation_pickle(): + """Test for pickling imputers.""" + import pickle + + l = 100 + X = sparse_random_matrix(l, l, density=0.10) + + for strategy in ["mean", "median", "most_frequent"]: + imputer = Imputer(missing_values=0, strategy=strategy) + imputer.fit(X) + + imputer_pickled = pickle.loads(pickle.dumps(imputer)) + + assert_array_equal(imputer.transform(X.copy()), + imputer_pickled.transform(X.copy()), + "Fail to transform the data after pickling " + "(strategy = %s)" % (strategy)) + + +def test_imputation_copy(): + """Test imputation with copy=True.""" + l = 5 + + # Test default behaviour and with copy=True + for params in [{}, {'copy': True}]: + X = sparse_random_matrix(l, l, density=0.75, random_state=0) + + # Dense + imputer = Imputer(missing_values=0, strategy="mean", **params) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = np.nan + # Check that the objects are different and that they don't use + # the same buffer + assert_false(np.all(X.todense() == Xt)) + + # Sparse + imputer = Imputer(missing_values=0, strategy="mean", **params) + X = X.todense() + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = np.nan + # Check that the objects are different and that they don't use + # the same buffer + assert_false(np.all(X == Xt)) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py new file mode 100644 index 0000000000000..1aefbdd11872e --- /dev/null +++ b/sklearn/preprocessing/tests/test_label.py @@ -0,0 +1,320 @@ +import warnings +import numpy as np +import numpy.linalg as la +from scipy import sparse + +from sklearn.utils.testing import assert_almost_equal +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false + +from sklearn.utils.sparsefuncs import mean_variance_axis0 +from sklearn.preprocessing import Binarizer +from sklearn.preprocessing import KernelCenterer +from sklearn.preprocessing import LabelBinarizer +from sklearn.preprocessing.label import _transform_selected +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import Normalizer +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import scale +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import add_dummy_feature + +from sklearn.preprocessing import Imputer +from sklearn.pipeline import Pipeline +from sklearn import grid_search +from sklearn import tree +from sklearn.random_projection import sparse_random_matrix + +from sklearn 
import datasets +from sklearn.linear_model.stochastic_gradient import SGDClassifier + +iris = datasets.load_iris() + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + +def test_label_binarizer(): + lb = LabelBinarizer() + + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_false(lb.multilabel_) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array([[0, 0, 0, 1], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0]]) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam']) + assert_false(lb.multilabel_) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_set_label_encoding(): + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # two-class case + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 2, 2, -2]]).T + got = lb.fit_transform(inp) + assert_false(lb.multilabel_) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array([[-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2]]) + got = lb.fit_transform(inp) + assert_false(lb.multilabel_) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_multilabel(): + lb = LabelBinarizer() + + # test input as lists of tuples + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], + [1, 0, 0], + [1, 1, 0]]) + got = lb.fit_transform(inp) + assert_true(lb.multilabel_) + assert_array_equal(indicator_mat, got) + assert_equal(lb.inverse_transform(got), inp) + + # test input as label indicator matrix + lb.fit(indicator_mat) + assert_array_equal(indicator_mat, + lb.inverse_transform(indicator_mat)) + + # regression test for the two-class multilabel case + lb = LabelBinarizer() + inp = [[1, 0], [0], [1], [0, 1]] + expected = np.array([[1, 1], + [1, 0], + [0, 1], + [1, 1]]) + got = lb.fit_transform(inp) + assert_true(lb.multilabel_) + assert_array_equal(expected, got) + assert_equal([set(x) for x in lb.inverse_transform(got)], + [set(x) for x in inp]) + + +def test_label_binarizer_errors(): + """Check that invalid arguments yield ValueError""" + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + assert_false(lb.multilabel_) + + multi_label = [(2, 3), (0,), (0, 2)] + assert_raises(ValueError, lb.transform, multi_label) + + lb = LabelBinarizer() + assert_raises(ValueError, lb.transform, []) + assert_raises(ValueError, lb.inverse_transform, []) + + assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1) + assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) + + +def test_one_hot_encoder(): + """Test OneHotEncoder's fit and transform.""" + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + enc = 
OneHotEncoder(n_values=4) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + enc = OneHotEncoder(n_values=[3, 2, 2]) + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raise when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + # test that error is raised when wrong number of features in fit + # with prespecified n_values + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) + + enc = OneHotEncoder() + # test negative input to fit + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + +def _check_transform_selected(X, X_expected, sel): + for M in (X, sparse.csr_matrix(X)): + Xtr = _transform_selected(M, Binarizer().transform, sel) + assert_array_equal(toarray(Xtr), X_expected) + + +def test_transform_selected(): + X = [[3, 2, 1], [0, 1, 1]] + + X_expected = [[1, 2, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0]) + _check_transform_selected(X, X_expected, [True, False, False]) + + X_expected = [[1, 1, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0, 1, 2]) + _check_transform_selected(X, X_expected, [True, True, True]) + _check_transform_selected(X, X_expected, "all") + + _check_transform_selected(X, X, []) + _check_transform_selected(X, X, [False, False, False]) + + +def _run_one_hot(X, X2, cat): + enc = OneHotEncoder(categorical_features=cat) + Xtr = enc.fit_transform(X) + X2tr = enc.transform(X2) + return Xtr, X2tr + + +def _check_one_hot(X, X2, cat, n_features): + ind = np.where(cat)[0] + # With mask + A, B = _run_one_hot(X, X2, cat) + # With indices + C, D = _run_one_hot(X, X2, ind) + # Check shape + assert_equal(A.shape, (2, n_features)) + assert_equal(B.shape, (1, n_features)) + assert_equal(C.shape, (2, n_features)) + assert_equal(D.shape, (1, n_features)) + # Check that mask and indices give the same results + assert_array_equal(toarray(A), toarray(C)) + assert_array_equal(toarray(B), toarray(D)) + + +def test_one_hot_encoder_categorical_features(): + X = np.array([[3, 2, 1], [0, 1, 1]]) + X2 = np.array([[1, 1, 1]]) + + cat = [True, False, False] + _check_one_hot(X, X2, cat, 4) + + # Edge case: all non-categorical + cat = [False, False, False] + _check_one_hot(X, X2, cat, 3) + + # Edge case: all categorical + cat = [True, True, True] + _check_one_hot(X, X2, cat, 5) + + +def test_label_encoder(): + """Test LabelEncoder's transform and inverse_transform methods""" + le = LabelEncoder() + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), + [1, 2, 3, 3, 4, 0, 0]) + assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), + [0, 1, 4, 4, 5, -1, -1]) + assert_raises(ValueError, le.transform, [0, 6]) + + +def test_label_encoder_fit_transform(): + """Test fit_transform""" + le = LabelEncoder() + ret = le.fit_transform([1, 1, 4, 5, 
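[Editorial sketch, not part of the patch: the LabelEncoder round trip that the tests above assert, using only calls that appear in the diff (fit, classes_, transform, inverse_transform).]

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    # Classes are stored sorted, so the integer codes are deterministic:
    # le.classes_ == ['amsterdam', 'paris', 'tokyo']
    codes = le.transform(["tokyo", "tokyo", "paris"])   # -> [2, 2, 1]
    labels = le.inverse_transform(codes)                # -> ['tokyo', 'tokyo', 'paris']
    # Unseen labels raise ValueError, as test_label_encoder_string_labels checks.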
-1, 0]) + assert_array_equal(ret, [2, 2, 3, 4, 0, 1]) + + le = LabelEncoder() + ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"]) + assert_array_equal(ret, [1, 1, 2, 0]) + + +def test_label_encoder_string_labels(): + """Test LabelEncoder's transform and inverse_transform methods with + non-numeric labels""" + le = LabelEncoder() + le.fit(["paris", "paris", "tokyo", "amsterdam"]) + assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"]) + assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), + [2, 2, 1]) + assert_array_equal(le.inverse_transform([2, 2, 1]), + ["tokyo", "tokyo", "paris"]) + assert_raises(ValueError, le.transform, ["london"]) + + +def test_label_encoder_errors(): + """Check that invalid arguments yield ValueError""" + le = LabelEncoder() + assert_raises(ValueError, le.transform, []) + assert_raises(ValueError, le.inverse_transform, []) + + +def test_label_binarizer_iris(): + lb = LabelBinarizer() + Y = lb.fit_transform(iris.target) + clfs = [SGDClassifier().fit(iris.data, Y[:, k]) + for k in range(len(lb.classes_))] + Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T + y_pred = lb.inverse_transform(Y_pred) + accuracy = np.mean(iris.target == y_pred) + y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data) + accuracy2 = np.mean(iris.target == y_pred2) + assert_almost_equal(accuracy, accuracy2) + + +def test_label_binarizer_multilabel_unlabeled(): + """Check that LabelBinarizer can handle an unlabeled sample""" + lb = LabelBinarizer() + y = [[1, 2], [1], []] + Y = np.array([[1, 1], + [1, 0], + [0, 0]]) + assert_array_equal(lb.fit_transform(y), Y) \ No newline at end of file From 0698f11e720def5f5026d5e7f4ef9466377fcb0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 12:20:35 +0200 Subject: [PATCH 3/6] Del unused imports in preprocessing + pep8 --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/data.py | 13 ++------ sklearn/preprocessing/imputation.py | 11 +------ sklearn/preprocessing/label.py | 10 +----- sklearn/preprocessing/tests/test_data.py | 30 ++++++------------ .../preprocessing/tests/test_imputation.py | 23 +------------- sklearn/preprocessing/tests/test_label.py | 31 +++++-------------- 7 files changed, 24 insertions(+), 96 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index da548216be75e..4302f53b70d6d 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -34,4 +34,4 @@ 'binarize', 'normalize', 'scale', -] \ No newline at end of file +] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index cd5a3aab5786c..e5a476e6527c9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -5,27 +5,16 @@ # License: BSD 3 clause import warnings -import numbers -import math import numpy as np -import numpy.ma as ma from scipy import sparse -from scipy import stats from ..base import BaseEstimator, TransformerMixin from ..utils import check_arrays from ..utils import array2d -from ..utils import as_float_array from ..utils import atleast2d_or_csr -from ..utils import atleast2d_or_csc from ..utils import safe_asarray from ..utils import warn_if_not_float -from ..utils.fixes import unique -from ..utils import deprecated - -from ..utils.multiclass import unique_labels -from ..utils.multiclass import type_of_target from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 from ..utils.sparsefuncs import 
inplace_csr_row_normalize_l2 @@ -48,6 +37,7 @@ 'scale', ] + def _mean_and_std(X, axis=0, with_mean=True, with_std=True): """Compute mean and std deviation for centering, scaling. @@ -638,6 +628,7 @@ def transform(self, X, y=None, copy=None): copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy) + class KernelCenterer(BaseEstimator, TransformerMixin): """Center a kernel matrix diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 9bdf528c5a873..0a804663c3d56 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -2,7 +2,6 @@ # License: BSD 3 clause import warnings -import numbers import math import numpy as np @@ -11,18 +10,9 @@ from scipy import stats from ..base import BaseEstimator, TransformerMixin -from ..utils import check_arrays from ..utils import array2d -from ..utils import as_float_array from ..utils import atleast2d_or_csr from ..utils import atleast2d_or_csc -from ..utils import safe_asarray -from ..utils import warn_if_not_float -from ..utils.fixes import unique -from ..utils import deprecated - -from ..utils.multiclass import unique_labels -from ..utils.multiclass import type_of_target from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 @@ -37,6 +27,7 @@ 'Imputer', ] + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask == "NaN" or np.isnan(value_to_mask): diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c8e1f90eafd31..e689032f7e41e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -4,23 +4,14 @@ # Andreas Mueller # License: BSD 3 clause -import warnings import numbers -import math import numpy as np -import numpy.ma as ma from scipy import sparse -from scipy import stats from ..base import BaseEstimator, TransformerMixin from ..utils import check_arrays -from ..utils import array2d -from ..utils import as_float_array -from ..utils import atleast2d_or_csr from ..utils import atleast2d_or_csc -from ..utils import safe_asarray -from ..utils import warn_if_not_float from ..utils.fixes import unique from ..utils import deprecated @@ -43,6 +34,7 @@ 'OneHotEncoder', ] + def _transform_selected(X, transform, selected="all", copy=True): """Apply a transform function to portion of selected features diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 67d0cbe0f2746..cd620d7877ee7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -14,33 +14,25 @@ from sklearn.utils.sparsefuncs import mean_variance_axis0 from sklearn.preprocessing import Binarizer from sklearn.preprocessing import KernelCenterer -from sklearn.preprocessing import LabelBinarizer - -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import Normalizer -from sklearn.preprocessing import normalize -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import scale -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import add_dummy_feature - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline -from sklearn import grid_search -from sklearn import tree -from sklearn.random_projection import sparse_random_matrix + +from sklearn.preprocessing.data import Normalizer +from 
sklearn.preprocessing.data import normalize +from sklearn.preprocessing.data import StandardScaler +from sklearn.preprocessing.data import scale +from sklearn.preprocessing.data import MinMaxScaler +from sklearn.preprocessing.data import add_dummy_feature from sklearn import datasets -from sklearn.linear_model.stochastic_gradient import SGDClassifier iris = datasets.load_iris() + def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a + def test_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) @@ -473,8 +465,6 @@ def test_binarizer(): assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X)) - - def test_center_kernel(): """Test that KernelCenterer is equivalent to StandardScaler in feature space""" @@ -533,4 +523,4 @@ def test_add_dummy_feature_csr(): X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert_true(sparse.isspmatrix_csr(X), X) - assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) \ No newline at end of file + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index efeb1d6bc56d6..6fb9810a87bc4 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -1,37 +1,16 @@ -import warnings import numpy as np -import numpy.linalg as la from scipy import sparse -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false -from sklearn.utils.sparsefuncs import mean_variance_axis0 -from sklearn.preprocessing import Binarizer -from sklearn.preprocessing import KernelCenterer -from sklearn.preprocessing import LabelBinarizer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import Normalizer -from sklearn.preprocessing import normalize -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import scale -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import add_dummy_feature - -from sklearn.preprocessing import Imputer +from sklearn.preprocessing.imputation import Imputer from sklearn.pipeline import Pipeline from sklearn import grid_search from sklearn import tree from sklearn.random_projection import sparse_random_matrix -from sklearn import datasets -from sklearn.linear_model.stochastic_gradient import SGDClassifier def _check_statistics(X, X_true, strategy, statistics, missing_values): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 1aefbdd11872e..3672d7d6e62af 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -1,46 +1,31 @@ -import warnings import numpy as np -import numpy.linalg as la from scipy import sparse from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false -from 
sklearn.utils.sparsefuncs import mean_variance_axis0 -from sklearn.preprocessing import Binarizer -from sklearn.preprocessing import KernelCenterer -from sklearn.preprocessing import LabelBinarizer -from sklearn.preprocessing.label import _transform_selected -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import Normalizer -from sklearn.preprocessing import normalize -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import scale -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import add_dummy_feature - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline -from sklearn import grid_search -from sklearn import tree -from sklearn.random_projection import sparse_random_matrix +from sklearn.preprocessing.label import Binarizer +from sklearn.preprocessing.label import LabelBinarizer +from sklearn.preprocessing.label.label import _transform_selected +from sklearn.preprocessing.label import OneHotEncoder +from sklearn.preprocessing.label import LabelEncoder from sklearn import datasets from sklearn.linear_model.stochastic_gradient import SGDClassifier iris = datasets.load_iris() + def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a + def test_label_binarizer(): lb = LabelBinarizer() @@ -317,4 +302,4 @@ def test_label_binarizer_multilabel_unlabeled(): Y = np.array([[1, 1], [1, 0], [0, 0]]) - assert_array_equal(lb.fit_transform(y), Y) \ No newline at end of file + assert_array_equal(lb.fit_transform(y), Y) From 3f89a1f25da4eff5f7a3698a0e3d45c2d8b58e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 12:55:59 +0200 Subject: [PATCH 4/6] Fix imports --- sklearn/preprocessing/tests/test_data.py | 5 ++--- sklearn/preprocessing/tests/test_label.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index cd620d7877ee7..114b0f6ce77f2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -12,9 +12,8 @@ from sklearn.utils.testing import assert_false from sklearn.utils.sparsefuncs import mean_variance_axis0 -from sklearn.preprocessing import Binarizer -from sklearn.preprocessing import KernelCenterer - +from sklearn.preprocessing.data import Binarizer +from sklearn.preprocessing.data import KernelCenterer from sklearn.preprocessing.data import Normalizer from sklearn.preprocessing.data import normalize from sklearn.preprocessing.data import StandardScaler diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 3672d7d6e62af..9dcfad3095a27 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -8,9 +8,9 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false -from sklearn.preprocessing.label import Binarizer +from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.label import LabelBinarizer -from sklearn.preprocessing.label.label import _transform_selected +from sklearn.preprocessing.label import _transform_selected from sklearn.preprocessing.label import OneHotEncoder from sklearn.preprocessing.label import LabelEncoder From b1453a31ca84b61e335da9048e5be7d828e56bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 15:49:33 +0200 Subject: [PATCH 
5/6] Imp move OneHotEncoder to preprocessing/data.py --- sklearn/preprocessing/__init__.py | 4 +- sklearn/preprocessing/data.py | 248 ++++++++++++++++++++++ sklearn/preprocessing/label.py | 244 --------------------- sklearn/preprocessing/tests/test_data.py | 116 ++++++++++ sklearn/preprocessing/tests/test_label.py | 115 ---------- 5 files changed, 367 insertions(+), 360 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 4302f53b70d6d..e0e2d09d69d13 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -7,16 +7,17 @@ from .data import KernelCenterer from .data import MinMaxScaler from .data import Normalizer +from .data import Scaler from .data import StandardScaler from .data import add_dummy_feature from .data import binarize from .data import normalize from .data import scale +from .data import OneHotEncoder from .label import label_binarize from .label import LabelBinarizer from .label import LabelEncoder -from .label import OneHotEncoder from .imputation import Imputer @@ -29,6 +30,7 @@ 'MinMaxScaler', 'Normalizer', 'OneHotEncoder', + 'Scaler', 'StandardScaler', 'add_dummy_feature', 'binarize', diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e5a476e6527c9..bfec06fb8072a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -4,6 +4,7 @@ # Andreas Mueller # License: BSD 3 clause +import numbers import warnings import numpy as np @@ -11,6 +12,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_arrays +from ..utils import atleast2d_or_csc from ..utils import array2d from ..utils import atleast2d_or_csr from ..utils import safe_asarray @@ -30,6 +32,8 @@ 'KernelCenterer', 'MinMaxScaler', 'Normalizer', + 'OneHotEncoder', + 'Scaler', 'StandardScaler', 'add_dummy_feature', 'binarize', @@ -740,3 +744,247 @@ def add_dummy_feature(X, value=1.0): return klass(add_dummy_feature(X.tocoo(), value)) else: return np.hstack((np.ones((n_samples, 1)) * value, X)) + + +def _transform_selected(X, transform, selected="all", copy=True): + """Apply a transform function to portion of selected features + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + transform : callable + A callable transform(X) -> X_transformed + + copy : boolean, optional + Copy X even if it could be avoided. + + selected: "all" or array of indices or mask + Specify which features to apply the transform to. + + Returns + ------- + X : array or sparse matrix, shape=(n_samples, n_features_new) + """ + if selected == "all": + return transform(X) + + X = atleast2d_or_csc(X, copy=copy) + + if len(selected) == 0: + return X + + n_features = X.shape[1] + ind = np.arange(n_features) + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(selected)] = True + not_sel = np.logical_not(sel) + n_selected = np.sum(sel) + + if n_selected == 0: + # No features selected. + return X + elif n_selected == n_features: + # All features selected. + return transform(X) + else: + X_sel = transform(X[:, ind[sel]]) + X_not_sel = X[:, ind[not_sel]] + + if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): + return sparse.hstack((X_sel, X_not_sel)) + else: + return np.hstack((X_sel, X_not_sel)) + + +class OneHotEncoder(BaseEstimator, TransformerMixin): + """Encode categorical integer features using a one-hot aka one-of-K scheme. 
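[Editorial sketch, not part of the patch: what the _transform_selected helper moved above does with a column subset, modelled on test_transform_selected and assuming the post-split import path sklearn.preprocessing.data created by this commit.]

    import numpy as np
    from sklearn.preprocessing.data import Binarizer, _transform_selected

    X = np.array([[3, 2, 1],
                  [0, 1, 1]])
    # Binarize only column 0; the untouched columns are stacked back to the
    # right of the transformed block, which is how the encoder keeps
    # non-categorical features.
    Xt = _transform_selected(X, Binarizer().transform, selected=[0])
    # Xt == [[1, 2, 1],
    #        [0, 1, 1]]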
+ + The input to this transformer should be a matrix of integers, denoting + the values taken on by categorical (discrete) features. The output will be + a sparse matrix were each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : maximum value for all features. + - array : maximum value per feature. + + categorical_features: "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all' (default): All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + dtype : number type, default=np.float + Desired dtype of output. + + Attributes + ---------- + `active_features_` : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + `feature_indices_` : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + `n_values_` : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and two samples, we let the encoder + find the maximum value per feature and transform the data to a binary + one-hot encoding. + + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OneHotEncoder(categorical_features='all', dtype=<... 'float'>, + n_values='auto') + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 2, 5, 9]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + """ + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float): + self.n_values = n_values + self.categorical_features = categorical_features + self.dtype = dtype + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. 
+ + Returns + ------- + self + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if self.n_values == 'auto': + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. Expected" + " 'auto', int or array of ints, got %r" + % type(X)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + self.n_values_ = n_values + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + if self.n_values == 'auto': + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self.active_features_ = active_features + + return out + + def fit_transform(self, X, y=None): + """Fit OneHotEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + """ + return _transform_selected(X, self._fit_transform, + self.categorical_features, copy=True) + + def _transform(self, X): + """Asssumes X contains only categorical features.""" + X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + n_values_check = np.max(X, axis=0) + 1 + if (n_values_check > self.n_values_).any(): + raise ValueError("Feature out of bounds. Try setting n_values.") + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + if self.n_values == 'auto': + out = out[:, self.active_features_] + return out + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Input array of type int. + + Returns + ------- + X_out : sparse matrix, dtype=int + Transformed input. 
+ """ + return _transform_selected(X, self._transform, + self.categorical_features, copy=True) + diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e689032f7e41e..e8563f010d24c 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -31,253 +31,9 @@ 'label_binarize', 'LabelBinarizer', 'LabelEncoder', - 'OneHotEncoder', ] -def _transform_selected(X, transform, selected="all", copy=True): - """Apply a transform function to portion of selected features - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Dense array or sparse matrix. - - transform : callable - A callable transform(X) -> X_transformed - - copy : boolean, optional - Copy X even if it could be avoided. - - selected: "all" or array of indices or mask - Specify which features to apply the transform to. - - Returns - ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) - """ - if selected == "all": - return transform(X) - - X = atleast2d_or_csc(X, copy=copy) - - if len(selected) == 0: - return X - - n_features = X.shape[1] - ind = np.arange(n_features) - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(selected)] = True - not_sel = np.logical_not(sel) - n_selected = np.sum(sel) - - if n_selected == 0: - # No features selected. - return X - elif n_selected == n_features: - # All features selected. - return transform(X) - else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]] - - if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): - return sparse.hstack((X_sel, X_not_sel)) - else: - return np.hstack((X_sel, X_not_sel)) - - -class OneHotEncoder(BaseEstimator, TransformerMixin): - """Encode categorical integer features using a one-hot aka one-of-K scheme. - - The input to this transformer should be a matrix of integers, denoting - the values taken on by categorical (discrete) features. The output will be - a sparse matrix were each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range - [0, n_values). - - This encoding is needed for feeding categorical data to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. - - Parameters - ---------- - n_values : 'auto', int or array of ints - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : maximum value for all features. - - array : maximum value per feature. - - categorical_features: "all" or array of indices or mask - Specify what features are treated as categorical. - - - 'all' (default): All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - dtype : number type, default=np.float - Desired dtype of output. - - Attributes - ---------- - `active_features_` : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - `feature_indices_` : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) - - `n_values_` : array of shape (n_features,) - Maximum number of values per feature. 
- - Examples - -------- - Given a dataset with three features and two samples, we let the encoder - find the maximum value per feature and transform the data to a binary - one-hot encoding. - - >>> from sklearn.preprocessing import OneHotEncoder - >>> enc = OneHotEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS - OneHotEncoder(categorical_features='all', dtype=<... 'float'>, - n_values='auto') - >>> enc.n_values_ - array([2, 3, 4]) - >>> enc.feature_indices_ - array([0, 2, 5, 9]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) - - See also - -------- - sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot - encoding of dictionary items or strings. - """ - def __init__(self, n_values="auto", categorical_features="all", - dtype=np.float): - self.n_values = n_values - self.categorical_features = categorical_features - self.dtype = dtype - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_feature) - Input array of type int. - - Returns - ------- - self - """ - self.fit_transform(X) - return self - - def _fit_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - if self.n_values == 'auto': - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) - else: - try: - n_values = np.asarray(self.n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - self.n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self.feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if self.n_values == 'auto': - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self.active_features_ = active_features - - return out - - def fit_transform(self, X, y=None): - """Fit OneHotEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. - """ - return _transform_selected(X, self._fit_transform, - self.categorical_features, copy=True) - - def _transform(self, X): - """Asssumes X contains only categorical features.""" - X = check_arrays(X, sparse_format='dense', dtype=np.int)[0] - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - - indices = self.feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." 
- " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - n_values_check = np.max(X, axis=0) + 1 - if (n_values_check > self.n_values_).any(): - raise ValueError("Feature out of bounds. Try setting n_values.") - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if self.n_values == 'auto': - out = out[:, self.active_features_] - return out - - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - Input array of type int. - - Returns - ------- - X_out : sparse matrix, dtype=int - Transformed input. - """ - return _transform_selected(X, self._transform, - self.categorical_features, copy=True) - - class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 114b0f6ce77f2..37b81c3d03d71 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -12,10 +12,12 @@ from sklearn.utils.testing import assert_false from sklearn.utils.sparsefuncs import mean_variance_axis0 +from sklearn.preprocessing.data import _transform_selected from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.data import KernelCenterer from sklearn.preprocessing.data import Normalizer from sklearn.preprocessing.data import normalize +from sklearn.preprocessing.data import OneHotEncoder from sklearn.preprocessing.data import StandardScaler from sklearn.preprocessing.data import scale from sklearn.preprocessing.data import MinMaxScaler @@ -523,3 +525,117 @@ def test_add_dummy_feature_csr(): X = add_dummy_feature(X) assert_true(sparse.isspmatrix_csr(X), X) assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_one_hot_encoder(): + """Test OneHotEncoder's fit and transform.""" + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + enc = OneHotEncoder(n_values=4) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + enc = OneHotEncoder(n_values=[3, 2, 2]) + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raise when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + # test that error is raised when wrong number of features in fit + # with prespecified n_values + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + 
assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) + + enc = OneHotEncoder() + # test negative input to fit + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + +def _check_transform_selected(X, X_expected, sel): + for M in (X, sparse.csr_matrix(X)): + Xtr = _transform_selected(M, Binarizer().transform, sel) + assert_array_equal(toarray(Xtr), X_expected) + + +def test_transform_selected(): + X = [[3, 2, 1], [0, 1, 1]] + + X_expected = [[1, 2, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0]) + _check_transform_selected(X, X_expected, [True, False, False]) + + X_expected = [[1, 1, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0, 1, 2]) + _check_transform_selected(X, X_expected, [True, True, True]) + _check_transform_selected(X, X_expected, "all") + + _check_transform_selected(X, X, []) + _check_transform_selected(X, X, [False, False, False]) + + +def _run_one_hot(X, X2, cat): + enc = OneHotEncoder(categorical_features=cat) + Xtr = enc.fit_transform(X) + X2tr = enc.transform(X2) + return Xtr, X2tr + + +def _check_one_hot(X, X2, cat, n_features): + ind = np.where(cat)[0] + # With mask + A, B = _run_one_hot(X, X2, cat) + # With indices + C, D = _run_one_hot(X, X2, ind) + # Check shape + assert_equal(A.shape, (2, n_features)) + assert_equal(B.shape, (1, n_features)) + assert_equal(C.shape, (2, n_features)) + assert_equal(D.shape, (1, n_features)) + # Check that mask and indices give the same results + assert_array_equal(toarray(A), toarray(C)) + assert_array_equal(toarray(B), toarray(D)) + + +def test_one_hot_encoder_categorical_features(): + X = np.array([[3, 2, 1], [0, 1, 1]]) + X2 = np.array([[1, 1, 1]]) + + cat = [True, False, False] + _check_one_hot(X, X2, cat, 4) + + # Edge case: all non-categorical + cat = [False, False, False] + _check_one_hot(X, X2, cat, 3) + + # Edge case: all categorical + cat = [True, True, True] + _check_one_hot(X, X2, cat, 5) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 9dcfad3095a27..1e0068ae01b7f 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -10,8 +10,6 @@ from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.label import LabelBinarizer -from sklearn.preprocessing.label import _transform_selected -from sklearn.preprocessing.label import OneHotEncoder from sklearn.preprocessing.label import LabelEncoder from sklearn import datasets @@ -125,119 +123,6 @@ def test_label_binarizer_errors(): assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) -def test_one_hot_encoder(): - """Test OneHotEncoder's fit and transform.""" - X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder() - # discover max values automatically - X_trans = enc.fit_transform(X).toarray() - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - - # check outcome - assert_array_equal(X_trans, - [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - - # max value given as 3 - enc = OneHotEncoder(n_values=4) - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 4 * 3)) - assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) - - # max value given per feature - enc = OneHotEncoder(n_values=[3, 2, 2]) - X = [[1, 0, 1], [0, 1, 1]] - X_trans = enc.fit_transform(X) - 
assert_equal(X_trans.shape, (2, 3 + 2 + 2)) - assert_array_equal(enc.n_values_, [3, 2, 2]) - # check that testing with larger feature works: - X = np.array([[2, 0, 1], [0, 1, 1]]) - enc.transform(X) - - # test that an error is raise when out of bounds: - X_too_large = [[0, 2, 1], [0, 1, 1]] - assert_raises(ValueError, enc.transform, X_too_large) - - # test that error is raised when wrong number of features - assert_raises(ValueError, enc.transform, X[:, :-1]) - # test that error is raised when wrong number of features in fit - # with prespecified n_values - assert_raises(ValueError, enc.fit, X[:, :-1]) - # test exception on wrong init param - assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) - - enc = OneHotEncoder() - # test negative input to fit - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - - -def _check_transform_selected(X, X_expected, sel): - for M in (X, sparse.csr_matrix(X)): - Xtr = _transform_selected(M, Binarizer().transform, sel) - assert_array_equal(toarray(Xtr), X_expected) - - -def test_transform_selected(): - X = [[3, 2, 1], [0, 1, 1]] - - X_expected = [[1, 2, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0]) - _check_transform_selected(X, X_expected, [True, False, False]) - - X_expected = [[1, 1, 1], [0, 1, 1]] - _check_transform_selected(X, X_expected, [0, 1, 2]) - _check_transform_selected(X, X_expected, [True, True, True]) - _check_transform_selected(X, X_expected, "all") - - _check_transform_selected(X, X, []) - _check_transform_selected(X, X, [False, False, False]) - - -def _run_one_hot(X, X2, cat): - enc = OneHotEncoder(categorical_features=cat) - Xtr = enc.fit_transform(X) - X2tr = enc.transform(X2) - return Xtr, X2tr - - -def _check_one_hot(X, X2, cat, n_features): - ind = np.where(cat)[0] - # With mask - A, B = _run_one_hot(X, X2, cat) - # With indices - C, D = _run_one_hot(X, X2, ind) - # Check shape - assert_equal(A.shape, (2, n_features)) - assert_equal(B.shape, (1, n_features)) - assert_equal(C.shape, (2, n_features)) - assert_equal(D.shape, (1, n_features)) - # Check that mask and indices give the same results - assert_array_equal(toarray(A), toarray(C)) - assert_array_equal(toarray(B), toarray(D)) - - -def test_one_hot_encoder_categorical_features(): - X = np.array([[3, 2, 1], [0, 1, 1]]) - X2 = np.array([[1, 1, 1]]) - - cat = [True, False, False] - _check_one_hot(X, X2, cat, 4) - - # Edge case: all non-categorical - cat = [False, False, False] - _check_one_hot(X, X2, cat, 3) - - # Edge case: all categorical - cat = [True, True, True] - _check_one_hot(X, X2, cat, 5) - def test_label_encoder(): """Test LabelEncoder's transform and inverse_transform methods""" From 2e950493631780119251e26c13bff724fd787224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20Tr=C3=A9segnie?= Date: Fri, 26 Jul 2013 15:53:24 +0200 Subject: [PATCH 6/6] pyflakes and pep8 --- sklearn/preprocessing/__init__.py | 1 + sklearn/preprocessing/data.py | 1 - sklearn/preprocessing/imputation.py | 6 ------ sklearn/preprocessing/label.py | 10 +--------- sklearn/preprocessing/tests/test_label.py | 3 --- 5 files changed, 2 insertions(+), 19 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index e0e2d09d69d13..c5034f1d6975c 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -36,4 +36,5 @@ 'binarize', 'normalize', 'scale', + 'label_binarize', ] diff --git 
a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bfec06fb8072a..4650a7664a852 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -987,4 +987,3 @@ def transform(self, X): """ return _transform_selected(X, self._transform, self.categorical_features, copy=True) - diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 0a804663c3d56..30190eac7f94d 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -14,10 +14,6 @@ from ..utils import atleast2d_or_csr from ..utils import atleast2d_or_csc -from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 -from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 -from ..utils.sparsefuncs import inplace_csr_column_scale -from ..utils.sparsefuncs import mean_variance_axis0 from ..externals import six zip = six.moves.zip @@ -66,8 +62,6 @@ def _get_elem_at_rank(negative_elements, n_zeros, positive_elements, k): """Compute the kth largest element of the array formed by negative_elements, n_zeros zeros and positive_elements.""" len_neg = len(negative_elements) - len_pos = len(positive_elements) - if k < len_neg: return negative_elements[k] elif k >= len_neg + n_zeros: diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e8563f010d24c..36d71b7b8db5f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -4,24 +4,16 @@ # Andreas Mueller # License: BSD 3 clause -import numbers - import numpy as np -from scipy import sparse from ..base import BaseEstimator, TransformerMixin -from ..utils import check_arrays -from ..utils import atleast2d_or_csc + from ..utils.fixes import unique from ..utils import deprecated from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target -from ..utils.sparsefuncs import inplace_csr_row_normalize_l1 -from ..utils.sparsefuncs import inplace_csr_row_normalize_l2 -from ..utils.sparsefuncs import inplace_csr_column_scale -from ..utils.sparsefuncs import mean_variance_axis0 from ..externals import six zip = six.moves.zip diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 1e0068ae01b7f..a66670b4384c7 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -1,5 +1,4 @@ import numpy as np -from scipy import sparse from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal @@ -8,7 +7,6 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false -from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.label import LabelBinarizer from sklearn.preprocessing.label import LabelEncoder @@ -123,7 +121,6 @@ def test_label_binarizer_errors(): assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2) - def test_label_encoder(): """Test LabelEncoder's transform and inverse_transform methods""" le = LabelEncoder()
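A quick end-to-end illustration of the behaviour pinned down by the relocated doctest and tests, written as a minimal standalone sketch (Python; it assumes the public sklearn.preprocessing import path used in the OneHotEncoder docstring above, and the variable names are purely illustrative):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # Same data as the OneHotEncoder docstring: three integer features,
    # four samples.  With n_values='auto' the number of values per feature
    # is inferred from the training data.
    enc = OneHotEncoder()
    enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

    print(enc.n_values_)         # [2 3 4]   values seen per feature
    print(enc.feature_indices_)  # [0 2 5 9] column offset of each feature block
    print(enc.transform([[0, 1, 1]]).toarray())
    # [[ 1.  0.  0.  1.  0.  0.  1.  0.  0.]]

    # categorical_features restricts the encoding to a subset of columns,
    # given either as a boolean mask or as an index array; the remaining
    # columns are passed through unchanged and stacked to the right of the
    # encoded block (this is what the private _transform_selected helper,
    # now in sklearn/preprocessing/data.py, takes care of).
    X = np.array([[3, 2, 1], [0, 1, 1]])
    enc = OneHotEncoder(categorical_features=[True, False, False])
    Xt = enc.fit_transform(X)
    print(Xt.shape)  # (2, 4): two active one-hot columns + two pass-through

Note that with n_values='auto' only the columns actually active in the training data are kept (recorded in active_features_), so the output width can be smaller than feature_indices_[-1]; that is why test_one_hot_encoder above expects a (2, 5) result even though feature_indices_ spans a nine-column index space.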